import copy
import json
import os
import shutil
from datetime import datetime
from typing import Any, Dict, List, Optional
import time
import pandas as pd
from cuga.backend.cuga_graph.nodes.api.code_agent.model import CodeAgentOutput
from cuga.backend.tools_env.registry.utils.types import AppDefinition
from cuga.backend.utils.id_utils import mask_with_timestamp, random_id_with_timestamp
from cuga.config import TRAJECTORY_DATA_DIR, settings
from langchain_core.tools import StructuredTool
from loguru import logger
from mcp.types import CallToolResult, TextContent
from pydantic import BaseModel, Field
AGENT_ANALYTICS = True
try:
from agent_analytics.instrumentation.utils import AIEventRecorder
from agent_analytics_core.interfaces.annotations import DataAnnotation
except Exception:
AGENT_ANALYTICS = False
logger.warning("Ignoring agent analytics")
class MergeResult(BaseModel):
folder_name: str
merged_task_ids: List[str]
class Prompt(BaseModel):
role: str
value: str
class Step(BaseModel):
name: Optional[str] = ""
plan: Optional[str] = ""
prompts: List[Prompt] = Field(default_factory=list)
data: Optional[str] = ""
task_decomposition: Optional[str] = ""
current_url: Optional[str] = ""
action_formatted: Optional[str] = ""
action_type: Optional[str] = ""
action_args: Optional[Any] = ""
observation_before: Optional[str] = ""
image_before: Optional[str] = ""
class TasksMetadata(BaseModel):
task_ids: List[str]
description: Optional[str] = ""
experiment_name: str
experiment_folder: str
created_at: str
class ActivityTracker(object):
_instance = None
start_time: float = 0
user_id: str = ""
intent: str = ""
session_id: str = ""
dataset_name: str = ""
prompts: List[Prompt] = []
current_date: Optional[str] = None
pi: Optional[str] = None
eval: Any = None
final_answer: Optional[str] = None
task_id: str = "default"
actions_count: int = 0
token_usage: int = 0
steps: List[Step] = []
images: List[str] = []
score: float = 0.0
tools: Dict[str, List[StructuredTool]] = {}
apps: List[AppDefinition] = []
# Task management attributes
tasks: Dict[str, Dict[str, Any]] = {}
experiment_folder: Optional[str] = None
tasks_metadata: Optional[TasksMetadata] = None
if settings.advanced_features.enable_memory:
from cuga.backend.memory.memory import Memory
memory = Memory()
# Base directory configuration
_base_dir: str = TRAJECTORY_DATA_DIR
def __new__(cls, *args, **kwargs):
if not cls._instance:
cls._instance = super(ActivityTracker, cls).__new__(cls)
return cls._instance
async def invoke_tool(self, server_name: str, tool_name: str, args: dict):
if server_name not in self.tools:
raise ValueError(f"Server '{server_name}' not found")
# Find the tool by name
for tool in self.tools[server_name]:
if tool.name == tool_name:
result = await tool.ainvoke(args)
logger.debug(f"type of {type(result)}")
# logger.debug(f"Tool output call {result.con}")
# Check if result is JSON parseable
if isinstance(result, CallToolResult):
result = result.content[0]
if isinstance(result, TextContent):
result = result.text
if isinstance(result, str):
try:
res = json.loads(result)
logger.debug("json res worked!")
return res
except (json.JSONDecodeError, TypeError):
logger.debug("no json tool output !!")
# Not valid JSON, return original result
return result
else:
logger.debug(f"answer is not str answer is of type {type(result)}")
# Result is not a string, return as-is
return result
# Tool not found
available_tools = [tool.name for tool in self.tools[server_name]]
raise ValueError(
f"Tool '{tool_name}' not found in server '{server_name}'. Available tools: {available_tools}"
)
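# Usage sketch (hypothetical server and tool names; assumes tools were registered via set_tools):
#   result = await tracker.invoke_tool("GMAIL", "gmail_list_messages", {"limit": 5})
# JSON-encoded string outputs are decoded into Python objects; any other output is returned as-is.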
def invoke_tool_sync(self, server_name: str, tool_name: str, args: dict):
"""Synchronous version of invoke_tool to avoid async/sync context issues"""
import asyncio
import concurrent.futures
if server_name not in self.tools:
raise ValueError(f"Server '{server_name}' not found")
# Find the tool by name
for tool in self.tools[server_name]:
if tool.name == tool_name:
# Try synchronous invoke first
try:
result = tool.invoke(args) # Use synchronous invoke
except RuntimeError as e:
if "event loop is already running" in str(e):
# We're in an async context, need to handle this differently
try:
# Check if we have a running loop
asyncio.get_running_loop()
# We're in an async context, create a new thread to run the async function
def run_in_new_loop():
new_loop = asyncio.new_event_loop()
asyncio.set_event_loop(new_loop)
try:
# Use async invoke in the new loop
async def async_invoke():
return await tool.ainvoke(args)
return new_loop.run_until_complete(async_invoke())
finally:
new_loop.close()
with concurrent.futures.ThreadPoolExecutor() as executor:
future = executor.submit(run_in_new_loop)
result = future.result()
except RuntimeError:
# No running loop, use asyncio.run
async def async_invoke():
return await tool.ainvoke(args)
result = asyncio.run(async_invoke())
else:
raise
# logger.debug(f"type of {type(result)}")
# logger.debug(f"Tool output call {result}")
# Check if result is JSON parseable
if isinstance(result, CallToolResult):
result = result.content[0]
if isinstance(result, TextContent):
result = result.text
if isinstance(result, str):
try:
res = json.loads(result)
logger.debug("json res worked!")
return res
except (json.JSONDecodeError, TypeError):
logger.debug("no json tool output !!")
# Not valid JSON, return original result
return result
else:
logger.debug(f"answer is not str answer is of type {type(result)}")
# Result is not a string, return as-is
return result
# Tool not found
available_tools = [tool.name for tool in self.tools[server_name]]
raise ValueError(
f"Tool '{tool_name}' not found in server '{server_name}'. Available tools: {available_tools}"
)
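# Note: invoke_tool_sync mirrors invoke_tool for synchronous callers. If it is invoked while an
# event loop is already running (e.g. from code that itself runs inside an event loop), the
# fallback above executes tool.ainvoke in a dedicated thread with its own loop; otherwise it
# falls back to asyncio.run.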
def get_tools_by_server(self, server_name: str) -> Dict[str, Dict]:
tools = self.tools
if server_name not in tools:
return {}
server_tools = {}
for tool in tools[server_name]:
tool_config = {
"app_name": server_name,
"secure": False,
"api_name": tool.name,
"path": '',
"method": '',
"description": tool.description or '',
"parameters": tool.args_schema.model_json_schema(),
"response_schemas": 'Any',
"canary_string": '',
}
server_tools[tool.name] = tool_config
return server_tools
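# The returned mapping is keyed by tool name and mirrors a registry-style API spec, e.g.
# (hypothetical tool): {"gmail_send_email": {"app_name": "GMAIL", "api_name": "gmail_send_email",
#                        "description": "...", "parameters": <JSON schema>, ...}}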
def set_tools(self, tools: List[StructuredTool]):
"""
Detects application prefixes and assigns server_name to tool metadata.
Returns list of AppDefinition objects for all detected applications.
Optionally fills self.tools dictionary with server_name grouped tools.
- For tools with metadata=None OR server_name=None: assigns detected app name or 'default'
- For tools with existing server_name: leaves unchanged
Args:
tools (list): List of tool objects with .name and .metadata attributes
self_tools (dict, optional): Dictionary to fill with server_name grouped tools
Returns:
List[AppDefinition]: List of app definitions with tools description
"""
self.tools = {}
# logger.debug(f"tools: {tools}")
# Common prefixes to exclude (HTTP methods, etc.)
excluded_prefixes = {'get', 'post', 'put', 'delete', 'patch', 'head', 'options', 'trace'}
# Step 1: Extract tool names for analysis (only for tools that need server_name assignment)
tools_to_process = [
tool for tool in tools if tool.metadata is None or tool.metadata.get("server_name", None) is None
]
tool_names = [tool.name for tool in tools_to_process]
# Step 2: Find potential prefixes and count occurrences
prefix_candidates = {}
for tool_name in tool_names:
# Split by underscore and take the first part as potential prefix
if '_' in tool_name:
potential_prefix = tool_name.split('_')[0].lower()
# Skip if it's an excluded prefix
if potential_prefix not in excluded_prefixes:
if potential_prefix not in prefix_candidates:
prefix_candidates[potential_prefix] = []
prefix_candidates[potential_prefix].append(tool_name)
# Step 3: Filter prefixes that appear in multiple tools (consistency check)
detected_applications = {}
for prefix, tool_list in prefix_candidates.items():
if len(tool_list) > 1: # Prefix appears in multiple tools - consistent!
detected_applications[prefix.upper()] = tool_list
# Step 4: Assign server_name to metadata for tools that need it
for tool in tools:
# Only process tools with metadata=None OR server_name=None
if tool.metadata is None or tool.metadata.get("server_name", None) is None:
tool_name = tool.name
server_name = 'default_app' # Default assignment
# Check if this tool belongs to any detected application
for app_name, app_tools in detected_applications.items():
if tool_name in app_tools:
server_name = app_name
break
# Initialize metadata if it's None, otherwise just update server_name
if tool.metadata is None:
tool.metadata = {"server_name": server_name}
else:
tool.metadata["server_name"] = server_name
# Step 5: Fill self.tools dictionary if provided
for tool in tools:
# Get server_name from tool metadata
server_name = tool.metadata.get('server_name')
# This should not happen after Step 4; fail loudly if a tool still lacks a server_name
if server_name is None:
raise Exception("Tool server_name is None after prefix assignment")
# Initialize list for this server if it doesn't exist
if server_name not in self.tools:
self.tools[server_name] = []
# Add tool to the appropriate server group
self.tools[server_name].append(tool)
# Step 6: Collect all unique server_names and their associated tools
app_tools_map = {}
for tool in tools:
if tool.metadata is not None:
server_name = tool.metadata.get("server_name")
if server_name:
if server_name not in app_tools_map:
app_tools_map[server_name] = []
app_tools_map[server_name].append(tool)
# Step 7: Create AppDefinition objects
app_definitions = []
for app_name, tool_list in app_tools_map.items():
tools_description = "Available tools:\n" + "\n".join(
f"{tool.name}: {tool.description}" if tool.description else f"{tool.name}:"
for tool in sorted(tool_list, key=lambda x: x.name)
)
app_def = AppDefinition(name=app_name, description=tools_description, url=None)
app_definitions.append(app_def)
self.apps = app_definitions
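# Illustrative sketch (hypothetical tool names): given tools "gmail_send_email",
# "gmail_list_messages" and a bare "search", the shared "gmail" prefix appears in more than
# one name, so set_tools produces:
#   self.tools -> {"GMAIL": [gmail_send_email, gmail_list_messages], "default_app": [search]}
#   self.apps  -> [AppDefinition(name="GMAIL", ...), AppDefinition(name="default_app", ...)]
# Tools that already carry metadata={"server_name": ...} keep their existing assignment.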
def set_base_dir(self, base_dir: str) -> None:
"""
Set the base directory for logging trajectory data.
Args:
base_dir (str): The base directory path for storing experiment data
"""
self._base_dir = base_dir
logger.info(f"Base directory set to: {self._base_dir}")
def get_base_dir(self) -> str:
"""
Get the current base directory for logging trajectory data.
Returns:
str: The current base directory path
"""
return self._base_dir
def get_current_trajectory_path(self) -> Optional[str]:
"""
Get the full path of the current experiment folder.
Returns:
Optional[str]: The full path of the experiment folder, or None if no experiment is active.
"""
if self.experiment_folder:
return os.path.join(self._base_dir, self.experiment_folder, self.task_id + ".json")
return ""
def generate_session_id(self):
self.session_id = random_id_with_timestamp(full_date=True)
def reset(self, intent, task_id="default"):
self.token_usage = 0
self.start_time = time.time()
self.current_date = None
self.pi = None
self.prompts = []
self.steps = []
self.images = []
self.actions_count = 0
self.final_answer = None
self.task_id = task_id
self.intent = intent
self.user_id = None
def reload_steps(self, task_id: Optional[str] = None) -> bool:
"""
Reload steps from the current experiment's task JSON file.
Args:
task_id (str, optional): Task ID to reload. If None, uses current task_id.
Returns:
bool: True if steps were successfully reloaded, False otherwise.
"""
# Use provided task_id or fall back to current task_id
target_task_id = task_id if task_id is not None else self.task_id
if not target_task_id or target_task_id == "default":
logger.error("No valid task_id provided for reloading steps")
return False
# Get the trajectory path for the specified task
self.task_id = target_task_id
trajectory_path = self.get_current_trajectory_path()
if not trajectory_path:
logger.error(f"No trajectory path found for task_id: {target_task_id}")
return False
if not os.path.exists(trajectory_path):
logger.error(f"Trajectory file does not exist: {trajectory_path}")
return False
try:
# Read the JSON file
with open(trajectory_path, 'r', encoding='utf-8') as f:
trajectory_data = json.load(f)
# Extract steps from the JSON
steps_data = trajectory_data.get('steps', [])
# Convert dictionaries back to Step objects
reloaded_steps = []
for step_dict in steps_data:
try:
step = Step(**step_dict)
reloaded_steps.append(step)
except Exception as e:
logger.warning(f"Failed to convert step data to Step object: {e}")
continue
# Update current steps
self.steps = reloaded_steps
logger.info(f"Successfully reloaded {len(reloaded_steps)} steps for task_id: {target_task_id}")
return True
except (json.JSONDecodeError, IOError) as e:
logger.error(f"Error reading trajectory file {trajectory_path}: {e}")
return False
except Exception as e:
logger.error(f"Unexpected error while reloading steps: {e}")
return False
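# Example (hypothetical task id): tracker.reload_steps("task_1") reads
# <base_dir>/<experiment_folder>/task_1.json and replaces tracker.steps with the persisted steps.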
def start_experiment(
self, task_ids: List[str], experiment_name: str, description: Optional[str] = ""
) -> str:
"""
Start a new experiment with given task IDs.
Args:
task_ids (List[str]): List of task IDs for this experiment
experiment_name (str): Name of the experiment
description (str, optional): Description of the experiment
Returns:
str: The experiment folder name
"""
# Generate experiment folder name using mask_with_timestamp
self.experiment_folder = mask_with_timestamp(experiment_name, full_date=True)
# Create metadata
self.tasks_metadata = TasksMetadata(
task_ids=task_ids,
description=description,
experiment_name=experiment_name,
experiment_folder=self.experiment_folder,
created_at=datetime.now().isoformat(),
)
# Only create files and directories if tracker is enabled
if settings.advanced_features.tracker_enabled:
# Create directory structure
experiment_dir = os.path.join(self._base_dir, self.experiment_folder)
os.makedirs(experiment_dir, exist_ok=True)
# Save metadata to file
metadata_path = os.path.join(experiment_dir, "metadata.json")
with open(metadata_path, 'w', encoding='utf-8') as f:
json.dump(self.tasks_metadata.model_dump(), f, indent=2, ensure_ascii=False)
# Initialize empty files
self._initialize_experiment_files(experiment_dir)
# Reset tasks dictionary
self.tasks = {}
if settings.advanced_features.enable_memory:
from cuga.backend.memory.agentic_memory.client.exceptions import NamespaceNotFoundException
try:
self.memory.get_namespace_details(namespace_id="memory")
except NamespaceNotFoundException:
self.memory.create_namespace(namespace_id="memory")
self.memory.create_run(namespace_id="memory", run_id=self.experiment_folder)
# Start timer
self.start_time = time.time()
return self.experiment_folder
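# Example (hypothetical task ids): start_experiment(["task_1", "task_2"], "baseline_run") creates
# <base_dir>/<timestamped folder derived from "baseline_run">/ containing metadata.json,
# results.csv, results.json and .progress (when the tracker is enabled), and returns the folder name.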
def _initialize_experiment_files(self, experiment_dir: str) -> None:
"""Initialize empty result files for the experiment."""
# Define column order for CSV
columns = [
'task_id',
'site',
'intent',
'agent_answer',
'eval',
'score',
'exception',
'num_steps',
'fail_category',
'agent_v',
]
# Create empty results.csv
results_csv_path = os.path.join(experiment_dir, "results.csv")
df = pd.DataFrame(columns=columns)
df.to_csv(results_csv_path, index=False, encoding='utf-8')
# Create empty results.json
results_json_path = os.path.join(experiment_dir, "results.json")
with open(results_json_path, 'w', encoding='utf-8') as f:
json.dump({}, f, indent=2, ensure_ascii=False)
# Create empty .progress file
progress_path = os.path.join(experiment_dir, ".progress")
with open(progress_path, 'w', encoding='utf-8') as f:
f.write("")
def collect_prompt(self, role: str, value: str):
self.prompts.append(Prompt(role=role, value=value))
def collect_tokens_usage(self, count: int) -> None:
"""
Increases the number of tokens used.
Args:
count (int): The number of tokens to add to the running total.
"""
self.token_usage += count
def collect_image(self, img: str) -> None:
if not img:
return
# Ensure the image string is compatible with OpenAI vision API: must be a valid URL or a data URL.
if img.startswith("data:image") or img.startswith("http://") or img.startswith("https://"):
self.images.append(img)
else:
# Assume raw base64 PNG data; prepend appropriate data URL header.
self.images.append(f"data:image/png;base64,{img}")
def collect_step(self, step: Step) -> None:
"""
Collects a step, adding it to the steps list.
Args:
step (Step): The step to record.
"""
data_json = None
try:
data_json = json.loads(step.data)
except Exception:
pass
# Attach any collected prompts to this step so they are persisted
if getattr(self, "prompts", None):
try:
step.prompts = list(self.prompts)
except Exception:
# Ensure prompts never break logging
step.prompts = []
# Attach the most recent captured image (if any) to the step
if getattr(self, "images", None):
try:
# Use the last captured screenshot as the "before" image
step.image_before = self.images[-1]
except Exception:
step.image_before = None
if AGENT_ANALYTICS:
if step.name == "TaskAnalyzerAgent":
AIEventRecorder.record_data_annotation(
name=step.name,
annotation_type=DataAnnotation.Type.RAW_TEXT,
annotation_title="Intent",
annotation_content=self.intent,
)
if step.name == "CodeAgent":
res_obj = CodeAgentOutput(**json.loads(step.data))
AIEventRecorder.record_data_annotation(
name="CodeAgent",
annotation_type=DataAnnotation.Type.CODE_GENERATION,
annotation_title="Generated Code",
annotation_content="\n" + res_obj.code,
)
AIEventRecorder.record_data_annotation(
name="CodeAgent",
annotation_type=DataAnnotation.Type.CODE_SNIPPET,
annotation_title="Code output",
annotation_content="\n" + res_obj.execution_output,
)
AIEventRecorder.record_data_annotation(
name="CodeAgent",
annotation_type=DataAnnotation.Type.RAW_TEXT,
annotation_title="Output summary",
annotation_content="\n" + res_obj.summary,
)
else:
if data_json and isinstance(data_json, dict):
if data_json.get('thoughts', None):
AIEventRecorder.record_data_annotation(
name=step.name,
annotation_type=DataAnnotation.Type.THOUGHT,
annotation_title=step.name,
annotation_content=f"{data_json.get('thoughts', None)}",
)
if len(list(data_json.keys())) == 1 and isinstance(
data_json[list(data_json.keys())[0]], str
):
AIEventRecorder.record_data_annotation(
name=step.name,
annotation_type=DataAnnotation.Type.RAW_TEXT,
annotation_title=step.name,
annotation_content=f"\n\n{data_json[list(data_json.keys())[0]]}",
)
else:
AIEventRecorder.record_data_annotation(
name=step.name,
annotation_type=DataAnnotation.Type.RAW_TEXT,
annotation_title=step.name,
annotation_content=json.dumps(data_json),
)
else:
AIEventRecorder.record_data_annotation(
name=step.name,
annotation_type=DataAnnotation.Type.RAW_TEXT,
annotation_title=step.name,
annotation_content=f"{step.data}",
)
if step.image_before:
AIEventRecorder.record_data_annotation(
name=step.name,
annotation_type=DataAnnotation.Type.MULTIMODAL_DATA,
annotation_title="Image",
annotation_content=f"{step.image_before}",
)
if settings.advanced_features.enable_memory:
from cuga.backend.memory.agentic_memory.utils.prompts import prompts
# Include intent in step metadata so it's available during tip extraction
step_data = step.model_dump()
step_data['intent'] = self.intent # Add the user's task intent
self.memory.add_step(
namespace_id='memory',
run_id=self.experiment_folder,
step=step_data,
prompt=prompts[step.name],
)
step.prompts = copy.deepcopy(self.prompts)
self.prompts = []
self.steps.append(step)
if settings.advanced_features.enable_memory and step.name == "FinalAnswerAgent":
# End run and execute any background processing.
self.memory.end_run(namespace_id="memory", run_id=self.experiment_folder)
if settings.advanced_features.tracker_enabled:
self.to_file()
self.prompts = []
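# Each collected Step is persisted together with the prompts gathered since the previous step and,
# when screenshots were captured, the most recent image as image_before; analytics and memory
# recording only run when the corresponding advanced features are enabled.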
def collect_step_external(self, step: Step, full_path: Optional[str] = None) -> None:
"""
Collects a step and saves it to a separate log file in a directory
specified by an environment variable.
The path is retrieved from os.environ['current_folder_path'].
The steps are saved to a file named 'recordinglg.json' in that directory.
Args:
step (Step): The Step object to collect.
full_path (Optional[str]): The full file path to save to. If None, the step is skipped.
TODO: Properly handle None full_path case - either provide a default path or make the
calling code always provide a valid path. Currently returns early if None to avoid errors.
"""
try:
if not settings.advanced_features.tracker_enabled:
return
# TODO: Handle None full_path properly - either use a default path or require callers to provide one
if not full_path:
logger.debug("Skipping external step collection: full_path is None")
return
if not os.path.exists(os.path.dirname(full_path)):
logger.error(
f"External path directory not found or does not exist: {os.path.dirname(full_path)}"
)
return
step.prompts = copy.deepcopy(self.prompts)
self.prompts = []
self.steps.append(step)
self._to_file_external_append(full_path, step)
logger.info(f"Step appended to external file: {full_path}")
except Exception as e:
logger.error(f"Failed to collect and save external step: {e}")
def _to_file_external_append(self, full_path: str, new_step: Step):
"""
Append a new step to an existing JSON file or create a new file if it doesn't exist.
This method reads the existing file, appends the new step, and saves it back.
Args:
full_path (str): The full file path to save/append to.
new_step (Step): The new step to append.
"""
try:
# Check if file exists and read existing data
if os.path.exists(full_path):
with open(full_path, 'r', encoding='utf-8') as f:
try:
existing_data = json.load(f)
# Ensure the existing data has the expected structure
if not isinstance(existing_data, dict) or 'steps' not in existing_data:
logger.warning(f"Invalid JSON structure in {full_path}, creating new file")
existing_data = None
except json.JSONDecodeError as e:
logger.warning(f"Invalid JSON in {full_path}, creating new file: {e}")
existing_data = None
else:
existing_data = None
# If no valid existing data, create new structure
if existing_data is None:
data_to_save = {
"intent": self.intent,
"dataset_name": self.dataset_name,
"actions_count": self.actions_count,
"task_id": self.task_id,
"eval": self.eval,
"steps": [new_step.model_dump()],
"score": self.score,
}
else:
# Update existing data with new step
existing_data["steps"].append(new_step.model_dump())
# Update other fields that might have changed
existing_data.update(
{
"intent": self.intent,
"dataset_name": self.dataset_name,
"actions_count": self.actions_count,
"task_id": self.task_id,
"eval": self.eval,
"score": self.score,
}
)
data_to_save = existing_data
# Write the updated data back to file
with open(full_path, 'w', encoding='utf-8') as f:
json.dump(
data_to_save,
f,
ensure_ascii=False,
indent=4,
)
except Exception as e:
logger.error(f"Failed to append step to file {full_path}: {e}")
raise
def collect_score(self, score: float) -> None:
"""
Records the final task score and, if tracking is enabled, persists the trajectory to file.
Args:
score (float): The score to record for the current task.
"""
self.score = score
if settings.advanced_features.tracker_enabled:
self.to_file()
def collect_step_with_pass(self) -> None:
"""
Placeholder for collecting a step.
"""
pass
def to_file(self):
"""Save current task data to file in the experiment directory."""
if self.experiment_folder:
# Save to experiment directory
source_dir = os.path.join(self._base_dir, self.experiment_folder)
else:
# Fallback to original behavior
source_dir = "logging{}".format("_" + self.dataset_name if self.dataset_name else "")
os.makedirs(source_dir, exist_ok=True)
filename = self.task_id if self.task_id != "default" else self.session_id
filepath = os.path.join(source_dir, f"{filename}.json")
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(
{
"intent": self.intent,
"dataset_name": self.dataset_name,
"actions_count": self.actions_count,
"task_id": self.task_id,
"eval": self.eval,
"steps": [d.model_dump() for d in self.steps],
"score": self.score,
},
f,
ensure_ascii=False,
indent=4,
)
def finish_task(
self,
task_id: str,
site: str,
intent: str,
agent_answer: Optional[str] = None,
eval: Optional[str] = None,
score: Optional[float] = None,
exception: Optional[bool] = None,
num_steps: Optional[int] = None,
fail_category: Optional[str] = None,
agent_v: Optional[str] = None,
duration: Optional[int] = None,
total_llm_calls: Optional[int] = None,
total_tokens: Optional[int] = None,
total_cost: Optional[float] = None,
total_cache_input_tokens: Optional[int] = None,
) -> str:
"""
Mark a task as finished and update result files.
Args:
task_id (str): Required unique identifier for the task
site (str): Required site name
intent (str): Task intent/description
agent_answer (str, optional): Agent's answer
eval (str, optional): Evaluation details
score (float, optional): Task score
exception (bool, optional): Whether an exception occurred
num_steps (int, optional): Number of steps taken
fail_category (str, optional): Category of failure if applicable
agent_v (str, optional): Agent version
duration (int, optional): Task duration in seconds; defaults to elapsed time since start
total_llm_calls (int, optional): Total number of LLM calls
total_tokens (int, optional): Total tokens used; defaults to the tracked token_usage
total_cost (float, optional): Total cost of the task
total_cache_input_tokens (int, optional): Total cached input tokens
Returns:
str: The ID of the finished task
"""
if not self.experiment_folder:
raise ValueError("No experiment started. Call start_experiment() first.")
# Calculate number of api calls
api_calls_num = len([step for step in self.steps if "api_call" in step.name])
# Add task to internal storage
self.tasks[task_id] = {
"site": site,
"intent": intent,
"agent_answer": agent_answer,
"eval": eval,
"score": score,
"exception": exception,
"num_steps": num_steps if num_steps is not None else len(self.steps),
"fail_category": fail_category,
"agent_v": agent_v,
"duration": duration if duration is not None else time.time() - self.start_time,
"total_llm_calls": total_llm_calls,
"total_tokens": self.token_usage if not total_tokens else total_tokens,
"api_calls": api_calls_num,
"total_cost": total_cost,
"total_cache_input_tokens": total_cache_input_tokens,
}
# Update result files only if tracker is enabled
if settings.advanced_features.tracker_enabled:
self._update_result_files()
self._add_to_progress_file(task_id)
return task_id
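# Example (hypothetical values):
#   tracker.finish_task(task_id="task_1", site="gitlab", intent="Create an issue",
#                       agent_answer="Issue created", score=1.0, exception=False)
# adds the row to results.json/results.csv and appends the task id to .progress.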
def _update_result_files(self) -> None:
"""Update both JSON and CSV result files."""
if not self.experiment_folder:
return
experiment_dir = os.path.join(self._base_dir, self.experiment_folder)
# Update results.json
results_json_path = os.path.join(experiment_dir, "results.json")
with open(results_json_path, 'w', encoding='utf-8') as f:
json.dump(self.tasks, f, indent=2, ensure_ascii=False)
# Update results.csv
self._save_csv(experiment_dir)
def _save_csv(self, experiment_dir: str) -> None:
"""Save current tasks to CSV file using pandas."""
# Define the column order
columns = [
'task_id',
'site',
'intent',
'agent_answer',
'eval',
'score',
'exception',
'num_steps',
'fail_category',
'agent_v',
'duration',
'total_llm_calls',
'total_tokens',
'api_calls',
'total_cost',
'total_cache_input_tokens',
]
if not self.tasks:
# Create empty DataFrame with headers if no tasks
df = pd.DataFrame(columns=columns)
else:
# Convert tasks dictionary to list of dictionaries for DataFrame
data = []
for task_id, task_data in self.tasks.items():
row = {'task_id': task_id}
row.update(task_data)
data.append(row)
# Create DataFrame
df = pd.DataFrame(data)
# Reorder columns to match the desired order
df = df.reindex(columns=columns)
# Save to CSV
results_csv_path = os.path.join(experiment_dir, "results.csv")
df.to_csv(results_csv_path, index=False, encoding='utf-8')
def _add_to_progress_file(self, task_id: str) -> None:
"""Add a task ID to the .progress file."""
if not self.experiment_folder:
return
progress_path = os.path.join(self._base_dir, self.experiment_folder, ".progress")
with open(progress_path, 'a', encoding='utf-8') as f:
f.write(task_id + '\n')
def update_task(
self,
task_id: str,
site: Optional[str] = None,
intent: Optional[str] = None,
agent_answer: Optional[str] = None,
eval: Optional[str] = None,
score: Optional[float] = None,
exception: Optional[bool] = None,
num_steps: Optional[int] = None,
fail_category: Optional[str] = None,
agent_v: Optional[str] = None,
) -> bool:
"""
Update an existing task.
Args:
task_id (str): ID of the task to update
site (str, optional): New site
intent (str, optional): New intent
agent_answer (str, optional): New agent answer
eval (str, optional): New evaluation
score (float, optional): New score
exception (bool, optional): New exception status
num_steps (int, optional): New number of steps
fail_category (str, optional): New fail category
agent_v (str, optional): New agent version
Returns:
bool: True if task was updated, False if task not found
"""
if task_id not in self.tasks:
return False
# Update only provided fields
if site is not None:
self.tasks[task_id]["site"] = site
if intent is not None:
self.tasks[task_id]["intent"] = intent
if agent_answer is not None:
self.tasks[task_id]["agent_answer"] = agent_answer
if eval is not None:
self.tasks[task_id]["eval"] = eval
if score is not None:
self.tasks[task_id]["score"] = score
if exception is not None:
self.tasks[task_id]["exception"] = exception
if num_steps is not None:
self.tasks[task_id]["num_steps"] = num_steps
if fail_category is not None:
self.tasks[task_id]["fail_category"] = fail_category
if agent_v is not None:
self.tasks[task_id]["agent_v"] = agent_v
if settings.advanced_features.tracker_enabled:
self._update_result_files()
return True
def remove_task(self, task_id: str) -> bool:
"""
Remove a task from the results.
Args:
task_id (str): ID of the task to remove
Returns:
bool: True if task was removed, False if task not found
"""
if task_id in self.tasks:
del self.tasks[task_id]
if settings.advanced_features.tracker_enabled:
self._update_result_files()
return True
return False
def get_task(self, task_id: str) -> Optional[Dict[str, Any]]:
"""
Get a specific task by ID.
Args:
task_id (str): ID of the task to retrieve
Returns:
Dict containing task data or None if not found
"""
return self.tasks.get(task_id)
def get_all_tasks(self) -> Dict[str, Dict[str, Any]]:
"""
Get all tasks.
Returns:
Dict containing all tasks
"""
return self.tasks.copy()
def find_tasks_by_score(self, score: float) -> Dict[str, Dict[str, Any]]:
"""
Find all tasks with a specific score.
Args:
score (float): Score to search for
Returns:
Dict containing matching tasks
"""
return {task_id: task for task_id, task in self.tasks.items() if task.get("score") == score}
def find_tasks_by_site(self, site: str) -> Dict[str, Dict[str, Any]]:
"""
Find all tasks with a specific site.
Args:
site (str): Site to search for
Returns:
Dict containing matching tasks
"""
return {task_id: task for task_id, task in self.tasks.items() if task.get("site") == site}
def find_tasks_by_exception(self, exception: bool) -> Dict[str, Dict[str, Any]]:
"""
Find all tasks with specific exception status.
Args:
exception (bool): Exception status to search for
Returns:
Dict containing matching tasks
"""
return {task_id: task for task_id, task in self.tasks.items() if task.get("exception") == exception}
def find_tasks_by_agent_version(self, agent_v: str) -> Dict[str, Dict[str, Any]]:
"""
Find all tasks with a specific agent version.
Args:
agent_v (str): Agent version to search for
Returns:
Dict containing matching tasks
"""
return {task_id: task for task_id, task in self.tasks.items() if task.get("agent_v") == agent_v}
def clear_all_tasks(self) -> None:
"""Remove all tasks from result files."""
self.tasks = {}
if self.experiment_folder and settings.advanced_features.tracker_enabled:
self._update_result_files()
# Clear progress file
progress_path = os.path.join(self._base_dir, self.experiment_folder, ".progress")
with open(progress_path, 'w', encoding='utf-8') as f:
f.truncate(0)
def get_task_count(self) -> int:
"""
Get the total number of tasks.
Returns:
int: Number of tasks
"""
return len(self.tasks)
def get_statistics(self) -> Dict[str, Any]:
"""
Get basic statistics about the tasks.
Returns:
Dict containing task statistics
"""
if not self.tasks:
return {"total_tasks": 0}
stats = {
"total_tasks": len(self.tasks),
"tasks_with_exceptions": len([t for t in self.tasks.values() if t.get("exception") is True]),
"tasks_without_exceptions": len([t for t in self.tasks.values() if t.get("exception") is False]),
"unique_sites": len(set(t.get("site") for t in self.tasks.values() if t.get("site"))),
"unique_agent_versions": len(
set(t.get("agent_v") for t in self.tasks.values() if t.get("agent_v"))
),
}
# Score statistics
scores = [t.get("score") for t in self.tasks.values() if t.get("score") is not None]
if scores:
stats["average_score"] = sum(scores) / len(scores)
stats["min_score"] = min(scores)
stats["max_score"] = max(scores)
return stats
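# Example return value (hypothetical numbers):
#   {"total_tasks": 2, "tasks_with_exceptions": 1, "tasks_without_exceptions": 1,
#    "unique_sites": 2, "unique_agent_versions": 1,
#    "average_score": 0.5, "min_score": 0.0, "max_score": 1.0}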
def get_dataframe(self) -> pd.DataFrame:
"""
Get all tasks as a pandas DataFrame.
Returns:
pd.DataFrame: DataFrame containing all tasks
"""
columns = [
'task_id',
'site',
'intent',
'agent_answer',
'eval',
'score',
'exception',
'num_steps',
'fail_category',
'agent_v',
]
if not self.tasks:
return pd.DataFrame(columns=columns)
data = []
for task_id, task_data in self.tasks.items():
row = {'task_id': task_id}
row.update(task_data)
data.append(row)
df = pd.DataFrame(data)
return df.reindex(columns=columns)
def _copy_task_json_files(
self,
source_folders: List[str],
target_folder: str,
selected_task_ids: List[str],
base_dir: Optional[str] = None,
) -> None:
"""
Copy individual task JSON files from source folders to target folder.
Args:
source_folders (List[str]): List of source experiment folder names
target_folder (str): Target experiment folder name
selected_task_ids (List[str]): List of task IDs to copy
base_dir (str, optional): Base directory. If None, uses instance base_dir
"""
if base_dir is None:
base_dir = self._base_dir
target_dir = os.path.join(base_dir, target_folder)
copied_files = 0
skipped_files = 0
for task_id in selected_task_ids:
file_found = False
# Look for the task JSON file in each source folder
for folder_name in source_folders:
source_dir = os.path.join(base_dir, folder_name)
source_file = os.path.join(source_dir, f"{task_id}.json")
if os.path.exists(source_file):
target_file = os.path.join(target_dir, f"{task_id}.json")
try:
# Copy the file
shutil.copy2(source_file, target_file)
logger.debug(f"Copied {task_id}.json from {folder_name}")
copied_files += 1
file_found = True
break # Found and copied, move to next task
except Exception as e:
logger.error(f"Failed to copy {task_id}.json from {folder_name}: {e}")
if not file_found:
logger.warning(f"Task JSON file {task_id}.json not found in any source folder")
skipped_files += 1
logger.info(f"Task JSON files - Copied: {copied_files}, Skipped: {skipped_files}")
def merge_experiments(
self,
experiment_folders: List[str],
output_experiment_name: str,
description: Optional[str] = "Merged experiments",
output_folder: Optional[str] = None,
) -> MergeResult:
"""
Merge multiple experiment folders, preferring tasks with score 1.0 over 0.0.
Also copies individual task JSON files from source experiments.
Args:
experiment_folders (List[str]): List of experiment folder names to merge
output_experiment_name (str): Name for the merged experiment
description (str, optional): Description for the merged experiment
output_folder (str, optional): Optional output folder name (not used by the current implementation)
Returns:
MergeResult: Contains folder_name and merged_task_ids
"""
logger.info(f"Starting merge of {len(experiment_folders)} experiments")
# Create new experiment for merged results
merged_folder = self.start_experiment(
task_ids=[], # Will be populated with merged task IDs
experiment_name=output_experiment_name,
description=description,
)
merged_tasks = {}
all_task_ids = set()
task_source_mapping = {} # Track which folder each task came from
# First pass: collect all tasks and identify duplicates
for folder_name in experiment_folders:
folder_path = os.path.join(self._base_dir, folder_name)
results_json_path = os.path.join(folder_path, "results.json")
if not os.path.exists(results_json_path):
logger.warning(f"Results file not found in {folder_name}, skipping")
continue
try:
with open(results_json_path, 'r', encoding='utf-8') as f:
folder_tasks = json.load(f)
logger.info(f"Processing {len(folder_tasks)} tasks from {folder_name}")
for task_id, task_data in folder_tasks.items():
all_task_ids.add(task_id)
if task_id not in merged_tasks:
# First occurrence of this task
merged_tasks[task_id] = {**task_data, 'source_experiment': folder_name}
task_source_mapping[task_id] = folder_name
logger.debug(f"Added new task {task_id} from {folder_name}")
else:
# Task already exists, apply preference logic
existing_score = merged_tasks[task_id].get('score', 0.0)
new_score = task_data.get('score', 0.0)
if existing_score == 1.0 and new_score != 1.0:
# Keep existing (perfect score)
should_replace = False
elif existing_score != 1.0 and new_score == 1.0:
# Replace with perfect score
should_replace = True
elif existing_score == new_score:
# Same score, keep existing (first found)
should_replace = False
else:
# Different scores, prefer higher
should_replace = new_score > existing_score
if should_replace:
merged_tasks[task_id] = {**task_data, 'source_experiment': folder_name}
task_source_mapping[task_id] = folder_name
logger.debug(
f"Replaced task {task_id}: {existing_score} -> {new_score} from {folder_name}"
)
else:
logger.debug(
f"Kept existing task {task_id}: score {existing_score} vs {new_score}"
)
except Exception as e:
logger.error(f"Error processing {folder_name}: {e}")
continue
# Update the merged experiment with final task list
self.tasks = merged_tasks
# Update metadata with actual task IDs
if self.tasks_metadata:
self.tasks_metadata.task_ids = list(all_task_ids)
# Save updated metadata
experiment_dir = os.path.join(self._base_dir, merged_folder)
metadata_path = os.path.join(experiment_dir, "metadata.json")
with open(metadata_path, 'w', encoding='utf-8') as f:
json.dump(self.tasks_metadata.model_dump(), f, indent=2, ensure_ascii=False)
# Update result files with merged data only if tracker is enabled
if settings.advanced_features.tracker_enabled:
self._update_result_files()
# Update progress file with all task IDs
for task_id in merged_tasks.keys():
self._add_to_progress_file(task_id)
# Copy individual task JSON files only if tracker is enabled
if settings.advanced_features.tracker_enabled:
logger.info("Copying individual task JSON files...")
selected_task_ids = list(merged_tasks.keys())
self._copy_task_json_files(experiment_folders, merged_folder, selected_task_ids)
logger.success(f"Successfully merged {len(merged_tasks)} tasks into {merged_folder}")
logger.info(f"Source experiments: {experiment_folders}")
score_distribution = {}
source_distribution = {}
for task_data in merged_tasks.values():
score = task_data.get('score', 0.0)
source = task_data.get('source_experiment', 'unknown')
score_distribution[score] = score_distribution.get(score, 0) + 1
source_distribution[source] = source_distribution.get(source, 0) + 1
logger.info(f"Score distribution in merged results: {score_distribution}")
logger.info(f"Source distribution in merged results: {source_distribution}")
# Return MergeResult
return MergeResult(folder_name=merged_folder, merged_task_ids=list(merged_tasks.keys()))
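# Example (hypothetical folder names): merging two runs, keeping the higher-scoring copy of any
# task that appears in both:
#   merged = tracker.merge_experiments(["run_a_folder", "run_b_folder"], "merged_run")
#   merged.folder_name, merged.merged_task_ids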
def list_experiment_folders(self, base_path: Optional[str] = None) -> List[str]:
"""
List all available experiment folders.
Args:
base_path (str, optional): Base directory to search for experiments.
If None, uses instance base_dir
Returns:
List[str]: List of experiment folder names
"""
if base_path is None:
base_path = self._base_dir
if not os.path.exists(base_path):
logger.warning(f"Base path {base_path} does not exist")
return []
folders = []
for item in os.listdir(base_path):
item_path = os.path.join(base_path, item)
if os.path.isdir(item_path):
# Check if it looks like an experiment folder (has metadata.json)
metadata_path = os.path.join(item_path, "metadata.json")
if os.path.exists(metadata_path):
folders.append(item)
logger.info(f"Found {len(folders)} experiment folders")
return sorted(folders)
@staticmethod
def list_experiment_folders_static(base_path: str = "./logging/trajectory_data") -> List[str]:
"""
Static method to list all available experiment folders.
Args:
base_path (str): Base directory to search for experiments
Returns:
List[str]: List of experiment folder names
"""
if not os.path.exists(base_path):
logger.warning(f"Base path {base_path} does not exist")
return []
folders = []
for item in os.listdir(base_path):
item_path = os.path.join(base_path, item)
if os.path.isdir(item_path):
# Check if it looks like an experiment folder (has metadata.json)
metadata_path = os.path.join(item_path, "metadata.json")
if os.path.exists(metadata_path):
folders.append(item)
logger.info(f"Found {len(folders)} experiment folders")
return sorted(folders)
def get_experiment_progress(self, experiment_folder_name: str) -> Dict[str, Any]:
"""
Get the progress of a specific experiment.
Args:
experiment_folder_name (str): The name of the experiment folder.
Returns:
Dict[str, Any]: A dictionary containing 'total_tasks', 'completed_tasks', and 'uncompleted_task_ids'.
Returns default values if files are not found or errors occur.
"""
experiment_dir = os.path.join(self._base_dir, experiment_folder_name)
metadata_path = os.path.join(experiment_dir, "metadata.json")
progress_path = os.path.join(experiment_dir, ".progress")
total_tasks = 0
completed_tasks = 0
all_task_ids = set()
completed_task_ids = set()
# Read total tasks from metadata.json
if os.path.exists(metadata_path):
try:
with open(metadata_path, 'r', encoding='utf-8') as f:
metadata = json.load(f)
all_task_ids = set(metadata.get('task_ids', []))
total_tasks = len(all_task_ids)
except (json.JSONDecodeError, IOError) as e:
logger.error(f"Error reading metadata.json for {experiment_folder_name}: {e}")
else:
logger.warning(f"metadata.json not found for experiment: {experiment_folder_name}")
# Read completed tasks from .progress
if os.path.exists(progress_path):
try:
with open(progress_path, 'r', encoding='utf-8') as f:
completed_task_ids = set(line.strip() for line in f if line.strip())
completed_tasks = len(completed_task_ids)
except IOError as e:
logger.error(f"Error reading .progress file for {experiment_folder_name}: {e}")
else:
logger.info(f".progress file not found for experiment: {experiment_folder_name}")
uncompleted_task_ids = sorted(all_task_ids - completed_task_ids)
return {
"total_tasks": total_tasks,
"completed_tasks": completed_tasks,
"uncompleted_task_ids": uncompleted_task_ids,
}
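
# Minimal end-to-end usage sketch (assumptions: a writable TRAJECTORY_DATA_DIR, and the optional
# memory/analytics features disabled or properly configured; the task id and intent below are
# hypothetical). ActivityTracker is a singleton, so repeated instantiation returns the same object.
if __name__ == "__main__":
    tracker = ActivityTracker()
    tracker.start_experiment(task_ids=["demo_task"], experiment_name="smoke_test")
    tracker.reset(intent="List open issues", task_id="demo_task")
    tracker.collect_step(Step(name="TaskAnalyzerAgent", data='{"thoughts": "analyze the intent"}'))
    tracker.finish_task(task_id="demo_task", site="demo", intent="List open issues", score=1.0)
    print(tracker.get_statistics())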