diff --git a/packages/dash_evals/pyrefly.toml b/packages/dash_evals/pyrefly.toml index d35f141..04c8362 100644 --- a/packages/dash_evals/pyrefly.toml +++ b/packages/dash_evals/pyrefly.toml @@ -1,4 +1,4 @@ # Pyrefly configuration # Tell Pyrefly to use the repo-root venv Python interpreter -python-interpreter = "../../.venv/bin/python" +python-interpreter = "../../../dash_evals/.venv/bin/python" diff --git a/packages/dash_evals/src/dash_evals/runner/solvers/setup_workspace.py b/packages/dash_evals/src/dash_evals/runner/solvers/setup_workspace.py index d0d4251..3d6eb81 100644 --- a/packages/dash_evals/src/dash_evals/runner/solvers/setup_workspace.py +++ b/packages/dash_evals/src/dash_evals/runner/solvers/setup_workspace.py @@ -103,9 +103,17 @@ async def _setup_local_workspace(state: TaskState) -> TaskState: # absolute paths that are invalid after copying to a new location. dep_cmd = state.metadata.get("dep_install_cmd", ["flutter", "pub", "get"]) sb = sandbox() + + working_dir_arg = state.metadata.get("working_dir") + if working_dir_arg: + working_dir = workspace_copy / working_dir_arg + else: + working_dir = workspace_copy + state.metadata["working_dir"] = str(working_dir) + dep_result = await sb.exec( dep_cmd, - cwd=str(workspace_copy), + cwd=str(working_dir), ) if not dep_result.success: diff --git a/packages/dash_evals/src/dash_evals/runner/tasks/__init__.py b/packages/dash_evals/src/dash_evals/runner/tasks/__init__.py index 4430e07..38fabe8 100644 --- a/packages/dash_evals/src/dash_evals/runner/tasks/__init__.py +++ b/packages/dash_evals/src/dash_evals/runner/tasks/__init__.py @@ -1,6 +1,7 @@ from .analyze_codebase import analyze_codebase from .bug_fix import bug_fix, flutter_bug_fix from .code_gen import code_gen, flutter_code_gen +from .mcp_coding_task import mcp_coding_task from .mcp_tool import mcp_tool from .question_answer import question_answer from .skill_test import skill_test @@ -11,6 +12,7 @@ "code_gen", "flutter_bug_fix", "flutter_code_gen", + "mcp_coding_task", "mcp_tool", "question_answer", "skill_test", diff --git a/packages/dash_evals/src/dash_evals/runner/tasks/mcp_coding_task.py b/packages/dash_evals/src/dash_evals/runner/tasks/mcp_coding_task.py new file mode 100644 index 0000000..807a990 --- /dev/null +++ b/packages/dash_evals/src/dash_evals/runner/tasks/mcp_coding_task.py @@ -0,0 +1,75 @@ +"""Tests the agent's ability to use the Dart MCP server""" + +from inspect_ai import Task, task +from inspect_ai.dataset import Dataset +from inspect_ai.model import ChatMessageSystem +from inspect_ai.scorer import model_graded_fact +from inspect_ai.solver import Generate, Solver, TaskState, solver + +from dash_evals.runner.scorers import export_workspace, mcp_tool_usage +from dash_evals.runner.solvers import setup_workspace + +from .task_helpers import ( + append_context_injection, + append_model_interaction, + build_task_metadata, +) + + +@solver +def _add_working_dir_system_message() -> Solver: + """Adds a dynamic system message with the working directory.""" + + async def solve(state: TaskState, generate: Generate) -> TaskState: + working_dir = state.metadata.get("working_dir", "") + host_workspace = state.metadata.get("host_workspace") + + if host_workspace: + # Container sandbox + current_dir = f"/workspace/{working_dir}" + else: + # Local sandbox + current_dir = working_dir + + message = f""" +You are an expert Dart and Flutter developer. Use all the tools available to +you to accomplish the task and ensure the result is free of errors. + +The current project directory is {current_dir} + +For MCP tools, use the following root path: +file://{current_dir} +""" + state.messages.insert(0, ChatMessageSystem(content=message.strip())) + return state + + return solve + + +@task +def mcp_coding_task(dataset: Dataset, config: dict) -> Task: + """ + Tests the agent's ability to use the Dart MCP server for generic coding tasks. + + Args: + dataset: Inspect dataset loaded from JSONL. + config: Task configuration containing dataset, context, and variant. + """ + solver_chain = [_add_working_dir_system_message()] + + append_context_injection(solver_chain, config) + append_model_interaction(solver_chain, config) + + scorers: list = [model_graded_fact(), mcp_tool_usage()] + if config.get("save_examples"): + scorers.append(export_workspace()) + + return Task( + name=config["task_name"], + dataset=dataset, + setup=[setup_workspace()], + solver=solver_chain, + scorer=scorers, + time_limit=300, + metadata=build_task_metadata(config), + ) diff --git a/packages/dash_evals/src/dash_evals/runner/tasks/mcp_tool.py b/packages/dash_evals/src/dash_evals/runner/tasks/mcp_tool.py index ff04800..2d4bdd3 100644 --- a/packages/dash_evals/src/dash_evals/runner/tasks/mcp_tool.py +++ b/packages/dash_evals/src/dash_evals/runner/tasks/mcp_tool.py @@ -39,7 +39,9 @@ def mcp_tool(dataset: Dataset, config: dict) -> Task: - system_message: custom system prompt (optional) """ required_tools = config.get("required_tools", []) - inject_temp_dir = config.get("inject_temp_dir", False) + # inject_temp_dir can be set via task.yaml metadata.task_parameters + task_params = (config.get("metadata") or {}).get("task_parameters") or {} + inject_temp_dir = config.get("inject_temp_dir", False) or task_params.get("inject_temp_dir", False) # Pre-process samples if temp directory injection is needed active_dataset = dataset diff --git a/packages/dash_evals/src/dash_evals/runner/tasks/skill_test.py b/packages/dash_evals/src/dash_evals/runner/tasks/skill_test.py index b695732..98235aa 100644 --- a/packages/dash_evals/src/dash_evals/runner/tasks/skill_test.py +++ b/packages/dash_evals/src/dash_evals/runner/tasks/skill_test.py @@ -60,7 +60,11 @@ def _build_solver_chain(config: dict, system_message: str) -> list: # Build tools list — skill tool is required for this task type skill_tool = get_skill_tool(config) - tools = [bash(timeout=120)] + tools: list = [] + # bash() requires a real sandbox (Docker/Podman), skip for local runs + sandbox_type = config.get("sandbox_type", "local") + if sandbox_type != "local": + tools.append(bash(timeout=120)) if skill_tool is not None: tools.append(skill_tool) diff --git a/packages/dash_evals/src/dash_evals/runner/tasks/task_helpers.py b/packages/dash_evals/src/dash_evals/runner/tasks/task_helpers.py index 767a06e..b4856e8 100644 --- a/packages/dash_evals/src/dash_evals/runner/tasks/task_helpers.py +++ b/packages/dash_evals/src/dash_evals/runner/tasks/task_helpers.py @@ -118,11 +118,11 @@ def append_model_interaction( if mcp_servers_config: sandbox_type = config.get("sandbox_type", "local") - tools.extend(create_mcp_servers(mcp_servers_config, sandbox_type)) + tools.extend(cast(list[Tool | MCPServer], create_mcp_servers(mcp_servers_config, sandbox_type))) skill_tool = get_skill_tool(config) if skill_tool: - tools.append(skill_tool) + tools.append(cast(Tool | MCPServer, skill_tool)) if extra_tools: tools.extend(extra_tools) diff --git a/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart b/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart index a8d2e33..b79c07b 100644 --- a/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart +++ b/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart @@ -29,17 +29,18 @@ class YamlParser extends Parser { final taskConfigs = []; - final taskDirs = tasksDir.listSync().whereType().toList() - ..sort((a, b) => a.path.compareTo(b.path)); + // Recursive search for task.yaml files + final taskFiles = tasksDir + .listSync(recursive: true) + .whereType() + .where((f) => p.basename(f.path) == 'task.yaml') + .toList(); - for (final taskDir in taskDirs) { - final taskFile = File(p.join(taskDir.path, 'task.yaml')); - if (taskFile.existsSync()) { - taskConfigs.addAll(_loadTaskFile(taskFile.path, datasetRoot)); - } + for (final taskFile in taskFiles) { + taskConfigs.addAll(_loadTaskFile(taskFile.path, datasetRoot)); } - return taskConfigs; + return taskConfigs..sort((a, b) => a.id.compareTo(b.id)); } /// Load a single task.yaml file into a [ParsedTask]. @@ -75,8 +76,9 @@ class YamlParser extends Parser { } else { final datasetMap = Map.from(datasetRaw); final formatKeys = {'samples', 'json', 'csv'}; - final presentKeys = - formatKeys.intersection(datasetMap.keys.toSet().cast()); + final presentKeys = formatKeys.intersection( + datasetMap.keys.toSet().cast(), + ); if (presentKeys.length > 1) { throw FormatException( "Task '$taskId': 'dataset' must have exactly one of " @@ -142,7 +144,11 @@ class YamlParser extends Parser { final earlyStopping = taskArgs['early_stopping']; final displayName = data['display_name'] as String?; final version = data['version']; - final taskMetadata = _asMap(data['metadata']); + final taskMetadata = { + if (data.containsKey('workspace')) 'workspace': data['workspace'], + if (data.containsKey('working_dir')) 'working_dir': data['working_dir'], + ...?_asMap(data['metadata']), + }; final sandboxParameters = _asMap(data['sandbox_parameters']); return [ @@ -353,7 +359,7 @@ class YamlParser extends Parser { final data = readYamlFileAsMap(jobPath); - final logsDir = (data['logs_dir'] as String?) ?? _kDefaultLogsDir; + final logsDir = (data['log_dir'] as String?) ?? _kDefaultLogsDir; final maxConnections = (data['max_connections'] as int?) ?? 10; // Resolve log directory with timestamp @@ -414,6 +420,12 @@ class YamlParser extends Parser { } final models = modelsRaw.cast(); + final inspectEvalArgs = + _asMap(data['inspect_eval_arguments']) ?? {}; + if (data.containsKey('working_limit')) { + inspectEvalArgs['working_limit'] = data['working_limit']; + } + return Job( logDir: logDir, maxConnections: maxConnections, @@ -426,9 +438,9 @@ class YamlParser extends Parser { sampleFilters: sampleFilters, saveExamples: data['save_examples'] == true, // Sandbox configuration - sandbox: _parseSandbox(data['sandbox']), + sandbox: _parseSandbox(data['sandbox'] ?? data['sandbox_type']), // All inspect eval arguments - inspectEvalArguments: _asMap(data['inspect_eval_arguments']), + inspectEvalArguments: inspectEvalArgs, ); } @@ -472,9 +484,6 @@ class YamlParser extends Parser { return null; } - - - // ------------------------------------------------------------------ // Log dir helpers // ------------------------------------------------------------------ @@ -522,42 +531,39 @@ class YamlParser extends Parser { /// /// Throws [FileSystemException] if the job file is not found. String findJobFile(String datasetRoot, String job) { - // Check if it's a path (contains / or ends with .yaml) + final jobsDir = Directory(p.join(datasetRoot, 'jobs')); + + // 1. Try relative to jobs/ directory + if (jobsDir.existsSync()) { + // Try literally (e.g. "skills/skill.yaml") + final path1 = p.join(jobsDir.path, job); + if (File(path1).existsSync()) return p.normalize(path1); + + // Try with .yaml extension (e.g. "skills/skill" -> "skills/skill.yaml") + final path2 = '$path1.yaml'; + if (File(path2).existsSync()) return p.normalize(path2); + } + + // 2. Try as absolute or relative to dataset root + // (only if it contains a slash or ends in .yaml to avoid ambiguous discovery) if (job.contains('/') || job.endsWith('.yaml')) { final jobPath = p.isAbsolute(job) ? job : p.join(datasetRoot, job); - if (!File(jobPath).existsSync()) { - throw FileSystemException('Job file not found', jobPath); - } - return p.normalize(jobPath); + if (File(jobPath).existsSync()) return p.normalize(jobPath); } - // Look in jobs/ directory - final jobsDir = Directory(p.join(datasetRoot, 'jobs')); - if (!jobsDir.existsSync()) { - throw FileSystemException( - 'Jobs directory not found. ' - 'Create it or specify a full path to the job file.', - jobsDir.path, - ); + // List available jobs for helpful error message (top-level only for now) + var available = []; + if (jobsDir.existsSync()) { + available = jobsDir + .listSync() + .whereType() + .where((f) => f.path.endsWith('.yaml')) + .map((f) => p.basenameWithoutExtension(f.path)) + .toList(); } - // Try with .yaml extension - final withExt = File(p.join(jobsDir.path, '$job.yaml')); - if (withExt.existsSync()) return p.normalize(withExt.path); - - // Try without extension (maybe they included it) - final withoutExt = File(p.join(jobsDir.path, job)); - if (withoutExt.existsSync()) return p.normalize(withoutExt.path); - - // List available jobs for helpful error message - final available = jobsDir - .listSync() - .whereType() - .where((f) => f.path.endsWith('.yaml')) - .map((f) => p.basenameWithoutExtension(f.path)) - .toList(); throw FileSystemException( - "Job '$job' not found in ${jobsDir.path}. " - 'Available jobs: ${available.isEmpty ? '(none)' : available}', + "Job '$job' not found. Checked 'jobs/' and dataset root. " + 'Available top-level jobs: ${available.isEmpty ? '(none)' : available}', ); } diff --git a/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart b/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart index ec9b36c..887a18e 100644 --- a/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart +++ b/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart @@ -455,7 +455,12 @@ class EvalSetResolver { // Create one ParsedTask per effective variant for (final entry in effectiveVariants.entries) { - final variant = _resolveVariant(entry.key, entry.value, datasetRoot); + final variant = _resolveVariant( + entry.key, + entry.value, + datasetRoot, + taskId, + ); // Compute examples_dir from job log_dir String? examplesDir; @@ -489,14 +494,15 @@ class EvalSetResolver { String name, Map vDef, String datasetRoot, + String taskId, ) { if (vDef.isEmpty) return Variant(name: name); // Load context files (with glob support) final files = []; - final cfPaths = - (vDef['files'] as List?)?.cast() ?? const []; - for (final cfPath in cfPaths) { + final cfPaths = (vDef['files'] as List?)?.cast() ?? const []; + for (var cfPath in cfPaths) { + cfPath = cfPath.replaceAll('{task_id}', taskId); if (_isGlob(cfPath)) { final matched = _expandGlobFiles(datasetRoot, cfPath); if (matched.isEmpty) { @@ -515,9 +521,9 @@ class EvalSetResolver { // Resolve skill paths (with glob support) final skills = []; - final rawSkills = - (vDef['skills'] as List?)?.cast() ?? const []; - for (final skillPathStr in rawSkills) { + final rawSkills = (vDef['skills'] as List?)?.cast() ?? const []; + for (var skillPathStr in rawSkills) { + skillPathStr = skillPathStr.replaceAll('{task_id}', taskId); if (_isGlob(skillPathStr)) { final matched = _expandGlobDirs(datasetRoot, skillPathStr); final validDirs = matched diff --git a/packages/dataset_config_python/src/dataset_config_python/parser.py b/packages/dataset_config_python/src/dataset_config_python/parser.py index 218b840..9c8c2ca 100644 --- a/packages/dataset_config_python/src/dataset_config_python/parser.py +++ b/packages/dataset_config_python/src/dataset_config_python/parser.py @@ -140,7 +140,9 @@ def copy_with( system_message=self.system_message if system_message is _U else system_message, save_examples=self.save_examples if save_examples is _U else save_examples, # type: ignore[arg-type] examples_dir=self.examples_dir if examples_dir is _U else examples_dir, - sandbox_parameters=self.sandbox_parameters if sandbox_parameters is _U else sandbox_parameters, + sandbox_parameters=self.sandbox_parameters + if sandbox_parameters is _U + else sandbox_parameters, task_files=self.task_files if task_files is _U else task_files, task_setup=self.task_setup if task_setup is _U else task_setup, model=self.model if model is _U else model, @@ -192,15 +194,11 @@ def _read_yaml_file(path: str) -> dict[str, Any]: return data -def _resolve_log_dir(logs_dir: str, base_dir: str) -> str: +def _resolve_log_dir(log_dir: str, base_dir: str) -> str: """Resolve log directory with a timestamp subfolder.""" now = datetime.now(timezone.utc) timestamp = now.strftime("%Y-%m-%d_%H-%M-%S") - return os.path.normpath(os.path.join(base_dir, logs_dir, timestamp)) - - - - + return os.path.normpath(os.path.join(base_dir, log_dir, timestamp)) # --------------------------------------------------------------------------- @@ -209,20 +207,20 @@ def _resolve_log_dir(logs_dir: str, base_dir: str) -> str: def parse_tasks(dataset_root: str) -> list[ParsedTask]: - """Parse all task.yaml files from tasks/ subdirectories.""" + """Parse all task.yaml files from tasks/ subdirectories (recursive).""" tasks_dir = os.path.join(dataset_root, "tasks") if not os.path.isdir(tasks_dir): return [] parsed = [] - for entry in sorted(os.listdir(tasks_dir)): - task_dir = os.path.join(tasks_dir, entry) - if not os.path.isdir(task_dir): - continue - task_file = os.path.join(task_dir, "task.yaml") - if os.path.isfile(task_file): + # Recursive search for task.yaml files + for root, _, files in os.walk(tasks_dir): + if "task.yaml" in files: + task_file = os.path.join(root, "task.yaml") parsed.extend(_load_task_file(task_file, dataset_root)) + # Stable order for evaluation runs + parsed.sort(key=lambda t: t.id) return parsed @@ -263,7 +261,7 @@ def _load_task_file(task_path: str, dataset_root: str) -> list[ParsedTask]: ) # Check for mutually exclusive format keys - format_keys = {'samples', 'json', 'csv'} + format_keys = {"samples", "json", "csv"} present_keys = format_keys & set(dataset_raw.keys()) if len(present_keys) > 1: raise ValueError( @@ -294,6 +292,12 @@ def _load_task_file(task_path: str, dataset_root: str) -> list[ParsedTask]: dataset_format = "csv" dataset_source = str(dataset_raw["csv"]) + # Task-level metadata: collect extra top-level fields for parity + metadata = dict(data.get("metadata") or {}) + for field in ("workspace", "working_dir"): + if field in data and field not in metadata: + metadata[field] = data[field] + # Task-level Inspect AI args are nested under inspect_task_args task_args = data.get("inspect_task_args") or {} @@ -306,7 +310,9 @@ def _load_task_file(task_path: str, dataset_root: str) -> list[ParsedTask]: system_message=system_message, model=task_args.get("model"), config=task_args.get("config") if isinstance(task_args.get("config"), dict) else None, - model_roles=task_args.get("model_roles") if isinstance(task_args.get("model_roles"), dict) else None, + model_roles=task_args.get("model_roles") + if isinstance(task_args.get("model_roles"), dict) + else None, sandbox=task_args.get("sandbox"), approval=task_args.get("approval"), epochs=task_args.get("epochs"), @@ -316,12 +322,16 @@ def _load_task_file(task_path: str, dataset_root: str) -> list[ParsedTask]: token_limit=task_args.get("token_limit"), time_limit=task_args.get("time_limit"), working_limit=task_args.get("working_limit"), - cost_limit=float(task_args["cost_limit"]) if task_args.get("cost_limit") is not None else None, + cost_limit=float(task_args["cost_limit"]) + if task_args.get("cost_limit") is not None + else None, early_stopping=task_args.get("early_stopping"), display_name=data.get("display_name"), version=data.get("version"), - metadata=data.get("metadata") if isinstance(data.get("metadata"), dict) else None, - sandbox_parameters=data.get("sandbox_parameters") if isinstance(data.get("sandbox_parameters"), dict) else None, + metadata=metadata if isinstance(metadata, dict) else None, + sandbox_parameters=data.get("sandbox_parameters") + if isinstance(data.get("sandbox_parameters"), dict) + else None, task_files=task_files, task_setup=task_setup, dataset_format=dataset_format, @@ -392,9 +402,7 @@ def _load_samples_from_files( continue data = yaml.safe_load(doc) if isinstance(data, dict): - samples.append( - _resolve_sample(data, sample_dir, dataset_root, task_files) - ) + samples.append(_resolve_sample(data, sample_dir, dataset_root, task_files)) return samples @@ -408,9 +416,7 @@ def _resolve_sample( """Resolve a single sample dict into a Sample.""" for field in ("id", "input", "target"): if field not in doc: - raise ValueError( - f"Sample '{doc.get('id', 'unknown')}' missing required field: {field}" - ) + raise ValueError(f"Sample '{doc.get('id', 'unknown')}' missing required field: {field}") # Read metadata fields from the metadata dict meta_raw: dict[str, Any] = doc.get("metadata") or {} @@ -465,11 +471,21 @@ def parse_job(job_path: str, dataset_root: str) -> Job: data = _read_yaml_file(job_path) - logs_dir = data.get("logs_dir") or _DEFAULT_LOGS_DIR - log_dir = _resolve_log_dir(logs_dir, dataset_root) + log_dir = data.get("log_dir") or _DEFAULT_LOGS_DIR + log_dir = _resolve_log_dir(log_dir, dataset_root) - # Parse sandbox config - sandbox_raw = data.get("sandbox") + # Parse inspect_eval_arguments and swallow top-level parity fields + inspect_eval_arguments = data.get("inspect_eval_arguments") + if isinstance(inspect_eval_arguments, dict): + inspect_eval_arguments = dict(inspect_eval_arguments) + else: + inspect_eval_arguments = {} + + if "working_limit" in data: + inspect_eval_arguments["working_limit"] = data["working_limit"] + + # Parse sandbox config with alias support + sandbox_raw = data.get("sandbox") or data.get("sandbox_type") sandbox = None if isinstance(sandbox_raw, dict): sandbox = sandbox_raw @@ -508,19 +524,12 @@ def parse_job(job_path: str, dataset_root: str) -> Job: variant_file = os.path.normpath(os.path.join(job_dir, str(rel_path))) if not os.path.isfile(variant_file): raise FileNotFoundError( - f"Variant file not found: {variant_file} " - f"(referenced from {job_path})" + f"Variant file not found: {variant_file} (referenced from {job_path})" ) file_data = _read_yaml_file(variant_file) for vname, vdef in file_data.items(): variants[str(vname)] = dict(vdef) if isinstance(vdef, dict) else {} - # Parse inspect_eval_arguments - inspect_eval_arguments = data.get("inspect_eval_arguments") - if isinstance(inspect_eval_arguments, dict): - inspect_eval_arguments = dict(inspect_eval_arguments) - else: - inspect_eval_arguments = None # Parse models (required) models_raw = data.get("models") @@ -562,8 +571,7 @@ def find_job_file(dataset_root: str, job: str) -> str: jobs_dir = os.path.join(dataset_root, "jobs") if not os.path.isdir(jobs_dir): raise FileNotFoundError( - "Jobs directory not found. " - "Create it or specify a full path to the job file." + "Jobs directory not found. Create it or specify a full path to the job file." ) with_ext = os.path.join(jobs_dir, f"{job}.yaml") @@ -575,11 +583,8 @@ def find_job_file(dataset_root: str, job: str) -> str: return os.path.normpath(without_ext) available = [ - os.path.splitext(f)[0] - for f in sorted(os.listdir(jobs_dir)) - if f.endswith(".yaml") + os.path.splitext(f)[0] for f in sorted(os.listdir(jobs_dir)) if f.endswith(".yaml") ] raise FileNotFoundError( - f"Job '{job}' not found in {jobs_dir}. " - f"Available jobs: {available or '(none)'}" + f"Job '{job}' not found in {jobs_dir}. Available jobs: {available or '(none)'}" ) diff --git a/packages/dataset_config_python/src/dataset_config_python/resolver.py b/packages/dataset_config_python/src/dataset_config_python/resolver.py index 21469c3..1160782 100644 --- a/packages/dataset_config_python/src/dataset_config_python/resolver.py +++ b/packages/dataset_config_python/src/dataset_config_python/resolver.py @@ -18,7 +18,6 @@ from dataset_config_python.models.variant import Variant from dataset_config_python.parser import ParsedTask, find_job_file, parse_job, parse_tasks - # Default sandbox configurations for Flutter evaluations. # Consumers can pass these to resolve() or provide their own. DEFAULT_SANDBOX_REGISTRY: dict[str, dict[str, str]] = { @@ -365,8 +364,6 @@ def _get(key: str, default: Any = None) -> Any: ) - - # --------------------------------------------------------------------------- # Sandbox resolution # --------------------------------------------------------------------------- @@ -462,7 +459,7 @@ def _expand_task_configs( # Create one ParsedTask per effective variant for vname, vdef in effective_variants.items(): - variant = _resolve_variant(vname, vdef, dataset_root) + variant = _resolve_variant(vname, vdef, dataset_root, task_id) examples_dir = None if job.save_examples: @@ -492,6 +489,7 @@ def _resolve_variant( name: str, vdef: dict[str, Any], dataset_root: str, + task_id: str, ) -> Variant: """Resolve a variant dict into a fully-resolved Variant.""" if not vdef: @@ -501,6 +499,7 @@ def _resolve_variant( context_files: list[ContextFile] = [] cf_paths: list[str] = vdef.get("files") or [] for cf_path in cf_paths: + cf_path = cf_path.replace("{task_id}", task_id) if _is_glob(cf_path): full_pattern = os.path.join(dataset_root, cf_path) matched = sorted( @@ -521,6 +520,7 @@ def _resolve_variant( skill_paths: list[str] = [] raw_skills: list[str] = vdef.get("skills") or [] for skill_path_str in raw_skills: + skill_path_str = skill_path_str.replace("{task_id}", task_id) if _is_glob(skill_path_str): full_pattern = os.path.join(dataset_root, skill_path_str) matched_dirs = sorted( diff --git a/packages/dataset_config_python/tests/test_config.py b/packages/dataset_config_python/tests/test_config.py index 865b7bd..f2e911f 100644 --- a/packages/dataset_config_python/tests/test_config.py +++ b/packages/dataset_config_python/tests/test_config.py @@ -77,7 +77,7 @@ def dataset_dir(tmp_path): jobs_dir.mkdir() job_yaml = jobs_dir / "local_dev.yaml" job_yaml.write_text( - """logs_dir: ./logs + """log_dir: ./logs max_connections: 5 models: - google/gemini-2.5-flash @@ -129,7 +129,7 @@ def dataset_dir_with_sample_files(tmp_path): jobs_dir.mkdir() (jobs_dir / "default.yaml").write_text( """ -logs_dir: ./logs +log_dir: ./logs models: - test/model """ @@ -315,7 +315,7 @@ def test_parse_job_missing_models(self, tmp_path): jobs_dir.mkdir() (jobs_dir / "bad.yaml").write_text( """\ -logs_dir: ./logs +log_dir: ./logs """ ) job_path = str(jobs_dir / "bad.yaml") diff --git a/packages/devals_cli/example/evals/jobs/local_dev.yaml b/packages/devals_cli/example/evals/jobs/local_dev.yaml index 06c9eb8..4354154 100644 --- a/packages/devals_cli/example/evals/jobs/local_dev.yaml +++ b/packages/devals_cli/example/evals/jobs/local_dev.yaml @@ -18,7 +18,7 @@ # Directory for evaluation logs (relative to dataset root) # A timestamped subdirectory is created automatically for each run. -# logs_dir: ../logs +# log_dir: ../logs # Sandbox environment: "local", "docker", or "podman" # - local: Run directly on host (fastest, no isolation) diff --git a/packages/devals_cli/lib/src/commands/doctor_command.dart b/packages/devals_cli/lib/src/commands/doctor_command.dart index c597388..7d60b85 100644 --- a/packages/devals_cli/lib/src/commands/doctor_command.dart +++ b/packages/devals_cli/lib/src/commands/doctor_command.dart @@ -3,6 +3,7 @@ import 'dart:io'; import 'package:args/command_runner.dart'; import 'package:devals/src/utils/env.dart'; import 'package:devals/src/utils/expand_home_dir.dart'; +import 'package:devals/src/utils/process_utils.dart'; import 'package:howdy/howdy.dart'; /// The result status of a single doctor check. @@ -51,7 +52,7 @@ typedef ProcessRunner = /// for the CLI, dash_evals, and eval_explorer. class DoctorCommand extends Command { DoctorCommand({ProcessRunner? processRunner}) - : _runProcess = processRunner ?? Process.run; + : _runProcess = processRunner ?? runVenvProcess; final ProcessRunner _runProcess; diff --git a/packages/devals_cli/lib/src/commands/run_command.dart b/packages/devals_cli/lib/src/commands/run_command.dart index a9fde02..69fef36 100644 --- a/packages/devals_cli/lib/src/commands/run_command.dart +++ b/packages/devals_cli/lib/src/commands/run_command.dart @@ -4,6 +4,7 @@ import 'package:args/command_runner.dart'; import 'package:dataset_config_dart/dataset_config_dart.dart'; import 'package:devals/src/dataset/dry_run.dart'; import 'package:devals/src/dataset/filesystem_utils.dart'; +import 'package:devals/src/utils/process_utils.dart'; import 'package:howdy/howdy.dart'; import 'package:path/path.dart' as p; @@ -71,7 +72,7 @@ class RunCommand extends Command { // Use inheritStdio to preserve inspect-ai's interactive terminal display try { - final process = await Process.start( + final process = await startVenvProcess( 'run-evals', ['--json', evalSetPath], mode: ProcessStartMode.inheritStdio, diff --git a/packages/devals_cli/lib/src/dataset/dataset_reader.dart b/packages/devals_cli/lib/src/dataset/dataset_reader.dart index 9401996..26f3889 100644 --- a/packages/devals_cli/lib/src/dataset/dataset_reader.dart +++ b/packages/devals_cli/lib/src/dataset/dataset_reader.dart @@ -39,15 +39,19 @@ class DatasetReader { } final taskNames = []; - for (final entity in tasksDir.listSync()) { - if (entity is Directory) { - final taskFile = File(p.join(entity.path, 'task.yaml')); - if (taskFile.existsSync()) { - taskNames.add(p.basename(entity.path)); - } - } + // Recursive search for task.yaml files + final taskFiles = tasksDir + .listSync(recursive: true) + .whereType() + .where((f) => p.basename(f.path) == 'task.yaml') + .toList() + ..sort((a, b) => a.path.compareTo(b.path)); + + for (final taskFile in taskFiles) { + // The task name is the parent directory name of task.yaml + taskNames.add(p.basename(taskFile.parent.path)); } - taskNames.sort(); + return taskNames; } diff --git a/packages/devals_cli/lib/src/dataset/file_templates/init_templates/init_job_template.dart b/packages/devals_cli/lib/src/dataset/file_templates/init_templates/init_job_template.dart index e4fbb72..fe7f749 100644 --- a/packages/devals_cli/lib/src/dataset/file_templates/init_templates/init_job_template.dart +++ b/packages/devals_cli/lib/src/dataset/file_templates/init_templates/init_job_template.dart @@ -28,7 +28,7 @@ String initJobTemplate({ # Directory for evaluation logs (relative to dataset root) # A timestamped subdirectory is created automatically for each run. -# logs_dir: ../logs +# log_dir: ../logs # Sandbox environment: "local", "docker", or "podman" # - local: Run directly on host (fastest, no isolation) diff --git a/packages/devals_cli/lib/src/dataset/file_templates/job_template.dart b/packages/devals_cli/lib/src/dataset/file_templates/job_template.dart index 32b3ed6..728c6a4 100644 --- a/packages/devals_cli/lib/src/dataset/file_templates/job_template.dart +++ b/packages/devals_cli/lib/src/dataset/file_templates/job_template.dart @@ -45,7 +45,7 @@ String jobTemplate({ # Directory for evaluation logs (relative to dataset root) # A timestamped subdirectory is created automatically for each run. -# logs_dir: ../logs +# log_dir: ../logs # Sandbox environment: "local", "docker", or "podman" # - local: Run directly on host (fastest, no isolation) diff --git a/packages/devals_cli/lib/src/utils/process_utils.dart b/packages/devals_cli/lib/src/utils/process_utils.dart new file mode 100644 index 0000000..9918090 --- /dev/null +++ b/packages/devals_cli/lib/src/utils/process_utils.dart @@ -0,0 +1,87 @@ +import 'dart:io'; +import 'package:path/path.dart' as p; + +/// Finds the local .venv/bin directory relative to the project root. +String? findLocalVenvBin() { + try { + var dir = Directory.current.absolute; + // Walk up to find devals.yaml and .venv + for (var i = 0; i < 10; i++) { + if (File(p.join(dir.path, 'devals.yaml')).existsSync() && + Directory(p.join(dir.path, '.venv')).existsSync()) { + final venvSubdir = Platform.isWindows ? 'Scripts' : 'bin'; + return p.join(dir.path, '.venv', venvSubdir); + } + final parent = dir.parent; + if (parent.path == dir.path) break; + dir = parent; + } + } catch (_) {} + return null; +} + +/// Runs a process, accounting for a local .venv if present. +Future runVenvProcess( + String executable, + List arguments, { + String? workingDirectory, + Map? environment, +}) async { + final venvBin = findLocalVenvBin(); + final env = Map.from(environment ?? Platform.environment); + + String resolvedExecutable = executable; + if (venvBin != null) { + final venvExe = p.join(venvBin, executable); + if (File(venvExe).existsSync()) { + resolvedExecutable = venvExe; + } + + // Also update PATH to ensure sub-processes find other tools in the venv + final pathKey = Platform.isWindows ? 'Path' : 'PATH'; + final separator = Platform.isWindows ? ';' : ':'; + final currentPath = env[pathKey] ?? ''; + env[pathKey] = '$venvBin$separator$currentPath'; + } + + return Process.run( + resolvedExecutable, + arguments, + workingDirectory: workingDirectory, + environment: env, + ); +} + +/// Starts a process, accounting for a local .venv if present. +Future startVenvProcess( + String executable, + List arguments, { + String? workingDirectory, + Map? environment, + ProcessStartMode mode = ProcessStartMode.normal, +}) async { + final venvBin = findLocalVenvBin(); + final env = Map.from(environment ?? Platform.environment); + + String resolvedExecutable = executable; + if (venvBin != null) { + final venvExe = p.join(venvBin, executable); + if (File(venvExe).existsSync()) { + resolvedExecutable = venvExe; + } + + // Also update PATH to ensure sub-processes find other tools in the venv + final pathKey = Platform.isWindows ? 'Path' : 'PATH'; + final separator = Platform.isWindows ? ';' : ':'; + final currentPath = env[pathKey] ?? ''; + env[pathKey] = '$venvBin$separator$currentPath'; + } + + return Process.start( + resolvedExecutable, + arguments, + workingDirectory: workingDirectory, + environment: env, + mode: mode, + ); +} diff --git a/tool/config_parity/fixtures/basic/jobs/local_dev.yaml b/tool/config_parity/fixtures/basic/jobs/local_dev.yaml index 0c5beca..959c248 100644 --- a/tool/config_parity/fixtures/basic/jobs/local_dev.yaml +++ b/tool/config_parity/fixtures/basic/jobs/local_dev.yaml @@ -1,4 +1,4 @@ -logs_dir: ./logs +log_dir: ./logs sandbox_type: local max_connections: 5 models: diff --git a/tool/config_parity/fixtures/multi_variant/jobs/dev.yaml b/tool/config_parity/fixtures/multi_variant/jobs/dev.yaml index 5ec75d4..d31b380 100644 --- a/tool/config_parity/fixtures/multi_variant/jobs/dev.yaml +++ b/tool/config_parity/fixtures/multi_variant/jobs/dev.yaml @@ -1,4 +1,4 @@ -logs_dir: ./logs +log_dir: ./logs sandbox_type: local models: - google/gemini-2.5-flash