Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages/dash_evals/pyrefly.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Pyrefly configuration
# Tell Pyrefly to use the repo-root venv Python interpreter

python-interpreter = "../../.venv/bin/python"
python-interpreter = "../../../dash_evals/.venv/bin/python"
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,17 @@ async def _setup_local_workspace(state: TaskState) -> TaskState:
# absolute paths that are invalid after copying to a new location.
dep_cmd = state.metadata.get("dep_install_cmd", ["flutter", "pub", "get"])
sb = sandbox()

working_dir_arg = state.metadata.get("working_dir")
if working_dir_arg:
working_dir = workspace_copy / working_dir_arg
else:
working_dir = workspace_copy
state.metadata["working_dir"] = str(working_dir)

dep_result = await sb.exec(
dep_cmd,
cwd=str(workspace_copy),
cwd=str(working_dir),
)

if not dep_result.success:
Expand Down
2 changes: 2 additions & 0 deletions packages/dash_evals/src/dash_evals/runner/tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .analyze_codebase import analyze_codebase
from .bug_fix import bug_fix, flutter_bug_fix
from .code_gen import code_gen, flutter_code_gen
from .mcp_coding_task import mcp_coding_task
from .mcp_tool import mcp_tool
from .question_answer import question_answer
from .skill_test import skill_test
Expand All @@ -11,6 +12,7 @@
"code_gen",
"flutter_bug_fix",
"flutter_code_gen",
"mcp_coding_task",
"mcp_tool",
"question_answer",
"skill_test",
Expand Down
75 changes: 75 additions & 0 deletions packages/dash_evals/src/dash_evals/runner/tasks/mcp_coding_task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Tests the agent's ability to use the Dart MCP server"""

from inspect_ai import Task, task
from inspect_ai.dataset import Dataset
from inspect_ai.model import ChatMessageSystem
from inspect_ai.scorer import model_graded_fact
from inspect_ai.solver import Generate, Solver, TaskState, solver

from dash_evals.runner.scorers import export_workspace, mcp_tool_usage
from dash_evals.runner.solvers import setup_workspace

from .task_helpers import (
append_context_injection,
append_model_interaction,
build_task_metadata,
)


@solver
def _add_working_dir_system_message() -> Solver:
    """Create a solver that prepends a system message naming the project directory.

    The directory is derived from the sample metadata when the solver runs:

    * ``host_workspace`` is set (container sandbox): ``working_dir`` is
      appended to ``/workspace/``.
    * otherwise (local sandbox): ``working_dir`` is used unchanged.

    A ``working_dir`` that is absent or explicitly ``None`` is treated as an
    empty string so the literal text ``None`` never ends up in the prompt.

    Returns:
        A solver that inserts the system message at index 0 of
        ``state.messages`` and returns the state; it does not call
        ``generate``.
    """

    async def solve(state: TaskState, generate: Generate) -> TaskState:
        # A default passed to ``dict.get`` only applies when the key is
        # missing; a key that is present with a null value would still yield
        # None and be formatted as "None" below, so normalise it explicitly.
        working_dir = state.metadata.get("working_dir")
        if working_dir is None:
            working_dir = ""
        host_workspace = state.metadata.get("host_workspace")

        if host_workspace:
            # Container sandbox
            current_dir = f"/workspace/{working_dir}"
        else:
            # Local sandbox
            current_dir = working_dir

        # The literal's lines start at column 0 on purpose: only the outer
        # whitespace is stripped, so interior indentation would reach the model.
        message = f"""
You are an expert Dart and Flutter developer. Use all the tools available to
you to accomplish the task and ensure the result is free of errors.

The current project directory is {current_dir}

For MCP tools, use the following root path:
file://{current_dir}
"""
        state.messages.insert(0, ChatMessageSystem(content=message.strip()))
        return state

    return solve


@task
def mcp_coding_task(dataset: Dataset, config: dict) -> Task:
    """Build the task that evaluates generic coding work done via the Dart MCP server.

    Args:
        dataset: Inspect dataset loaded from JSONL.
        config: Task configuration containing dataset, context, and variant.
            ``config["task_name"]` is required and becomes the task name;
            a truthy ``config.get("save_examples")`` adds the workspace
            export scorer.

    Returns:
        A ``Task`` with workspace setup, the assembled solver chain, the
        selected scorers, and a 300 second time limit.
    """
    # The chain starts with the working-directory system message; the two
    # helpers are called for their effect on the list (return values unused).
    steps = [_add_working_dir_system_message()]
    append_context_injection(steps, config)
    append_model_interaction(steps, config)

    graders: list = [model_graded_fact(), mcp_tool_usage()]
    if config.get("save_examples"):
        graders += [export_workspace()]

    return Task(
        name=config["task_name"],
        dataset=dataset,
        setup=[setup_workspace()],
        solver=steps,
        scorer=graders,
        time_limit=300,
        metadata=build_task_metadata(config),
    )
4 changes: 3 additions & 1 deletion packages/dash_evals/src/dash_evals/runner/tasks/mcp_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ def mcp_tool(dataset: Dataset, config: dict) -> Task:
- system_message: custom system prompt (optional)
"""
required_tools = config.get("required_tools", [])
inject_temp_dir = config.get("inject_temp_dir", False)
# inject_temp_dir can be set via task.yaml metadata.task_parameters
task_params = (config.get("metadata") or {}).get("task_parameters") or {}
inject_temp_dir = config.get("inject_temp_dir", False) or task_params.get("inject_temp_dir", False)

# Pre-process samples if temp directory injection is needed
active_dataset = dataset
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@ def _build_solver_chain(config: dict, system_message: str) -> list:
# Build tools list — skill tool is required for this task type
skill_tool = get_skill_tool(config)

tools = [bash(timeout=120)]
tools: list = []
# bash() requires a real sandbox (Docker/Podman), skip for local runs
sandbox_type = config.get("sandbox_type", "local")
if sandbox_type != "local":
tools.append(bash(timeout=120))
if skill_tool is not None:
tools.append(skill_tool)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,11 @@ def append_model_interaction(

if mcp_servers_config:
sandbox_type = config.get("sandbox_type", "local")
tools.extend(create_mcp_servers(mcp_servers_config, sandbox_type))
tools.extend(cast(list[Tool | MCPServer], create_mcp_servers(mcp_servers_config, sandbox_type)))

skill_tool = get_skill_tool(config)
if skill_tool:
tools.append(skill_tool)
tools.append(cast(Tool | MCPServer, skill_tool))

if extra_tools:
tools.extend(extra_tools)
Expand Down
100 changes: 53 additions & 47 deletions packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,18 @@ class YamlParser extends Parser {

final taskConfigs = <ParsedTask>[];

final taskDirs = tasksDir.listSync().whereType<Directory>().toList()
..sort((a, b) => a.path.compareTo(b.path));
// Recursive search for task.yaml files
final taskFiles = tasksDir
.listSync(recursive: true)
.whereType<File>()
.where((f) => p.basename(f.path) == 'task.yaml')
.toList();

for (final taskDir in taskDirs) {
final taskFile = File(p.join(taskDir.path, 'task.yaml'));
if (taskFile.existsSync()) {
taskConfigs.addAll(_loadTaskFile(taskFile.path, datasetRoot));
}
for (final taskFile in taskFiles) {
taskConfigs.addAll(_loadTaskFile(taskFile.path, datasetRoot));
}

return taskConfigs;
return taskConfigs..sort((a, b) => a.id.compareTo(b.id));
}

/// Load a single task.yaml file into a [ParsedTask].
Expand Down Expand Up @@ -75,8 +76,9 @@ class YamlParser extends Parser {
} else {
final datasetMap = Map<String, dynamic>.from(datasetRaw);
final formatKeys = {'samples', 'json', 'csv'};
final presentKeys =
formatKeys.intersection(datasetMap.keys.toSet().cast<String>());
final presentKeys = formatKeys.intersection(
datasetMap.keys.toSet().cast<String>(),
);
if (presentKeys.length > 1) {
throw FormatException(
"Task '$taskId': 'dataset' must have exactly one of "
Expand Down Expand Up @@ -142,7 +144,11 @@ class YamlParser extends Parser {
final earlyStopping = taskArgs['early_stopping'];
final displayName = data['display_name'] as String?;
final version = data['version'];
final taskMetadata = _asMap(data['metadata']);
final taskMetadata = <String, dynamic>{
if (data.containsKey('workspace')) 'workspace': data['workspace'],
if (data.containsKey('working_dir')) 'working_dir': data['working_dir'],
...?_asMap(data['metadata']),
};
Comment thread
ericwindmill marked this conversation as resolved.
final sandboxParameters = _asMap(data['sandbox_parameters']);

return [
Expand Down Expand Up @@ -353,7 +359,7 @@ class YamlParser extends Parser {

final data = readYamlFileAsMap(jobPath);

final logsDir = (data['logs_dir'] as String?) ?? _kDefaultLogsDir;
final logsDir = (data['log_dir'] as String?) ?? _kDefaultLogsDir;
final maxConnections = (data['max_connections'] as int?) ?? 10;

// Resolve log directory with timestamp
Expand Down Expand Up @@ -414,6 +420,12 @@ class YamlParser extends Parser {
}
final models = modelsRaw.cast<String>();

final inspectEvalArgs =
_asMap(data['inspect_eval_arguments']) ?? <String, dynamic>{};
if (data.containsKey('working_limit')) {
inspectEvalArgs['working_limit'] = data['working_limit'];
}

return Job(
logDir: logDir,
maxConnections: maxConnections,
Expand All @@ -426,9 +438,9 @@ class YamlParser extends Parser {
sampleFilters: sampleFilters,
saveExamples: data['save_examples'] == true,
// Sandbox configuration
sandbox: _parseSandbox(data['sandbox']),
sandbox: _parseSandbox(data['sandbox'] ?? data['sandbox_type']),
// All inspect eval arguments
inspectEvalArguments: _asMap(data['inspect_eval_arguments']),
inspectEvalArguments: inspectEvalArgs,
);
}

Expand Down Expand Up @@ -472,9 +484,6 @@ class YamlParser extends Parser {
return null;
}




// ------------------------------------------------------------------
// Log dir helpers
// ------------------------------------------------------------------
Expand Down Expand Up @@ -522,42 +531,39 @@ class YamlParser extends Parser {
///
/// Throws [FileSystemException] if the job file is not found.
String findJobFile(String datasetRoot, String job) {
// Check if it's a path (contains / or ends with .yaml)
final jobsDir = Directory(p.join(datasetRoot, 'jobs'));

// 1. Try relative to jobs/ directory
if (jobsDir.existsSync()) {
// Try literally (e.g. "skills/skill.yaml")
final path1 = p.join(jobsDir.path, job);
if (File(path1).existsSync()) return p.normalize(path1);

// Try with .yaml extension (e.g. "skills/skill" -> "skills/skill.yaml")
final path2 = '$path1.yaml';
if (File(path2).existsSync()) return p.normalize(path2);
}

// 2. Try as absolute or relative to dataset root
// (only if it contains a slash or ends in .yaml to avoid ambiguous discovery)
if (job.contains('/') || job.endsWith('.yaml')) {
final jobPath = p.isAbsolute(job) ? job : p.join(datasetRoot, job);
if (!File(jobPath).existsSync()) {
throw FileSystemException('Job file not found', jobPath);
}
return p.normalize(jobPath);
if (File(jobPath).existsSync()) return p.normalize(jobPath);
}

// Look in jobs/ directory
final jobsDir = Directory(p.join(datasetRoot, 'jobs'));
if (!jobsDir.existsSync()) {
throw FileSystemException(
'Jobs directory not found. '
'Create it or specify a full path to the job file.',
jobsDir.path,
);
// List available jobs for helpful error message (top-level only for now)
var available = <String>[];
if (jobsDir.existsSync()) {
available = jobsDir
.listSync()
.whereType<File>()
.where((f) => f.path.endsWith('.yaml'))
.map((f) => p.basenameWithoutExtension(f.path))
.toList();
}

// Try with .yaml extension
final withExt = File(p.join(jobsDir.path, '$job.yaml'));
if (withExt.existsSync()) return p.normalize(withExt.path);

// Try without extension (maybe they included it)
final withoutExt = File(p.join(jobsDir.path, job));
if (withoutExt.existsSync()) return p.normalize(withoutExt.path);

// List available jobs for helpful error message
final available = jobsDir
.listSync()
.whereType<File>()
.where((f) => f.path.endsWith('.yaml'))
.map((f) => p.basenameWithoutExtension(f.path))
.toList();
throw FileSystemException(
"Job '$job' not found in ${jobsDir.path}. "
'Available jobs: ${available.isEmpty ? '(none)' : available}',
"Job '$job' not found. Checked 'jobs/' and dataset root. "
'Available top-level jobs: ${available.isEmpty ? '(none)' : available}',
);
}
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,12 @@ class EvalSetResolver {

// Create one ParsedTask per effective variant
for (final entry in effectiveVariants.entries) {
final variant = _resolveVariant(entry.key, entry.value, datasetRoot);
final variant = _resolveVariant(
entry.key,
entry.value,
datasetRoot,
taskId,
);

// Compute examples_dir from job log_dir
String? examplesDir;
Expand Down Expand Up @@ -489,14 +494,15 @@ class EvalSetResolver {
String name,
Map<String, dynamic> vDef,
String datasetRoot,
String taskId,
) {
if (vDef.isEmpty) return Variant(name: name);

// Load context files (with glob support)
final files = <ContextFile>[];
final cfPaths =
(vDef['files'] as List?)?.cast<String>() ?? const [];
for (final cfPath in cfPaths) {
final cfPaths = (vDef['files'] as List?)?.cast<String>() ?? const [];
for (var cfPath in cfPaths) {
cfPath = cfPath.replaceAll('{task_id}', taskId);
if (_isGlob(cfPath)) {
final matched = _expandGlobFiles(datasetRoot, cfPath);
if (matched.isEmpty) {
Expand All @@ -515,9 +521,9 @@ class EvalSetResolver {

// Resolve skill paths (with glob support)
final skills = <String>[];
final rawSkills =
(vDef['skills'] as List?)?.cast<String>() ?? const [];
for (final skillPathStr in rawSkills) {
final rawSkills = (vDef['skills'] as List?)?.cast<String>() ?? const [];
for (var skillPathStr in rawSkills) {
skillPathStr = skillPathStr.replaceAll('{task_id}', taskId);
if (_isGlob(skillPathStr)) {
final matched = _expandGlobDirs(datasetRoot, skillPathStr);
final validDirs = matched
Expand Down
Loading
Loading