Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions .github/workflows/claude-evaluation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ on:
required: false
default: false
type: boolean
al-lsp:
description: "Enable AL LSP server"
required: false
default: false
type: boolean
repeat:
description: "Number of times to run sequentially (ignored for test runs)"
required: false
Expand Down Expand Up @@ -100,9 +105,9 @@ jobs:
node-version: 24

- name: Install AL Tool
if: ${{ inputs.al-mcp }}
if: ${{ inputs.al-mcp || inputs.al-lsp }}
run: |
dotnet tool install -g Microsoft.Dynamics.BusinessCentral.Development.Tools --version 17.0.33.55542
dotnet tool install -g Microsoft.Dynamics.BusinessCentral.Development.Tools --version 18.0.36.64936-beta
echo "$HOME\.dotnet\tools" >> $env:GITHUB_PATH

- name: Install Claude Code
Expand All @@ -120,7 +125,8 @@ jobs:
--category "${{ inputs.category }}" `
--repo-path "${{ steps.setup-env.outputs.repo_path }}" `
--output-dir "${{ env.EVALUATION_RESULTS_DIR }}" `
${{ inputs.al-mcp && '--al-mcp' || '' }}
${{ inputs.al-mcp && '--al-mcp' || '' }} `
${{ inputs.al-lsp && '--al-lsp' || '' }}

- name: Upload evaluation results
uses: actions/upload-artifact@v6
Expand Down Expand Up @@ -155,4 +161,4 @@ jobs:
workflow-file: claude-evaluation.yml
repeat: ${{ inputs.repeat }}
workflow-inputs: |
{"model": "${{ inputs.model }}", "category": "${{ inputs.category }}", "test-run": "${{ inputs.test-run }}", "al-mcp": "${{ inputs.al-mcp }}"}
{"model": "${{ inputs.model }}", "category": "${{ inputs.category }}", "test-run": "${{ inputs.test-run }}", "al-mcp": "${{ inputs.al-mcp }}", "al-lsp": "${{ inputs.al-lsp }}"}
2 changes: 1 addition & 1 deletion EXPERIMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ Trigger the evaluation workflow from the **Actions** tab:

- **Workflow:** `Evaluation with GitHub Copilot` or `Evaluation with Claude Code`
- **`test-run`:** `true` (default — runs 4 entries, ~10 min)
- **`model`**, **`category`**, **`al-mcp`**: as needed
- **`model`**, **`category`**, **`al-mcp`**, **`al-lsp`**: as needed

This catches configuration mistakes cheaply. Do not skip it.

Expand Down
15 changes: 13 additions & 2 deletions src/bcbench/agent/claude/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import yaml

from bcbench.agent.claude.metrics import parse_metrics
from bcbench.agent.shared import build_mcp_config, build_prompt, parse_tool_usage_from_hooks
from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, parse_tool_usage_from_hooks
from bcbench.config import get_config
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError, AgentTimeoutError
Expand All @@ -19,7 +19,14 @@


def run_claude_code(
entry: BaseDatasetEntry, model: str, category: EvaluationCategory, repo_path: Path, output_dir: Path, al_mcp: bool = False, container_name: str = "bcbench"
entry: BaseDatasetEntry,
model: str,
category: EvaluationCategory,
repo_path: Path,
output_dir: Path,
al_mcp: bool = False,
al_lsp: bool = False,
container_name: str = "bcbench",
) -> tuple[AgentMetrics | None, ExperimentConfiguration]:
"""Run Claude Code on a single dataset entry.

Expand All @@ -33,12 +40,14 @@ def run_claude_code(

prompt: str = build_prompt(entry, repo_path, claude_config, category, al_mcp=al_mcp)
mcp_config_json, mcp_server_names = build_mcp_config(claude_config, entry, repo_path, al_mcp=al_mcp, container_name=container_name)
lsp_plugin_dir: Path | None = build_al_lsp_plugin(entry, category, repo_path, AgentType.CLAUDE, al_lsp=al_lsp, container_name=container_name)
instructions_enabled: bool = setup_instructions_from_config(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
skills_enabled: bool = setup_agent_skills(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
custom_agent: str | None = setup_custom_agent(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
tool_log_path: Path = setup_hooks(repo_path, AgentType.CLAUDE, output_dir)
config = ExperimentConfiguration(
mcp_servers=mcp_server_names,
al_lsp_enabled=lsp_plugin_dir is not None,
custom_instructions=instructions_enabled,
skills_enabled=skills_enabled,
custom_agent=custom_agent,
Expand All @@ -65,6 +74,8 @@ def run_claude_code(
]
if mcp_config_json:
cmd_args.append(f"--mcp-config={mcp_config_json}")
if lsp_plugin_dir is not None:
cmd_args.append(f"--plugin-dir={lsp_plugin_dir}")
if custom_agent:
cmd_args.append(f"--agent={custom_agent}")
cmd_args.extend(
Expand Down
8 changes: 5 additions & 3 deletions src/bcbench/agent/copilot/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import yaml

from bcbench.agent.copilot.metrics import parse_metrics
from bcbench.agent.shared import build_lsp_config, build_mcp_config, build_prompt, parse_tool_usage_from_hooks
from bcbench.agent.shared import build_al_lsp_plugin, build_mcp_config, build_prompt, parse_tool_usage_from_hooks
from bcbench.config import get_config
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError, AgentTimeoutError
Expand Down Expand Up @@ -42,14 +42,14 @@ def run_copilot_agent(

prompt: str = build_prompt(entry, repo_path, copilot_config, category, al_mcp=al_mcp)
mcp_config_json, mcp_server_names = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp, container_name=container_name)
al_lsp_enabled: bool = build_lsp_config(entry, category, repo_path, al_lsp=al_lsp, container_name=container_name)
lsp_plugin_dir: Path | None = build_al_lsp_plugin(entry, category, repo_path, AgentType.COPILOT, al_lsp=al_lsp, container_name=container_name)
instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
skills_enabled: bool = setup_agent_skills(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
tool_log_path: Path = setup_hooks(repo_path, AgentType.COPILOT, output_dir)
config = ExperimentConfiguration(
mcp_servers=mcp_server_names,
al_lsp_enabled=al_lsp_enabled,
al_lsp_enabled=lsp_plugin_dir is not None,
custom_instructions=instructions_enabled,
skills_enabled=skills_enabled,
custom_agent=custom_agent,
Expand All @@ -76,6 +76,8 @@ def run_copilot_agent(
cmd_args.append("--no-custom-instructions")
if mcp_config_json:
cmd_args.append(f"--additional-mcp-config={mcp_config_json}")
if lsp_plugin_dir is not None:
cmd_args.append(f"--plugin-dir={lsp_plugin_dir}")
if custom_agent:
cmd_args.append(f"--agent={custom_agent}")

Expand Down
4 changes: 2 additions & 2 deletions src/bcbench/agent/shared/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""Shared code for CLI-based agents (Claude, Copilot)."""

from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks
from bcbench.agent.shared.lsp import build_lsp_config
from bcbench.agent.shared.lsp import build_al_lsp_plugin
from bcbench.agent.shared.mcp import build_mcp_config
from bcbench.agent.shared.prompt import build_prompt

__all__ = ["build_lsp_config", "build_mcp_config", "build_prompt", "parse_tool_usage_from_hooks"]
__all__ = ["build_al_lsp_plugin", "build_mcp_config", "build_prompt", "parse_tool_usage_from_hooks"]
92 changes: 59 additions & 33 deletions src/bcbench/agent/shared/lsp.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import shutil
from pathlib import Path

from bcbench.agent.shared.altool_paths import (
Expand All @@ -10,11 +11,16 @@
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError
from bcbench.logger import get_logger
from bcbench.types import EvaluationCategory
from bcbench.types import AgentType, EvaluationCategory

logger = get_logger(__name__)

_AL_LSP_RELATIVE_PATH = Path(".github") / "lsp.json"
# Per-task plugin folder location. Both Copilot CLI and Claude Code accept
# `--plugin-dir <path>` for ad-hoc plugin loading and both look for the
# manifest under `.claude-plugin/plugin.json`, so a single neutral path works
# for either agent. Lives under `.bcbench/` so it's visibly BC-Bench-owned
# and won't collide with either agent's auto-discovery paths.
_AL_LSP_PLUGIN_RELATIVE_PATH = Path(".bcbench") / "al-lsp-plugin"


def _resolve_symbol_paths(entry: BaseDatasetEntry, category: EvaluationCategory, container_name: str) -> tuple[list[str], list[str]]:
Expand Down Expand Up @@ -48,48 +54,68 @@ def _build_lsp_args(project_paths: list[str], package_cache_paths: list[str], as
return args


def build_lsp_config(entry: BaseDatasetEntry, category: EvaluationCategory, repo_path: Path, al_lsp: bool, container_name: str = "") -> bool:
"""Write Copilot's project-level LSP config to <repo_path>/.github/lsp.json.
def _lsp_config_for(agent_type: AgentType, args: list[str]) -> dict:
"""Build the agent-specific `.lsp.json` content.

When ``al_lsp=False``, removes any stale config left over from a previous run and returns False.
When True, writes the `lspServers.altool` entry pointing at `altool launchlspserver` and returns True.
Both agents launch the same `al launchlspserver` process — only the surrounding
LSP-routing schema differs:

- Copilot CLI expects `{ "lspServers": { name: { ..., "fileExtensions": {".ext": "lang"} } } }`
- Claude Code expects `{ name: { ..., "extensionToLanguage": {".ext": "lang"} } }` (no wrapper, different extension key)

`command: "al"` is unqualified by design: Copilot CLI silently rejects absolute paths in LSP
`command` ("Server <name> is configured but not available"), so the published `altool` wrapper
(`al`) must resolve via PATH on both sides.
"""
server = {"command": "al", "args": args}
match agent_type:
case AgentType.COPILOT:
return {"lspServers": {"altool": {**server, "fileExtensions": {".al": "al"}}}}
case AgentType.CLAUDE:
return {"altool": {**server, "extensionToLanguage": {".al": "al"}}}


def build_al_lsp_plugin(entry: BaseDatasetEntry, category: EvaluationCategory, repo_path: Path, agent_type: AgentType, al_lsp: bool, container_name: str = "") -> Path | None:
"""Build a per-task plugin folder containing the AL LSP server, return its path or None.

Both Copilot CLI and Claude Code load this via ``--plugin-dir <path>`` for a single session
— no marketplace registration, no global state, no cross-run plugin leakage. The plugin
folder layout is identical between agents; only the LSP-routing schema in ``.lsp.json``
differs (see :func:`_lsp_config_for`).

Layout written under ``<repo>/.bcbench/al-lsp-plugin/``::

.claude-plugin/plugin.json — minimal manifest (only ``name`` is required;
both CLIs check this path)
.lsp.json — LSP server config in the agent's schema

Returns the plugin folder path (to be passed as ``--plugin-dir``), or None when disabled.
"""
lsp_config_path = repo_path / _AL_LSP_RELATIVE_PATH
plugin_dir = repo_path / _AL_LSP_PLUGIN_RELATIVE_PATH

if not al_lsp:
if lsp_config_path.is_file():
lsp_config_path.unlink()
logger.info(f"Removed stale LSP config: {lsp_config_path}")
return False
if plugin_dir.exists():
shutil.rmtree(plugin_dir)
logger.info(f"Removed stale AL LSP plugin: {plugin_dir}")
return None

project_paths = [str(repo_path / p) for p in entry.project_paths]
set_runtime_version(project_paths)

package_cache_paths, assembly_probing_paths = _resolve_symbol_paths(entry, category, container_name)
args = _build_lsp_args(project_paths, package_cache_paths, assembly_probing_paths)

args = _build_lsp_args(
project_paths=project_paths,
package_cache_paths=package_cache_paths,
assembly_probing_paths=assembly_probing_paths,
)

# Copilot CLI resolves `command` via PATH (absolute paths are silently rejected with
# "Server <name> is configured but not available"). `al` is the published altool
# wrapper installed via the .NET tool — it must be on PATH.
lsp_config = {
"lspServers": {
"altool": {
"command": "al",
"args": args,
"fileExtensions": {".al": "al"},
}
}
plugin_manifest = {
"name": "al-lsp",
"version": "1.0.0",
"description": "AL Language Server for Business Central agentic development",
}
lsp_config = _lsp_config_for(agent_type, args)

lsp_config_path.parent.mkdir(parents=True, exist_ok=True)
lsp_config_path.write_text(json.dumps(lsp_config, indent=2), encoding="utf-8")
(plugin_dir / ".claude-plugin").mkdir(parents=True, exist_ok=True)
(plugin_dir / ".claude-plugin" / "plugin.json").write_text(json.dumps(plugin_manifest, indent=2), encoding="utf-8")
(plugin_dir / ".lsp.json").write_text(json.dumps(lsp_config, indent=2), encoding="utf-8")

logger.info(f"Wrote AL LSP config: {lsp_config_path}")
logger.info(f"Wrote AL LSP plugin for {agent_type.value}: {plugin_dir}")
logger.debug(f"LSP configuration: {json.dumps(lsp_config, indent=2)}")

return True
return plugin_dir
2 changes: 2 additions & 0 deletions src/bcbench/commands/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ def evaluate_claude_code(
output_dir: OutputDir = _config.paths.evaluation_results_path,
run_id: RunId = "claude_code_test_run",
al_mcp: Annotated[bool, typer.Option("--al-mcp", help="Enable AL MCP server")] = False,
al_lsp: Annotated[bool, typer.Option("--al-lsp", help="Enable AL LSP server")] = False,
) -> None:
"""
Evaluate Claude Code on single dataset entry.
Expand Down Expand Up @@ -139,6 +140,7 @@ def evaluate_claude_code(
model=ctx.model,
output_dir=ctx.result_dir,
al_mcp=al_mcp if ctx.container else False,
al_lsp=al_lsp,
container_name=ctx.get_container().name if ctx.container else "",
),
)
Expand Down
12 changes: 11 additions & 1 deletion src/bcbench/commands/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def run_claude(
repo_path: RepoPath = _config.paths.testbed_path,
output_dir: OutputDir = _config.paths.evaluation_results_path,
al_mcp: Annotated[bool, typer.Option("--al-mcp", help="Enable AL MCP server")] = False,
al_lsp: Annotated[bool, typer.Option("--al-lsp", help="Enable AL LSP server")] = False,
) -> None:
"""
Run Claude Code on a single entry to generate a patch (without building/testing).
Expand All @@ -77,4 +78,13 @@ def run_claude(
entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
category.pipeline.setup_workspace(entry, repo_path)

run_claude_code(entry=entry, repo_path=repo_path, model=model, category=category, output_dir=output_dir, al_mcp=al_mcp, container_name=container_name)
run_claude_code(
entry=entry,
repo_path=repo_path,
model=model,
category=category,
output_dir=output_dir,
al_mcp=al_mcp if container_name else False,
al_lsp=al_lsp,
container_name=container_name or "",
)
Loading