EntityProcess
diff --git a/apps/web/src/content/docs/evaluation/eval-cases.mdx b/apps/web/src/content/docs/evaluation/eval-cases.mdx
Lines changed: 8 additions & 8 deletions
diff --git a/apps/web/src/content/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/evaluation/eval-files.mdx
Lines changed: 1 addition & 1 deletion
diff --git a/apps/web/src/content/docs/targets/configuration.mdx b/apps/web/src/content/docs/targets/configuration.mdx
Lines changed: 21 additions & 18 deletions
diff --git a/examples/showcase/cross-repo-sync/.agentv/targets.yaml b/examples/showcase/cross-repo-sync/.agentv/targets.yaml
Lines changed: 5 additions & 0 deletions
diff --git a/examples/showcase/cross-repo-sync/README.md b/examples/showcase/cross-repo-sync/README.md
Lines changed: 49 additions & 0 deletions
diff --git a/examples/showcase/cross-repo-sync/bun.lock b/examples/showcase/cross-repo-sync/bun.lock
Lines changed: 80 additions & 0 deletions
diff --git a/examples/showcase/cross-repo-sync/evals/dataset.eval.yaml b/examples/showcase/cross-repo-sync/evals/dataset.eval.yaml
Lines changed: 82 additions & 0 deletions
@@ -29,7 +29,7 @@ tests:
 | `expected_output` | No | Expected response for comparison (string, object, or message array). Alias: `expected_output` |
 | `execution` | No | Per-case execution overrides (for example `target`, `skip_defaults`) |
 | `workspace` | No | Per-case workspace config (overrides suite-level) |
-| `metadata` | No | Arbitrary key-value pairs passed to setup/teardown scripts |
+| `metadata` | No | Arbitrary key-value pairs passed to lifecycle scripts |
 | `rubrics` | No | Structured evaluation criteria |
 | `assert` | No | Per-test evaluators |
 | `sidecar` | No | Additional metadata passed to evaluators |
@@ -117,28 +117,28 @@ Override the suite-level workspace config for individual tests. Test-level field
 
 ```yaml
 workspace:
-  setup:
+  before_all:
     script: ["bun", "run", "default-setup.ts"]
 
 tests:
   - id: case-1
     criteria: Should work
     input: Do something
     workspace:
-      setup:
+      before_all:
         script: ["bun", "run", "custom-setup.ts"]
 
   - id: case-2
     criteria: Should also work
     input: Do something else
-    # Inherits suite-level setup
+    # Inherits suite-level before_all
 ```
 
-See [Workspace Setup/Teardown](/targets/configuration/#workspace-setupteardown) for the full workspace config reference.
+See [Workspace Lifecycle Hooks](/targets/configuration/#workspace-lifecycle-hooks) for the full workspace config reference.
 
 ## Per-Case Metadata
 
-Pass arbitrary key-value pairs to setup/teardown scripts via the `metadata` field. This is useful for benchmark datasets where each case needs repo info, commit hashes, or other context:
+Pass arbitrary key-value pairs to lifecycle scripts via the `metadata` field. This is useful for benchmark datasets where each case needs repo info, commit hashes, or other context:
 
 ```yaml
 tests:
@@ -149,11 +149,11 @@ tests:
       repo: sympy/sympy
       base_commit: "abc123def"
     workspace:
-      setup:
+      before_all:
         script: ["python", "checkout_repo.py"]
 ```
 
-The `metadata` field is included in the stdin JSON passed to setup and teardown scripts as `case_metadata`.
+The `metadata` field is included in the stdin JSON passed to lifecycle scripts as `case_metadata`.
 
 ## Per-Test Assertions
 
 
@@ -35,7 +35,7 @@ tests:
 | `description` | Human-readable description of the evaluation |
 | `dataset` | Optional dataset identifier |
 | `execution` | Default execution config (for example `target`) |
-| `workspace` | Suite-level workspace config (setup/teardown scripts, template) |
+| `workspace` | Suite-level workspace config (lifecycle hooks, template) |
 | `tests` | Array of individual tests, or a string path to an external file |
 | `assert` | Suite-level evaluators appended to each test unless `execution.skip_defaults: true` is set on the test |
 
 
@@ -99,47 +99,54 @@ targets:
 ```
 
 When `workspace_template` is set:
-- The template directory is copied to `~/.agentv/workspaces/<eval-run-id>/<test-id>/`
+- The template directory is copied to `~/.agentv/workspaces/<eval-run-id>/shared/`
 - The `.git` directory is skipped during copy
-- Each test gets its own isolated copy
+- Tests share the workspace; use `after_each` to reset state between tests
 
-### Workspace Setup/Teardown
+### Workspace Lifecycle Hooks
 
-Run scripts before and after each test using the `workspace` block. This can be defined at the suite level (applies to all tests) or per test (overrides suite-level).
+Run scripts at different points in the evaluation lifecycle using the `workspace` block. This can be defined at the suite level (applies to all tests) or per test (overrides suite-level).
 
 ```yaml
 workspace:
   template: ./workspace-templates/my-project
-  setup:
+  before_all:
     script: ["bun", "run", "setup.ts"]
     timeout_ms: 120000
     cwd: ./scripts
-  teardown:
-    script: ["bun", "run", "teardown.ts"]
+  after_each:
+    script: ["bun", "run", "reset.ts"]
+    timeout_ms: 5000
+  after_all:
+    script: ["bun", "run", "cleanup.ts"]
     timeout_ms: 30000
 ```
 
 | Field | Description |
 |-------|-------------|
 | `template` | Directory to copy as workspace (alternative to target-level `workspace_template`) |
-| `setup` | Script to run after workspace creation, before the agent runs |
-| `teardown` | Script to run after evaluation, before cleanup |
+| `before_all` | Runs once after workspace creation, before the first test |
+| `after_all` | Runs once after the last test, before cleanup |
+| `before_each` | Runs before each test |
+| `after_each` | Runs after each test (e.g., reset workspace state for reuse) |
 
 Each script config accepts:
 
 | Field | Description |
 |-------|-------------|
 | `script` | Command array (e.g., `["bun", "run", "setup.ts"]`) |
-| `timeout_ms` | Timeout in milliseconds (default: 60000 for setup, 30000 for teardown) |
+| `timeout_ms` | Timeout in milliseconds (default: 60000 for `before_all`, 30000 for others) |
 | `cwd` | Working directory (relative paths resolved against eval file directory) |
 
-**Lifecycle order:** template copy → setup script → git baseline → agent runs → file changes captured → teardown script → cleanup
+**Lifecycle order:** template copy → `before_all` → git baseline → (`before_each` → agent runs → file changes captured → `after_each`) × N tests → `after_all` → cleanup
+
+**Shared workspace:** The workspace is created once and shared across all tests in a suite. Use `after_each` to reset state between tests (e.g., `git checkout . && git clean -fd`).
 
 **Error handling:**
-- Setup failure aborts the test with an error result
-- Teardown failure is non-fatal (warning only)
+- `before_all` / `before_each` failure aborts the test with an error result
+- `after_all` / `after_each` failure is non-fatal (warning only)
 
-**Script context:** Both scripts receive a JSON object on stdin with case context:
+**Script context:** All scripts receive a JSON object on stdin with case context:
 
 ```json
 {
@@ -153,10 +160,6 @@ Each script config accepts:
 
 **Suite vs per-test:** When both are defined, test-level fields replace suite-level fields. See [Per-Test Workspace Config](/evaluation/eval-cases/#per-case-workspace-config) for examples.
 
-### Workspace Fingerprinting
-
-After setup and git baseline initialization, AgentV computes a SHA-256 fingerprint of the workspace file tree. This fingerprint is included in the evaluation result as `workspaceFingerprint` and can be used to verify that workspaces are reproducible across runs.
-
 ### Cleanup Behavior
 
 By default:
 
@@ -0,0 +1,5 @@
+targets:
+  - name: mock_agent
+    provider: cli
+    command_template: bash mock-agent.sh {PROMPT} {OUTPUT_FILE}
+    timeout_seconds: 30
@@ -0,0 +1,49 @@
+# Cross-Repo Sync Showcase
+
+Evaluates whether a coding agent can keep two public repos in sync after one changes.
+
+## Scenario
+
+When **agentv** (EntityProcess/agentv) ships a feature, the **agentevals** (agentevals/agentevals) spec docs must be updated to reflect the change. This eval measures how well an agent handles that cross-repo synchronization.
+
+## Workspace Features Demonstrated
+
+| Feature | Usage |
+|---------|-------|
+| `workspace.template` | AGENTS.md + skills dir copied to workspace |
+| `workspace.before_each` | Clones agentevals at "before" state per test |
+| `workspace.after_each` | Resets git state between tests |
+| `metadata` | Commit SHAs passed to the `before_each` setup script via stdin JSON |
+| `fileChanges` | Unified diff of agent's edits |
+
+## Test Cases
+
+1. **eval-spec-v2-sync** — Add 4 deterministic assert types + required gates
+2. **cases-to-tests-sync** — Rename `cases` → `tests` across spec docs
+3. **schema-field-rename-sync** — Rename `eval_cases` → `cases`, `expected_outcome` → `criteria`/`outcome`
+
+## Running
+
+```bash
+bun install
+bun agentv eval ./evals/dataset.eval.yaml
+```
+
+## Structure
+
+```
+├── evals/
+│   ├── dataset.eval.yaml          # 3 test cases
+│   └── ground-truth/              # Real diffs from commit history
+├── workspace-template/
+│   ├── AGENTS.md                  # Multi-repo context
+│   └── skills/
+│       └── cross-repo-sync.md     # Sync skill
+├── scripts/
+│   ├── setup.ts                   # before_each: clone repo
+│   ├── reset.ts                   # after_each: git reset
+│   └── validate-sync.ts           # Code judge
+├── .agentv/
+│   └── targets.yaml               # Mock CLI agent
+└── package.json
+```
@@ -0,0 +1,82 @@
+name: cross-repo-sync
+description: Evaluate agent ability to sync agentv implementation with agentevals spec
+version: "1.0"
+tags: [showcase, workspace, cross-repo]
+
+workspace:
+  template: ../workspace-template
+  before_each:
+    script: ["bun", "run", "../scripts/setup.ts"]
+    timeout_ms: 900000
+    cwd: .
+  after_each:
+    script: ["bun", "run", "../scripts/reset.ts"]
+    timeout_ms: 5000
+    cwd: .
+
+execution:
+  target: mock_agent
+
+tests:
+  - id: eval-spec-v2-sync
+    metadata:
+      agentevals_before: "9f8aa3a"
+      ground_truth: ../evals/ground-truth/eval-spec-v2.diff
+    criteria: >-
+      Update agentevals spec to reflect eval spec v2: add contains/regex/is_json/equals
+      assert types, required gates for all evaluators, tests-as-string-path.
+    input:
+      - role: user
+        content: |
+          agentv just merged eval spec v2 (PR #262). Update the agentevals
+          spec docs to reflect: 4 new deterministic assert types, required
+          gates, assert field at test/suite level, tests-as-string-path.
+    assert:
+      - name: sync-check
+        type: code_judge
+        script: ["bun", "run", "../scripts/validate-sync.ts"]
+        expected_files_modified:
+          - agentevals/docs/src/content/docs/specification/evaluators.mdx
+          - agentevals/docs/src/content/docs/specification/eval-format.mdx
+        expected_keywords: [contains, regex, is_json, equals, required, assert]
+
+  - id: cases-to-tests-sync
+    metadata:
+      agentevals_before: "1aaa26f"
+      ground_truth: ../evals/ground-truth/cases-to-tests.diff
+    criteria: >-
+      Rename 'cases' to 'tests' throughout the agentevals spec docs.
+    input:
+      - role: user
+        content: |
+          agentv renamed cases→tests in the eval schema (PR #240).
+          Update all agentevals spec docs to match.
+    assert:
+      - name: sync-check
+        type: code_judge
+        script: ["bun", "run", "../scripts/validate-sync.ts"]
+        expected_files_modified:
+          - agentevals/docs/src/content/docs/specification/eval-format.mdx
+          - agentevals/docs/src/content/docs/specification/evalcase-schema.mdx
+        expected_keywords: [tests]
+
+  - id: schema-field-rename-sync
+    metadata:
+      agentevals_before: "81f4b44"
+      ground_truth: ../evals/ground-truth/schema-field-rename.diff
+    criteria: >-
+      Rename eval_cases→cases and expected_outcome→criteria/outcome in agentevals spec.
+    input:
+      - role: user
+        content: |
+          agentv renamed schema fields: eval_cases→cases, expected_outcome→criteria
+          at case level, expected_outcome→outcome at rubric level (PR #202).
+          Update agentevals spec docs accordingly.
+    assert:
+      - name: sync-check
+        type: code_judge
+        script: ["bun", "run", "../scripts/validate-sync.ts"]
+        expected_files_modified:
+          - agentevals/docs/src/content/docs/specification/eval-format.mdx
+          - agentevals/docs/src/content/docs/specification/evalcase-schema.mdx
+        expected_keywords: [cases, criteria, outcome]