Agenta-AI · bekossy · May 26, 2026 · May 16, 2026 · May 17, 2026 · May 17, 2026
diff --git a/docs/designs/eval-scenarios-table-integration.md b/docs/designs/eval-scenarios-table-integration.md
diff --git a/web/oss/src/components/EvalRunDetails/Table.tsx b/web/oss/src/components/EvalRunDetails/Table.tsx
diff --git a/web/oss/src/components/EvalRunDetails/atoms/metrics.ts b/web/oss/src/components/EvalRunDetails/atoms/metrics.ts
@@ -13,6 +13,7 @@ import {getProjectValues} from "@/oss/state/project"
 import {previewEvalTypeAtom} from "../state/evalType"
 import {resolveValueBySegments, splitPath} from "../utils/valueAccess"
 
+import {isTerminalStatus} from "./compare"
 import {
     createMetricProcessor,
     isLegacyValueLeaf,
@@ -782,13 +783,23 @@ export const evaluationMetricQueryAtomFamily = atomFamily(
             const batcher = get(evaluationMetricBatcherFamily({runId: effectiveRunId}))
             const projectId = resolveProjectId(get)
 
+            // While the run is still executing, poll so a completing
+            // scenario's metrics surface in the table cells + focus drawer
+            // without a manual reload. Stops once the run is terminal.
+            const runQuery = effectiveRunId
+                ? get(evaluationRunQueryAtomFamily(effectiveRunId))
+                : undefined
+            const runStatus = runQuery?.data?.rawRun?.status ?? runQuery?.data?.camelRun?.status
+            const runTerminal = isTerminalStatus(runStatus)
+
             return {
                 queryKey: ["preview", "evaluation-metric", effectiveRunId, projectId, scenarioId],
                 enabled: Boolean(projectId && effectiveRunId && batcher && scenarioId),
                 staleTime: 30_000,
                 gcTime: 5 * 60 * 1000,
                 refetchOnWindowFocus: false,
                 refetchOnReconnect: false,
+                refetchInterval: runTerminal ? false : 5000,
                 // Enable structural sharing to prevent unnecessary re-renders when data hasn't changed
                 structuralSharing: true,
                 queryFn: async () => {

diff --git a/web/oss/src/components/EvalRunDetails/atoms/scenarioSteps.ts b/web/oss/src/components/EvalRunDetails/atoms/scenarioSteps.ts
@@ -8,7 +8,9 @@ import type {IStepResponse} from "@/oss/lib/evaluations"
 import {snakeToCamelCaseKeys} from "@/oss/lib/helpers/casing"
 import {getProjectValues} from "@/oss/state/project"
 
+import {isTerminalStatus} from "./compare"
 import {activePreviewRunIdAtom, effectiveProjectIdAtom} from "./run"
+import {evaluationRunQueryAtomFamily} from "./table/run"
 import type {ScenarioStepsBatchResult} from "./types"
 
 const scenarioStepsBatcherCache = new Map<string, BatchFetcher<string, ScenarioStepsBatchResult>>()
@@ -128,11 +130,21 @@ export const scenarioStepsQueryFamily = atomFamily(
             const effectiveRunId = resolveEffectiveRunId(get, runId)
             const batcher = get(scenarioStepsBatcherFamily({runId: effectiveRunId}))
 
+            // While the run is still executing, poll so the focus drawer /
+            // scenario viewer pick up a scenario's results as it completes.
+            // Stops once the run is terminal.
+            const runQuery = effectiveRunId
+                ? get(evaluationRunQueryAtomFamily(effectiveRunId))
+                : undefined
+            const runStatus = runQuery?.data?.rawRun?.status ?? runQuery?.data?.camelRun?.status
+            const runTerminal = isTerminalStatus(runStatus)
+
             return {
                 queryKey: ["preview", "scenario-steps", effectiveRunId, scenarioId],
                 enabled: Boolean(effectiveRunId && batcher && scenarioId),
                 refetchOnWindowFocus: false,
                 refetchOnReconnect: false,
+                refetchInterval: runTerminal ? false : 5000,
                 staleTime: 30_000,
                 gcTime: 5 * 60 * 1000,
                 // Enable structural sharing to prevent unnecessary re-renders when data hasn't changed

diff --git a/web/oss/src/components/EvalRunDetails/atoms/table/columns.ts b/web/oss/src/components/EvalRunDetails/atoms/table/columns.ts
@@ -499,9 +499,19 @@ const tableColumnsBaseAtomFamily = atomFamily((runId: string | null) =>
             }
 
             const evaluator = column.evaluatorId ? evaluatorById.get(column.evaluatorId) : undefined
+            // Match the evaluator's metric definition by the canonical
+            // metric key (e.g. "attributes.ag.data.outputs.score") OR the
+            // bare value key (e.g. "score"). `extractMetrics` keys metrics
+            // by the output-schema property name — the bare key — so a
+            // canonical-key-only match misses and `metricType` falls back
+            // to "string", mis-typing the column (e.g. a boolean output).
             const metricKey = column.metricKey || column.valueKey
             const metricDefinition = evaluator?.metrics.find(
-                (metric) => metric.name === metricKey || metric.path === metricKey,
+                (metric) =>
+                    metric.name === metricKey ||
+                    metric.path === metricKey ||
+                    metric.name === column.valueKey ||
+                    metric.path === column.valueKey,
             )
             const metricType =
                 metricDefinition?.metricType || column.metricType || METRIC_TYPE_FALLBACK

diff --git a/web/oss/src/components/EvalRunDetails/components/Page.tsx b/web/oss/src/components/EvalRunDetails/components/Page.tsx
@@ -140,7 +140,7 @@ const EvalRunPreviewPage = ({runId, evaluationType, projectId = null}: EvalRunPr
             headerClassName="px-4 pt-2"
         >
             <div className="flex h-full min-h-0 flex-col gap-2 [&_.ant-tabs-content]:h-full [&_.ant-tabs-tabpane]:h-full">
-                <PreviewEvalRunMeta runId={runId} projectId={projectId} />
+                <PreviewEvalRunMeta runId={runId} projectId={projectId} activeView={activeView} />
                 <Tabs
                     className="flex-1 min-h-0 overflow-hidden"
                     activeKey={activeView}

diff --git a/web/oss/src/components/EvalRunDetails/components/PreviewEvalRunHeader.tsx b/web/oss/src/components/EvalRunDetails/components/PreviewEvalRunHeader.tsx
@@ -17,6 +17,7 @@ import {
     runTestsetIdsAtomFamily,
     runFlagsAtomFamily,
 } from "../atoms/runDerived"
+import ScenarioFilterBar from "../etl/ScenarioFilterBar"
 import {previewEvalTypeAtom} from "../state/evalType"
 
 import CompareRunsMenu from "./CompareRunsMenu"
@@ -137,10 +138,12 @@ const PreviewEvalRunMeta = ({
     runId,
     projectId,
     className,
+    activeView,
 }: {
     runId: string
     projectId?: string | null
     className?: string
+    activeView?: ActiveView
 }) => {
     const _invocationRefs = useAtomValue(useMemo(() => runInvocationRefsAtomFamily(runId), [runId]))
     const _testsetIds = useAtomValue(useMemo(() => runTestsetIdsAtomFamily(runId), [runId]))
@@ -220,6 +223,7 @@ const PreviewEvalRunMeta = ({
                         </Button>
                     </Tooltip>
                 ) : null}
+                {activeView === "scenarios" ? <ScenarioFilterBar runId={runId} /> : null}
                 <CompareRunsMenu runId={runId} />
             </div>
         </div>

diff --git a/web/oss/src/components/EvalRunDetails/etl/EtlColumnHeader.tsx b/web/oss/src/components/EvalRunDetails/etl/EtlColumnHeader.tsx
@@ -0,0 +1,74 @@
+/**
+ * EtlColumnHeader
+ *
+ * Renders the nested-header label for a column group. The default
+ * `computeColumnGroup` resolver falls back to `Testset <slug>` /
+ * `Application <slug>` because it doesn't fetch the entity itself.
+ *
+ * This header overrides that fallback, following the same pattern as
+ * production's `StepGroupHeader`: subscribe to the entity reference atom
+ * by ID and surface the entity's name when available, falling back to the
+ * slug-based label otherwise. Evaluator + metrics + other groups already
+ * carry `slugToTitle`-rendered labels, so no entity lookup is needed.
+ */
+
+import {useMemo} from "react"
+
+import type {ColumnGroup} from "@agenta/entities/evaluationRun/etl"
+import {Tooltip} from "antd"
+import {atom, useAtomValue} from "jotai"
+
+import {
+    applicationReferenceQueryAtomFamily,
+    testsetReferenceQueryAtomFamily,
+} from "../atoms/references"
+
+// Stand-in atom initialised to `null`. `EtlColumnHeader` subscribes to it when
+// a group has no testset/application reference id, so no entity name resolves
+// and the group's own label is shown.
+const emptyAtom = atom<{data: {name?: string; slug?: string} | null} | null>(null)
+
+/** Props accepted by `EtlColumnHeader`. */
+interface EtlColumnHeaderProps {
+    /** Column group whose `kind`, `refs`, and `label` drive the rendered header text. */
+    group: ColumnGroup
+}
+
+/**
+ * Extracts a display name from an arbitrary value.
+ *
+ * @param entity - Any value; only non-null objects are inspected.
+ * @returns The `name` property when it is a non-empty string, otherwise `null`.
+ */
+const pickName = (entity: unknown): string | null => {
+    const isObjectLike = typeof entity === "object" && entity !== null
+    if (!isObjectLike) return null
+
+    const candidate = (entity as {name?: unknown}).name
+    if (typeof candidate !== "string" || candidate === "") return null
+    return candidate
+}
+
+/**
+ * Header cell for a column group.
+ *
+ * Testset and application groups show `Testset <name>` / `Application <name>`
+ * once the referenced entity's name is available; every other case renders
+ * `group.label` unchanged. The label is also used as the tooltip title.
+ */
+const EtlColumnHeader = ({group}: EtlColumnHeaderProps) => {
+    // Choose which atom to subscribe to: a reference query atom when the group
+    // carries a testset/application id, otherwise the `null` placeholder.
+    const entityAtom = useMemo(() => {
+        switch (group.kind) {
+            case "testset": {
+                const testsetId = (group.refs?.testset as {id?: string} | undefined)?.id
+                return testsetId ? testsetReferenceQueryAtomFamily(testsetId) : emptyAtom
+            }
+            case "application": {
+                const appId = (group.refs?.application as {id?: string} | undefined)?.id
+                return appId ? applicationReferenceQueryAtomFamily(appId) : emptyAtom
+            }
+            default:
+                return emptyAtom
+        }
+    }, [group])
+
+    const entityQuery = useAtomValue(entityAtom) as {data?: unknown} | null
+    const entityName = pickName(entityQuery?.data ?? null)
+
+    const headerText = useMemo(() => {
+        // Without a resolved name every kind falls back to the precomputed label.
+        if (!entityName) return group.label
+        if (group.kind === "testset") return `Testset ${entityName}`
+        if (group.kind === "application") return `Application ${entityName}`
+        return group.label
+    }, [group.kind, group.label, entityName])
+
+    return (
+        <Tooltip title={headerText} placement="top">
+            <span className="block max-w-full overflow-hidden text-ellipsis whitespace-nowrap text-left">
+                {headerText}
+            </span>
+        </Tooltip>
+    )
+}
+
+export default EtlColumnHeader