Azure · posaninagendra · Feb 23, 2026 · Feb 22, 2026 · Feb 22, 2026 · Feb 23, 2026
@@ -24,7 +24,6 @@
 
 from .._utils import get_int_env_var as get_int
 
-
 LOGGER = logging.getLogger("run")
 MISSING_VALUE: Final[int] = sys.maxsize
 
@@ -153,12 +152,20 @@ def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
         total_lines = run.result.total_lines if run.result else 0
         failed_lines = run.result.failed_lines if run.result else 0
 
+        # Collect per-line error messages for failed lines
+        per_line_errors: Dict[int, str] = {}
+        if run.result and run.result.details:
+            for detail in run.result.details:
+                if detail.error and detail.error.exception:
+                    per_line_errors[detail.index] = str(detail.error.exception)
+
         return {
             "status": run.status.value,
             "duration": str(run.duration),
             "completed_lines": total_lines - failed_lines,
             "failed_lines": failed_lines,
             "log_path": None,
+            "per_line_errors": per_line_errors,
             "error_message": (
                 f"({run.result.error.blame.value}) {run.result.error.message}"
                 if run.result and run.result.error and run.result.error.blame

@@ -2364,7 +2364,7 @@ def _convert_single_row_to_aoai_format(
             top_sample = sample
 
     # Add error summaries if needed
-    _add_error_summaries(run_output_results, eval_run_summary, testing_criteria_metadata)
+    _add_error_summaries(run_output_results, eval_run_summary, testing_criteria_metadata, row_idx)
 
     return {
         "object": "eval.run.output_item",
@@ -2961,18 +2961,23 @@ def _add_error_summaries(
     run_output_results: List[Dict[str, Any]],
     eval_run_summary: Optional[Dict[str, Any]],
     testing_criteria_metadata: Dict[str, Any],
+    row_idx: int = 0,
 ) -> None:
     """Add error summaries to results for failed evaluations.
 
     This method processes evaluation run summary to add error result objects
     for criteria that failed during evaluation, ensuring proper error reporting.
+    When per-line errors are available, only the error for the current row is used;
+    rows that succeeded are not stamped with another row's error.
 
     :param run_output_results: List to append error result objects to
     :type run_output_results: List[Dict[str, Any]]
     :param eval_run_summary: Summary containing error information per criteria
     :type eval_run_summary: Optional[Dict[str, Any]]
     :param testing_criteria_metadata: Metadata about available testing criteria including metrics and types
     :type testing_criteria_metadata: Dict[str, Any]
+    :param row_idx: Zero-based index of the current row, used to look up per-line errors
+    :type row_idx: int
     :return: None (modifies run_output_results in place)
     :rtype: None
 
@@ -2981,12 +2986,14 @@ def _add_error_summaries(
         eval_run_summary = {
             "coherence": {
                 "error_code": "TIMEOUT",
-                "error_message": "Evaluation timed out"
+                "error_message": "Evaluation timed out",
+                "per_line_errors": {0: "Row 0 timed out"}
             }
         }
         testing_criteria_metadata = {
             "coherence": {"metrics": ["score"], "type": "quality"}
         }
+        row_idx = 0
 
     Example Output:
         run_output_results becomes [
@@ -3002,7 +3009,7 @@ def _add_error_summaries(
                 "sample": {
                     "error": {
                         "code": "TIMEOUT",
-                        "message": "Evaluation timed out"
+                        "message": "Row 0 timed out"
                     }
                 }
             }
@@ -3015,10 +3022,26 @@ def _add_error_summaries(
         if not isinstance(criteria_summary, dict) or criteria_summary.get("error_code") is None:
             continue
 
-        error_info = {
-            "code": criteria_summary.get("error_code"),
-            "message": criteria_summary.get("error_message"),
-        }
+        # Use per-line error if available, otherwise fall back to batch-level error
+        per_line_errors = criteria_summary.get("per_line_errors", {})
+        per_line_error_msg = per_line_errors.get(row_idx, None)
+
+        if per_line_error_msg is not None:
+            # This row has a specific error
+            error_info = {
+                "code": criteria_summary.get("error_code"),
+                "message": per_line_error_msg,
+            }
+        elif per_line_errors:
+            # Per-line errors exist but not for this row — this row succeeded, skip error stamping
+            continue
+        else:
+            # No per-line errors available, fall back to batch-level error
+            error_info = {
+                "code": criteria_summary.get("error_code"),
+                "message": criteria_summary.get("error_message"),
+            }
+
         sample = {"error": error_info} if error_info["code"] is not None else None
 
         metrics = testing_criteria_metadata.get(criteria_name, {}).get("metrics", [])