Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number | Diff line number | Diff line change
Expand Up @@ -24,7 +24,6 @@

from .._utils import get_int_env_var as get_int


LOGGER = logging.getLogger("run")
MISSING_VALUE: Final[int] = sys.maxsize

Expand Down Expand Up @@ -153,12 +152,20 @@ def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
total_lines = run.result.total_lines if run.result else 0
failed_lines = run.result.failed_lines if run.result else 0

# Collect per-line error messages for failed lines
per_line_errors: Dict[int, str] = {}
if run.result and run.result.details:
for detail in run.result.details:
if detail.error and detail.error.exception:
per_line_errors[detail.index] = str(detail.error.exception)

return {
"status": run.status.value,
"duration": str(run.duration),
"completed_lines": total_lines - failed_lines,
"failed_lines": failed_lines,
"log_path": None,
"per_line_errors": per_line_errors,
"error_message": (
f"({run.result.error.blame.value}) {run.result.error.message}"
if run.result and run.result.error and run.result.error.blame
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2364,7 +2364,7 @@ def _convert_single_row_to_aoai_format(
top_sample = sample

# Add error summaries if needed
_add_error_summaries(run_output_results, eval_run_summary, testing_criteria_metadata)
_add_error_summaries(run_output_results, eval_run_summary, testing_criteria_metadata, row_idx)

return {
"object": "eval.run.output_item",
Expand Down Expand Up @@ -2961,18 +2961,23 @@ def _add_error_summaries(
run_output_results: List[Dict[str, Any]],
eval_run_summary: Optional[Dict[str, Any]],
testing_criteria_metadata: Dict[str, Any],
row_idx: int = 0,
) -> None:
"""Add error summaries to results for failed evaluations.

This method processes evaluation run summary to add error result objects
for criteria that failed during evaluation, ensuring proper error reporting.
When per-line errors are available, only the error for the current row is used;
rows that succeeded are not stamped with another row's error.

:param run_output_results: List to append error result objects to
:type run_output_results: List[Dict[str, Any]]
:param eval_run_summary: Summary containing error information per criteria
:type eval_run_summary: Optional[Dict[str, Any]]
:param testing_criteria_metadata: Metadata about available testing criteria including metrics and types
:type testing_criteria_metadata: Dict[str, Any]
:param row_idx: Zero-based index of the current row, used to look up per-line errors
:type row_idx: int
:return: None (modifies run_output_results in place)
:rtype: None

Expand All @@ -2981,12 +2986,14 @@ def _add_error_summaries(
eval_run_summary = {
"coherence": {
"error_code": "TIMEOUT",
"error_message": "Evaluation timed out"
"error_message": "Evaluation timed out",
"per_line_errors": {0: "Row 0 timed out"}
}
}
testing_criteria_metadata = {
"coherence": {"metrics": ["score"], "type": "quality"}
}
row_idx = 0

Example Output:
run_output_results becomes [
Expand All @@ -3002,7 +3009,7 @@ def _add_error_summaries(
"sample": {
"error": {
"code": "TIMEOUT",
"message": "Evaluation timed out"
"message": "Row 0 timed out"
}
}
}
Expand All @@ -3015,10 +3022,26 @@ def _add_error_summaries(
if not isinstance(criteria_summary, dict) or criteria_summary.get("error_code") is None:
continue

error_info = {
"code": criteria_summary.get("error_code"),
"message": criteria_summary.get("error_message"),
}
# Use per-line error if available, otherwise fall back to batch-level error
per_line_errors = criteria_summary.get("per_line_errors", {})
per_line_error_msg = per_line_errors.get(row_idx, None)

if per_line_error_msg is not None:
# This row has a specific error
error_info = {
"code": criteria_summary.get("error_code"),
"message": per_line_error_msg,
}
elif per_line_errors:
# Per-line errors exist but not for this row — this row succeeded, skip error stamping
continue
else:
# No per-line errors available, fall back to batch-level error
error_info = {
"code": criteria_summary.get("error_code"),
"message": criteria_summary.get("error_message"),
}

sample = {"error": error_info} if error_info["code"] is not None else None

metrics = testing_criteria_metadata.get(criteria_name, {}).get("metrics", [])
Expand Down