Skip to content

Commit dbea9d1

Browse files
committed
Harden full-pipeline export retries
1 parent a8d2b93 commit dbea9d1

3 files changed

Lines changed: 115 additions & 5 deletions

File tree

docs/operations/deepseek_gcp_a100_setup.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,24 @@ After correcting those bootstrap defects, the same fresh node was able to:
105105
- initialize a direct one-GPU `LLM(...)`
106106
- start a real `openarchives_ocr_run_node` workload with `runtime_backend=vllm`
107107

108+
The same node was also used for a real `10`-PDF `extract -> clean -> ocr`
109+
checkpoint:
110+
111+
- the stable end-to-end shape on that node was:
112+
- multi-GPU extraction
113+
- `workers_per_device=1`
114+
- multi-GPU DeepSeek OCR with `workers_per_gpu=1`
115+
- an isolated extraction benchmark with `workers_per_device=2` was faster on the
116+
same sample, but the first full-pipeline replay hit a Docling allocator crash:
117+
- `malloc_consolidate(): unaligned fastbin chunk detected`
118+
- treat `workers_per_device=2` as benchmark-only / experimental until it is
119+
proven stable in the full Corpus pipeline, not just in extract-only tests
120+
121+
The full-pipeline checkpoint harness also now retries the JSONL export when OCR
122+
has already filled text into parquet rows but the first export pass still emits
123+
zero records. This guards the observed end-of-run export race on the benchmark
124+
node without changing the OCR output contract itself.
125+
108126
## Current runner expectation
109127

110128
`glossapi.ocr.deepseek.runner._build_env()` now auto-discovers

src/glossapi/scripts/full_pipeline_checkpoint.py

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,37 @@ def _count_jsonl_records(path: Path) -> int:
106106
return sum(1 for line in fp if line.strip())
107107

108108

109+
def _export_jsonl_with_retry(
    corpus: "Corpus",
    *,
    export_path: Path,
    metadata_path: Path,
    text_key: str,
    metadata_key: str,
    post_ocr_counts: Dict[str, int],
    max_attempts: int = 4,
    retry_delay_sec: float = 1.0,
) -> int:
    """Export corpus rows to JSONL, retrying when a race yields zero records.

    OCR can finish filling text into the parquet rows shortly before the
    export reads them; an end-of-run race was observed where the first
    export pass emitted zero records even though non-empty text existed.
    When ``post_ocr_counts`` shows non-empty text, the export is retried up
    to ``max_attempts`` times with ``retry_delay_sec`` seconds between
    passes.  The OCR output contract itself is unchanged.

    Args:
        corpus: Corpus whose ``jsonl`` method performs the export.
        export_path: Destination JSONL file; recreated on every attempt.
        metadata_path: Parquet metadata file forwarded to ``corpus.jsonl``.
        text_key: JSON key under which the record text is written.
        metadata_key: JSON key under which per-record metadata is written.
        post_ocr_counts: Post-OCR metadata counts; only ``text_nonempty``
            is consulted to decide whether retrying is warranted.
        max_attempts: Upper bound on export passes when retrying (clamped
            to at least one pass).
        retry_delay_sec: Sleep between consecutive passes.

    Returns:
        Number of non-blank records in the final export file (0 when every
        attempt produced an empty file).
    """
    # Retry only when metadata proves there is text to export: an all-empty
    # corpus legitimately yields zero records on the first (and only) pass.
    needs_retry = int(post_ocr_counts.get("text_nonempty", 0) or 0) > 0
    # Clamp so a non-positive max_attempts still performs one export pass
    # rather than skipping the loop entirely and returning 0 with no file.
    attempts = max(1, max_attempts) if needs_retry else 1

    for attempt in range(attempts):
        # Start each pass from a clean slate so a stale or partial file from
        # a previous attempt cannot inflate the record count.
        if export_path.exists():
            export_path.unlink()
        corpus.jsonl(
            export_path,
            text_key=text_key,
            metadata_key=metadata_key,
            include_remaining_metadata=False,
            metadata_path=metadata_path,
        )
        export_records = _count_jsonl_records(export_path)
        if export_records > 0 or attempt == attempts - 1:
            return export_records
        time.sleep(retry_delay_sec)
    # Unreachable: the loop always returns on its final iteration; kept as a
    # defensive fallback for static analysis.
    return 0
138+
139+
109140
def main(argv: Optional[List[str]] = None) -> int:
110141
args = _parse_args(argv)
111142
_apply_cli_tuning_overrides(args)
@@ -178,15 +209,15 @@ def main(argv: Optional[List[str]] = None) -> int:
178209
post_ocr_counts = _read_metadata_counts(metadata_path)
179210

180211
export_start = time.perf_counter()
181-
corpus.jsonl(
182-
export_path,
212+
export_records = _export_jsonl_with_retry(
213+
corpus,
214+
export_path=export_path,
215+
metadata_path=metadata_path,
183216
text_key=str(args.text_key),
184217
metadata_key=str(args.metadata_key),
185-
include_remaining_metadata=False,
186-
metadata_path=metadata_path,
218+
post_ocr_counts=post_ocr_counts,
187219
)
188220
export_elapsed = float(time.perf_counter() - export_start)
189-
export_records = _count_jsonl_records(export_path)
190221

191222
finished_at = time.time()
192223
report: Dict[str, Any] = {

tests/test_full_pipeline_checkpoint.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,3 +208,64 @@ def jsonl(self, output_path, **kwargs):
208208
assert rc == 0
209209
assert captured["repair_exec_batch_target_pages"] == 64
210210
assert captured["repair_exec_batch_target_items"] == 24
211+
212+
213+
def test_full_pipeline_checkpoint_retries_empty_export_when_ocr_text_exists(tmp_path, monkeypatch):
    """An empty first export is retried once OCR has produced non-empty text."""
    jsonl_calls = {"count": 0}

    class DummyCorpus:
        """Stand-in Corpus: OCR fills in text, but the first export pass
        simulates the observed race by emitting an empty file."""

        def __init__(self, input_dir, output_dir):
            self.input_dir = input_dir
            self.output_dir = output_dir

        def _metadata_path(self):
            # Mirror the on-disk layout the pipeline uses for its metadata.
            path = self.output_dir / "download_results" / "download_results.parquet"
            path.parent.mkdir(parents=True, exist_ok=True)
            return path

        def extract(self, **kwargs):
            rows = [{"filename": "doc.pdf", "needs_ocr": True, "ocr_success": False, "text": ""}]
            pd.DataFrame(rows).to_parquet(self._metadata_path(), index=False)

        def clean(self, **kwargs):
            return None

        def ocr(self, **kwargs):
            rows = [{"filename": "doc.pdf", "needs_ocr": False, "ocr_success": True, "text": "fixed text"}]
            pd.DataFrame(rows).to_parquet(self._metadata_path(), index=False)

        def jsonl(self, output_path, **kwargs):
            jsonl_calls["count"] += 1
            if jsonl_calls["count"] == 1:
                # First pass: file written but contains zero records.
                output_path.write_text("", encoding="utf-8")
                return
            output_path.write_text(json.dumps({"text": "fixed text"}) + "\n", encoding="utf-8")

    monkeypatch.setattr(checkpoint, "Corpus", DummyCorpus)

    input_dir = tmp_path / "in"
    input_dir.mkdir()
    output_dir = tmp_path / "out"
    export_path = tmp_path / "export.jsonl"
    report_path = tmp_path / "report.json"

    argv = [
        "--input-dir",
        str(input_dir),
        "--output-dir",
        str(output_dir),
        "--export-path",
        str(export_path),
        "--report-path",
        str(report_path),
    ]
    rc = checkpoint.main(argv)

    assert rc == 0
    # Exactly one retry: the empty first export plus one successful pass.
    assert jsonl_calls["count"] == 2
    report = json.loads(report_path.read_text(encoding="utf-8"))
    assert report["post_ocr_counts"]["text_nonempty"] == 1
    assert report["export_records"] == 1

0 commit comments

Comments
 (0)