@@ -208,3 +208,64 @@ def jsonl(self, output_path, **kwargs):
208208 assert rc == 0
209209 assert captured ["repair_exec_batch_target_pages" ] == 64
210210 assert captured ["repair_exec_batch_target_items" ] == 24
211+
212+
def test_full_pipeline_checkpoint_retries_empty_export_when_ocr_text_exists(
    tmp_path, monkeypatch
):
    """Check that ``jsonl`` is called a second time after an empty first export.

    ``checkpoint.Corpus`` is replaced by a stub whose first ``jsonl`` call
    writes an empty file and whose second call writes a single record. The
    test asserts that ``checkpoint.main`` returns 0, that ``jsonl`` ran
    exactly twice, and that the report lists one non-empty text row and one
    exported record.
    """
    calls = {"jsonl": 0}

    class DummyCorpus:
        """Stub standing in for ``checkpoint.Corpus`` during this test."""

        def __init__(self, input_dir, output_dir):
            self.input_dir = input_dir
            self.output_dir = output_dir

        def _metadata_path(self):
            # Create the parent directory here so every writer below can
            # call ``to_parquet`` without preparing the tree first.
            path = self.output_dir / "download_results" / "download_results.parquet"
            path.parent.mkdir(parents=True, exist_ok=True)
            return path

        def extract(self, **kwargs):
            # One document with no text that is still flagged for OCR.
            pd.DataFrame(
                [{"filename": "doc.pdf", "needs_ocr": True, "ocr_success": False, "text": ""}]
            ).to_parquet(self._metadata_path(), index=False)

        def clean(self, **kwargs):
            return None

        def ocr(self, **kwargs):
            # Overwrite the metadata: the same document now carries text.
            pd.DataFrame(
                [{"filename": "doc.pdf", "needs_ocr": False, "ocr_success": True, "text": "fixed text"}]
            ).to_parquet(self._metadata_path(), index=False)

        def jsonl(self, output_path, **kwargs):
            calls["jsonl"] += 1
            if calls["jsonl"] == 1:
                # First export is empty on purpose.
                output_path.write_text("", encoding="utf-8")
                return
            # Terminate the record with a bare newline so the file holds
            # exactly one well-formed JSONL line and no trailing fragment.
            output_path.write_text(json.dumps({"text": "fixed text"}) + "\n", encoding="utf-8")

    monkeypatch.setattr(checkpoint, "Corpus", DummyCorpus)

    input_dir = tmp_path / "in"
    input_dir.mkdir()
    output_dir = tmp_path / "out"
    export_path = tmp_path / "export.jsonl"
    report_path = tmp_path / "report.json"

    rc = checkpoint.main(
        [
            "--input-dir",
            str(input_dir),
            "--output-dir",
            str(output_dir),
            "--export-path",
            str(export_path),
            "--report-path",
            str(report_path),
        ]
    )

    assert rc == 0
    assert calls["jsonl"] == 2
    report = json.loads(report_path.read_text(encoding="utf-8"))
    assert report["post_ocr_counts"]["text_nonempty"] == 1
    assert report["export_records"] == 1
0 commit comments