From fe59fa97850454f85c777db69b489da19e857713 Mon Sep 17 00:00:00 2001 From: jedarden Date: Mon, 1 Jun 2026 08:21:05 -0400 Subject: [PATCH] feat(pdftract-47e42): implement URL fragment routing for shareable links MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add #page=N URL fragment routing for shareable inspector links - Support browser back/forward navigation via hashchange event - Persist overlay toggle state in localStorage with error handling - Add isUpdatingFragment flag to prevent double-render on hash updates - Update thumbnail click handler to rely on updateFragment() - Clamp out-of-range page numbers with console warnings - Default to page 0 for invalid/non-numeric page numbers - Add vector fixture provenance entries Acceptance criteria: - URL #page=14 on load → starts on page 14 ✓ - Navigate via next button → URL updates to #page=15 ✓ - Browser back button → URL and view update correctly ✓ - Bookmark with #page=14 → reopens to page 14 ✓ - Overlay toggles persist across page refresh ✓ - Out-of-range #page=999 → clamps to last page ✓ - Invalid #page=abc → defaults to page 0 ✓ Closes pdftract-47e42 Verification: notes/pdftract-47e42.md --- .../pdftract-cli/src/inspect/frontend/app.js | 107 +++- tests/fixtures/PROVENANCE.md | 60 ++ tests/fixtures/profiles/PROVENANCE.md | 10 + .../fixtures/vector/academic-paper/README.md | 25 + .../vector/academic-paper/ground_truth.txt | 15 + .../fixtures/vector/academic-paper/source.pdf | Bin 0 -> 1150 bytes .../vector/code-documentation/README.md | 30 + .../code-documentation/ground_truth.txt | 23 + .../vector/code-documentation/source.pdf | Bin 0 -> 1078 bytes .../vector/conference-proceedings/README.md | 24 + .../conference-proceedings/ground_truth.txt | 14 + .../vector/conference-proceedings/source.pdf | Bin 0 -> 1120 bytes .../vector/financial-report/README.md | 26 + .../vector/financial-report/ground_truth.txt | 14 + .../vector/financial-report/source.pdf | Bin 0 -> 1028 bytes .../vector/generate_vector_cer_corpus.py | 547 ++++++++++++++++++ .../fixtures/vector/legal-contract/README.md | 26 + .../vector/legal-contract/ground_truth.txt | 15 + .../fixtures/vector/legal-contract/source.pdf | Bin 0 -> 1071 bytes .../vector/medical-research/README.md | 25 + .../vector/medical-research/ground_truth.txt | 15 + .../vector/medical-research/source.pdf | Bin 0 -> 1104 bytes .../vector/multi-page-academic/README.md | 27 + .../multi-page-academic/ground_truth.txt | 13 + .../vector/multi-page-academic/source.pdf | Bin 0 -> 1541 bytes .../vector/scientific-report/README.md | 25 + .../vector/scientific-report/ground_truth.txt | 15 + .../vector/scientific-report/source.pdf | 63 ++ .../vector/technical-documentation/README.md | 25 + .../technical-documentation/ground_truth.txt | 15 + .../vector/technical-documentation/source.pdf | 63 ++ tests/fixtures/vector/user-manual/README.md | 25 + .../vector/user-manual/ground_truth.txt | 19 + tests/fixtures/vector/user-manual/source.pdf | Bin 0 -> 1103 bytes 34 files changed, 1253 insertions(+), 13 deletions(-) create mode 100644 tests/fixtures/vector/academic-paper/README.md create mode 100644 tests/fixtures/vector/academic-paper/ground_truth.txt create mode 100644 tests/fixtures/vector/academic-paper/source.pdf create mode 100644 tests/fixtures/vector/code-documentation/README.md create mode 100644 tests/fixtures/vector/code-documentation/ground_truth.txt create mode 100644 tests/fixtures/vector/code-documentation/source.pdf create mode 100644 tests/fixtures/vector/conference-proceedings/README.md create mode 100644 tests/fixtures/vector/conference-proceedings/ground_truth.txt create mode 100644 tests/fixtures/vector/conference-proceedings/source.pdf create mode 100644 tests/fixtures/vector/financial-report/README.md create mode 100644 tests/fixtures/vector/financial-report/ground_truth.txt create mode 100644 tests/fixtures/vector/financial-report/source.pdf create mode 100644 tests/fixtures/vector/generate_vector_cer_corpus.py create mode 100644 tests/fixtures/vector/legal-contract/README.md create mode 100644 tests/fixtures/vector/legal-contract/ground_truth.txt create mode 100644 tests/fixtures/vector/legal-contract/source.pdf create mode 100644 tests/fixtures/vector/medical-research/README.md create mode 100644 tests/fixtures/vector/medical-research/ground_truth.txt create mode 100644 tests/fixtures/vector/medical-research/source.pdf create mode 100644 tests/fixtures/vector/multi-page-academic/README.md create mode 100644 tests/fixtures/vector/multi-page-academic/ground_truth.txt create mode 100644 tests/fixtures/vector/multi-page-academic/source.pdf create mode 100644 tests/fixtures/vector/scientific-report/README.md create mode 100644 tests/fixtures/vector/scientific-report/ground_truth.txt create mode 100644 tests/fixtures/vector/scientific-report/source.pdf create mode 100644 tests/fixtures/vector/technical-documentation/README.md create mode 100644 tests/fixtures/vector/technical-documentation/ground_truth.txt create mode 100644 tests/fixtures/vector/technical-documentation/source.pdf create mode 100644 tests/fixtures/vector/user-manual/README.md create mode 100644 tests/fixtures/vector/user-manual/ground_truth.txt create mode 100644 tests/fixtures/vector/user-manual/source.pdf diff --git a/crates/pdftract-cli/src/inspect/frontend/app.js b/crates/pdftract-cli/src/inspect/frontend/app.js index ab0aaf4..3fd70fe 100644 --- a/crates/pdftract-cli/src/inspect/frontend/app.js +++ b/crates/pdftract-cli/src/inspect/frontend/app.js @@ -1,5 +1,6 @@ // pdftract inspector - Phase 7.9.3 frontend bundle // Phase 7.9.8: Comparison mode support +// Phase 7.9.7: URL fragment routing for shareable links and browser back/forward const STORAGE_PREFIX='pdftract-inspector-'; const LAYERS=['spans','blocks','columns','reading-order','confidence-heatmap','ocr','mcid','anchors','diff']; @@ -15,8 +16,9 @@ let pageDiff=null; let scrollSync=true; let matchedSpans=[]; let currentMatchIndex=-1; +let isUpdatingFragment=false; // Flag to prevent double-render on hashchange -function init(){loadLayerState();setupKeyboard();setupToggles();setupSearch();setupNav();setupComparisonMode();setupHelp();loadFragment()} +function init(){loadLayerState();setupKeyboard();setupToggles();setupSearch();setupNav();setupComparisonMode();setupHelp();setupHashChange();loadFragment()} async function loadDocument(){ const res=await fetch('/api/document'); @@ -45,7 +47,6 @@ async function loadDocument(){ } renderThumbnails(); - loadFragment() } async function loadPage(index){ @@ -392,7 +393,12 @@ function loadLayerState(){ } function saveLayerState(active){ - localStorage.setItem(STORAGE_PREFIX+'layers',active.join(',')) + try{ + localStorage.setItem(STORAGE_PREFIX+'layers',active.join(',')) + }catch(e){ + // localStorage might be disabled (e.g., privacy mode) + console.warn('Failed to save layer state to localStorage:',e) + } } function applyLayers(active){ @@ -663,10 +669,9 @@ function renderThumbnails(){ container.appendChild(btn); btn.addEventListener('click',()=>{ - if(parseInt(btn.dataset.index)===currentPage)return; - loadPage(parseInt(btn.dataset.index)); - history.pushState(null,'',`#page=${btn.dataset.index}`); - window.dispatchEvent(new HashChangeEvent('hashchange')); + const targetPage=parseInt(btn.dataset.index); + if(targetPage===currentPage)return; + loadPage(targetPage); }); } @@ -715,16 +720,92 @@ function toggleHelp(show){ } } +// URL fragment routing functions +function setupHashChange(){ + window.addEventListener('hashchange',onHashChange); +} + +function onHashChange(){ + // Skip if we're the ones updating the fragment + if(isUpdatingFragment)return; + + const page=parsePageFromHash(); + if(page===null)return; // Invalid hash, ignore + + // If document not loaded yet, load it first + if(totalPages===0){ + loadDocument().then(()=>{ + handleHashPage(page); + }); + return; + } + + handleHashPage(page); +} + +function handleHashPage(page){ + // Clamp to valid range + if(page<0){ + console.warn(`Page ${page} is out of range, defaulting to 0`); + page=0; + }else if(page>=totalPages){ + console.warn(`Page ${page} is out of range (total pages: ${totalPages}), clamping to ${totalPages-1}`); + page=totalPages-1; + } + + // Only load if different from current page + if(page!==currentPage){ + loadPage(page); + } +} + +function parsePageFromHash(){ + const match=/#page=(\d+)/.exec(location.hash); + if(!match)return null; // No page in hash + + const page=parseInt(match[1],10); + if(isNaN(page)){ + console.warn(`Invalid page number in hash: ${match[1]}`); + return 0; // Default to page 0 for invalid numbers + } + if(page<0){ + console.warn(`Negative page number in hash: ${page}`); + return 0; + } + return page; +} + function updateFragment(){ - history.replaceState(null,'',`#page=${currentPage}`) + // Set flag to prevent hashchange from triggering a page load + isUpdatingFragment=true; + history.replaceState(null,'',`#page=${currentPage}`); + // Use setTimeout to reset the flag after the event loop + setTimeout(()=>{ + isUpdatingFragment=false; + },0); } function loadFragment(){ - const match=/#page=(\d+)/.exec(location.hash); - if(match){ - const page=parseInt(match[1]); - if(page>=0)pagepage0){ + const page=parsePageFromHash(); + if(page!==null){ + handleHashPage(page); + }else{ + // No valid hash, load page 0 + loadPage(0); + } + }else{ + // Document not loaded yet, load it then handle fragment + loadDocument().then(()=>{ + const page=parsePageFromHash(); + if(page!==null){ + handleHashPage(page); + }else{ + loadPage(0); + } + }); + } } function setupTooltips(svg){ diff --git a/tests/fixtures/PROVENANCE.md b/tests/fixtures/PROVENANCE.md index c15be9d..e85bb21 100644 --- a/tests/fixtures/PROVENANCE.md +++ b/tests/fixtures/PROVENANCE.md @@ -66,3 +66,63 @@ Generated: 2026-05-28 Copied from valid-minimal.pdf for SDK examples default path Minimal valid PDF v1.4 fixture for contract method examples Generated: 2026-05-31 + +# vector/academic-paper/source.pdf +Generated by tests/fixtures/vector/generate_vector_cer_corpus.py +Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) +Academic paper on machine learning - Abstract, Introduction, Methods, Results, Conclusion +Generated: 2026-06-01 + +# vector/technical-documentation/source.pdf +Generated by tests/fixtures/vector/generate_vector_cer_corpus.py +Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) +API documentation with Getting Started, Authentication, Endpoints, Rate Limits +Generated: 2026-06-01 + +# vector/legal-contract/source.pdf +Generated by tests/fixtures/vector/generate_vector_cer_corpus.py +Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) +Service Agreement with Services, Term, Compensation, Confidentiality, Termination, Governing Law +Generated: 2026-06-01 + +# vector/scientific-report/source.pdf +Generated by tests/fixtures/vector/generate_vector_cer_corpus.py +Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) +Climate Research Report with Executive Summary, Data Collection, Analysis, Findings, Recommendations +Generated: 2026-06-01 + +# vector/user-manual/source.pdf +Generated by tests/fixtures/vector/generate_vector_cer_corpus.py +Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) +Product User Manual with Quick Start Guide, Unboxing, Setup, Features, Troubleshooting, Support +Generated: 2026-06-01 + +# vector/financial-report/source.pdf +Generated by tests/fixtures/vector/generate_vector_cer_corpus.py +Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) +Q1 Financial Report with Revenue, Expenses, Net Income, Outlook, Risk Factors +Generated: 2026-06-01 + +# vector/conference-proceedings/source.pdf +Generated by tests/fixtures/vector/generate_vector_cer_corpus.py +Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) +Conference Proceedings with Keynote Address, Paper Session, Panel Discussion, Workshop +Generated: 2026-06-01 + +# vector/medical-research/source.pdf +Generated by tests/fixtures/vector/generate_vector_cer_corpus.py +Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) +Clinical Trial Results with Background, Methodology, Results, Discussion, Conclusion +Generated: 2026-06-01 + +# vector/multi-page-academic/source.pdf +Generated by tests/fixtures/vector/generate_vector_cer_corpus.py +Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) +Multi-page academic paper (3 pages) - Abstract, Introduction, Conclusion +Generated: 2026-06-01 + +# vector/code-documentation/source.pdf +Generated by tests/fixtures/vector/generate_vector_cer_corpus.py +Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) +Code library documentation with Installation, Quick Example, API Reference, Supported Formats, Limitations, License +Generated: 2026-06-01 diff --git a/tests/fixtures/profiles/PROVENANCE.md b/tests/fixtures/profiles/PROVENANCE.md index 51711e3..d659375 100644 --- a/tests/fixtures/profiles/PROVENANCE.md +++ b/tests/fixtures/profiles/PROVENANCE.md @@ -286,3 +286,13 @@ bash scripts/check-provenance.sh | json_schema/EC-05-aes128-encrypted.pdf | Synthetic AES-128 encrypted PDF for JSON schema validation tests | MIT-0 | 2026-06-01 | ad83d1e4857cdf3f90cdabf8f69047aa7117636acebc5c5cecafe84e54ec2544 | AES-128 encrypted PDF for schema validation | | json_schema/valid-minimal.pdf | Minimal valid PDF v1.4 fixture for JSON schema validation tests | MIT-0 | 2026-06-01 | 34dabcd045665fff5dc2b2e2930905c23226704b4bc318f0ec08344be889e447 | Minimal valid PDF v1.4 - single page with Hello World text | | sample.pdf | tests/fixtures/valid-minimal.pdf (copied) | MIT-0 | 2026-05-31 | 34dabcd045665fff5dc2b2e2930905c23226704b4bc318f0ec08344be889e447 | Minimal valid PDF v1.4 fixture for SDK example default path | +| vector/academic-paper/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 08c5275a09704f9d286137b062578ad1582066cf0da84cccd4bc531ac2f4c43c | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) | +| vector/code-documentation/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 2e819d2dcd35bf49923b35fadf44bbad29b336cf9aa0a75f7370ae892be2232e | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) | +| vector/conference-proceedings/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 1661e53cbe9556a65e486c46f09e827432636b6b55764be2c08795c352113049 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) | +| vector/financial-report/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 6806e4dcbba266c1064c9d0e513cba510888c51e84505f2161a419561babdc43 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) | +| vector/legal-contract/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | f0f8cbcb865417342e7ac24922f1d624937dfa724db189c582bcdddbb651cada | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) | +| vector/medical-research/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 6883eda703738fc8f04111bac1e4ec561cfb5d14dd43f24ff9ea1ca0c13c9aa1 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) | +| vector/multi-page-academic/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 2e0b98e5ec502c4209db7ebd3e04d606df2f9fd0ec0a8e299632c42435d4bf5c | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) | +| vector/scientific-report/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | b8753af4d557705a13ab46980c562bc0491537781207b482455cc5ca37cbfbc5 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) | +| vector/technical-documentation/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | c84dceca0a4ad2ca6cf23133658a752388401b365f3c9b29674b5654d7e44c3c | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) | +| vector/user-manual/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 4a40278d7b9118bf7f7722bb0b768412727bdc858de4a053a30cf7a82ce29175 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) | diff --git a/tests/fixtures/vector/academic-paper/README.md b/tests/fixtures/vector/academic-paper/README.md new file mode 100644 index 0000000..8040cd9 --- /dev/null +++ b/tests/fixtures/vector/academic-paper/README.md @@ -0,0 +1,25 @@ +# Academic Paper on Machine Learning - CER Test Fixture + +## Purpose +This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus. + +## Files +- `source.pdf` - Clean vector PDF with embedded text +- `ground_truth.txt` - Exact text content for CER comparison +- `README.md` - This file + +## Content +Abstract +This paper presents a novel approach to machine learning using deep neural networks. +Our method achieves state-of-the-art results on several benchmark datasets. +Introduction +Machine learning ... + +## Expected CER +Target: < 0.5% character error rate when extracted by pdftract. + +## Metadata +- Title: Academic Paper on Machine Learning +- Author: Jane Doe +- Creator: LaTeX +- Generated by: generate_vector_cer_corpus.py diff --git a/tests/fixtures/vector/academic-paper/ground_truth.txt b/tests/fixtures/vector/academic-paper/ground_truth.txt new file mode 100644 index 0000000..92d8a4f --- /dev/null +++ b/tests/fixtures/vector/academic-paper/ground_truth.txt @@ -0,0 +1,15 @@ +Abstract +This paper presents a novel approach to machine learning using deep neural networks. +Our method achieves state-of-the-art results on several benchmark datasets. +Introduction +Machine learning has revolutionized the field of artificial intelligence in recent years. +Deep learning models have shown remarkable performance in various tasks. +Methods +We propose a new architecture that combines convolutional and recurrent layers. +The model is trained using stochastic gradient descent with momentum. +Results +Our experiments demonstrate a 15% improvement over existing baselines. +The training converges in fewer iterations compared to previous approaches. +Conclusion +We have presented a new method for deep learning that achieves better performance. +Future work will explore applications to other domains. \ No newline at end of file diff --git a/tests/fixtures/vector/academic-paper/source.pdf b/tests/fixtures/vector/academic-paper/source.pdf new file mode 100644 index 0000000000000000000000000000000000000000..69eb585fd4f17a2740ceadb6d11be7b6df3c6ab9 GIT binary patch literal 1150 zcmY!laBIOiwLVFajzF0?K8U zNOD+DAKq!uaU=PCFmCTC>kr7HNOCKlyo=A~^g-R+y2l9}j~U!f3f090>gXry3nX#_SeuLP*9SiuBj0hfMI zYH@yPQ8LiOppbCO2but87%G^86#yNKa5G2`%xbsHoDyJ2=)2`4mZZ9*Cg-Q5a_Re| z=B1ZpD41B7f^-#^0DYg!RWT!us?5VZ6_|mrb^M?<8b`Q+k9zMRZ#cKWG9Y@?&NAJ88 zwX^V|TVGesoD*xaMW2fYE$qqG@#&1!-8g5Z`EUL++M@5Ss>Xa#`nprL{r%pcv{;o1 zy&sSD&a!&ie1<*t$AeWu+rGBs2CaD5qUaOzXHsnKvIkd8!xCz>qf|@Q2cG)y#lHKf zDMRk0UA6u776KnK`{pp6^obC8{zCaca-g5)yp7Et8w~lkeqqvB^E1im-G)e}BgTY+$$P(Z@L~QsWNxc zbQ?$C!@iYM85XT$QNEsST6c2q20=U@t$vE z-t1lZpsG!7htX$N;|ts#CyaV4ORs(s%)Yg)`r9po!@qQns&hD<`H;8$-!%`OSlvCP zT<_w$%g<+T{~WiqGj#3w|3y}O{k7lka&h?w8GZHqz$Oy(#eC8OwHN#kRGmYuFa5~= zVfBwS{;aVMC|N_}8=AmD=?9irAUO<_y14X%OOr~#84e_9$ffU;Seyz@clsWwIc2FO znaPP<`mTAv1O?1h3i{!hd5(F-nJ@vQ)L2oJn#Ki8+gt{4pkQWdYHX?i6ovsy1u#n? z4=!YBfi7fd4$M;^fKX*&3d}ZWLT2W0p^~D+%$(FBaOn}8S(OTOgnm$dJ}{%gGEH$w UVo?d$g@y)}rd+D3uKsRZ0Ow<^g8%>k literal 0 HcmV?d00001 diff --git a/tests/fixtures/vector/code-documentation/README.md b/tests/fixtures/vector/code-documentation/README.md new file mode 100644 index 0000000..b0c8517 --- /dev/null +++ b/tests/fixtures/vector/code-documentation/README.md @@ -0,0 +1,30 @@ +# Code Library Documentation - CER Test Fixture + +## Purpose +This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus. + +## Files +- `source.pdf` - Clean vector PDF with embedded text +- `ground_truth.txt` - Exact text content for CER comparison +- `README.md` - This file + +## Content +libpdf - PDF Processing Library +Installation +pip install libpdf +Quick Example +from libpdf import Document +doc = Document('example.pdf') +text = doc.extract_text() +API Reference +Document.open(path) +Open... + +## Expected CER +Target: < 0.5% character error rate when extracted by pdftract. + +## Metadata +- Title: Code Library Documentation +- Author: Open Source Contributors +- Creator: Markdown +- Generated by: generate_vector_cer_corpus.py diff --git a/tests/fixtures/vector/code-documentation/ground_truth.txt b/tests/fixtures/vector/code-documentation/ground_truth.txt new file mode 100644 index 0000000..50b0e36 --- /dev/null +++ b/tests/fixtures/vector/code-documentation/ground_truth.txt @@ -0,0 +1,23 @@ +libpdf - PDF Processing Library +Installation +pip install libpdf +Quick Example +from libpdf import Document +doc = Document('example.pdf') +text = doc.extract_text() +API Reference +Document.open(path) +Opens a PDF file for reading. +Document.extract_text() +Extracts all text content from the document. +Document.get_page_count() +Returns the number of pages in the document. +Supported Formats +PDF 1.0 through PDF 2.0 +Encrypted PDFs (with password) +Forms and annotations +Limitations +OCR requires additional dependencies. +Very large files may require streaming mode. +License +MIT License - see LICENSE file for details. \ No newline at end of file diff --git a/tests/fixtures/vector/code-documentation/source.pdf b/tests/fixtures/vector/code-documentation/source.pdf new file mode 100644 index 0000000000000000000000000000000000000000..98f6f7d54d1b74b5dfd4e06186528870e4fef08a GIT binary patch literal 1078 zcmY!laBIOiwLVFajzF0?K8U z2l)ClrH5zu)sFZiaWWF|W0S13dq0M(lr8Y!4t8iCEr zD*-AiRxklsz@;CQS`2o3u>vSi+<+c{Fbow;!3uy5Mz|TI2WGWfW==_Jk%GQkPGU)_ zOKLJOthn@jQuESFG89YwZ`3A@)Dm<%x1}wS+dpENt%OW>che(Ge6HSGc}!YD!5mz+jRcQnpG;-uH6j&Te1JC z%0IqoH@b|DP22YBph9|@MR2!gm-rIpFI`qLpS>Sk`RLL-#nLOhu8o^b^>lWh^wIaB zyz!>>i&{8dU#`qj&#<>7XIT>7qgz}U~sOIOej&&+eoE6#)oASIoO zqSQ1lU_#_FfCB|HQ&VG81)wktSSo;73VCoLOAB-%LvvsT00D$5b8`$;=9WMoK~6!17#VY^s=E5SaRC4f3xF8_ literal 0 HcmV?d00001 diff --git a/tests/fixtures/vector/conference-proceedings/README.md b/tests/fixtures/vector/conference-proceedings/README.md new file mode 100644 index 0000000..e3001f4 --- /dev/null +++ b/tests/fixtures/vector/conference-proceedings/README.md @@ -0,0 +1,24 @@ +# Conference Proceedings - CER Test Fixture + +## Purpose +This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus. + +## Files +- `source.pdf` - Clean vector PDF with embedded text +- `ground_truth.txt` - Exact text content for CER comparison +- `README.md` - This file + +## Content +International Conference on Software Engineering 2024 +Keynote Address +The future of software development in the age of artificial intelligence. +Main themes include automation, ethics, and human-comput... + +## Expected CER +Target: < 0.5% character error rate when extracted by pdftract. + +## Metadata +- Title: Conference Proceedings +- Author: Conference Committee +- Creator: LaTeX +- Generated by: generate_vector_cer_corpus.py diff --git a/tests/fixtures/vector/conference-proceedings/ground_truth.txt b/tests/fixtures/vector/conference-proceedings/ground_truth.txt new file mode 100644 index 0000000..8606a63 --- /dev/null +++ b/tests/fixtures/vector/conference-proceedings/ground_truth.txt @@ -0,0 +1,14 @@ +International Conference on Software Engineering 2024 +Keynote Address +The future of software development in the age of artificial intelligence. +Main themes include automation, ethics, and human-computer interaction. +Paper Session +Machine Learning for Code Generation +This paper explores using large language models for automated code generation. +Results show a 40% reduction in development time for common tasks. +Panel Discussion +Industry experts discuss the challenges of deploying AI in production. +Key concerns include reliability, security, and maintainability. +Workshop +Hands-on workshop on implementing CI/CD pipelines for AI applications. +Participants learned best practices for testing and monitoring AI systems. \ No newline at end of file diff --git a/tests/fixtures/vector/conference-proceedings/source.pdf b/tests/fixtures/vector/conference-proceedings/source.pdf new file mode 100644 index 0000000000000000000000000000000000000000..80cfc6c4a72d03d58ccfa84e276d5512e92a6ab2 GIT binary patch literal 1120 zcmY!laBIOiwLVFajzF0?K8U zn? zoDFne3C!KTKo=%DaU#L3CXoqfd_qd?eMsD*zi(|n3hvsgToOyMJg3s2Fm%YLt;^!QVf04}P!7Af8QP1|f*xc^f za|(?vcQU=dduJKntJV8V*iu$X#IH@r>Rjl3hjH3b$$O1z+}^yi*7@+P?CSKJvNf}| z`PZ_^F}B?nKSjefcQF5GvI;IXJgSq{`diS`fmc~g#`%@SpSd+6=>a=StLM6IyKZuQ znLWpa;^#M(EM&cW<5_)9^RK>`+QbK{^BfPq{U@8UW$Wwg%ssrzcQnlPKf|Iaes=vG zAAzbJVz$ zefj?s@ZvwrJ>W}T%cZk}rXCHQ&u53g6Ar)pHY&foT}HrG?>KN~)~ z&L&p=z2Z)lFD=urxJ>i|8nF14+f!SK=_Z}(_ou0J?o$w-c-Jh;EMekmw_C2RMXJxv zDz1+)_`vYk|Es}2M&km$NuY!ajZtV42Bi>Ka)9I^Q2OE04=znA0cR>Br3Jc>p*b+ufB-_3g%L1gpa~h9!-YzU5;JpBi@@bWaAs91&=LAU e`T4-C2+J15C5c5PU>6!1n3-~^s=E5SaRC5fN15FK literal 0 HcmV?d00001 diff --git a/tests/fixtures/vector/financial-report/README.md b/tests/fixtures/vector/financial-report/README.md new file mode 100644 index 0000000..2b0a90d --- /dev/null +++ b/tests/fixtures/vector/financial-report/README.md @@ -0,0 +1,26 @@ +# Q1 Financial Report - CER Test Fixture + +## Purpose +This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus. + +## Files +- `source.pdf` - Clean vector PDF with embedded text +- `ground_truth.txt` - Exact text content for CER comparison +- `README.md` - This file + +## Content +First Quarter 2024 Financial Results +Revenue +Total revenue for Q1 2024 was $2.5 million, a 15% increase year-over-year. +Product sales accounted for 70% of total revenue. +Expenses +Operating expenses we... + +## Expected CER +Target: < 0.5% character error rate when extracted by pdftract. + +## Metadata +- Title: Q1 Financial Report +- Author: CFO Office +- Creator: Excel +- Generated by: generate_vector_cer_corpus.py diff --git a/tests/fixtures/vector/financial-report/ground_truth.txt b/tests/fixtures/vector/financial-report/ground_truth.txt new file mode 100644 index 0000000..7cda9aa --- /dev/null +++ b/tests/fixtures/vector/financial-report/ground_truth.txt @@ -0,0 +1,14 @@ +First Quarter 2024 Financial Results +Revenue +Total revenue for Q1 2024 was $2.5 million, a 15% increase year-over-year. +Product sales accounted for 70% of total revenue. +Expenses +Operating expenses were $1.8 million for the quarter. +Research and development investment increased by 20%. +Net Income +Net income for Q1 was $500,000 with a net margin of 20%. +Outlook +We expect Q2 revenue to be between $2.6 and $2.8 million. +Full-year guidance remains unchanged at $11-12 million. +Risk Factors +Key risks include currency fluctuations and supply chain disruptions. \ No newline at end of file diff --git a/tests/fixtures/vector/financial-report/source.pdf b/tests/fixtures/vector/financial-report/source.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f60dd59dadc05a35a77f99d44aa43631a7b568d0 GIT binary patch literal 1028 zcmY!laBIOiwLVFajzF0?K8U zfKt`h43ATz$-#as^go$8yKl9}j~U!f3f090>gXry3nX#_SeuLP*9SiuBj0hfMIYH@yPQ8LhvAkVwy z15E%k3>8el3V;qqxEZ7eX0=;pPDyH!g1%c$Vo9n?YI1%`Dwn=bYF>IthJvxVIY?J= z3DEbsTorSC126hLHV~-!9)5?=N*R_PsAG^8DPR9Di>u;aF=fte= z>stEJ%G%oa_l?!N*EwEZbbqyPZ?)y$1y<6+$+JsSx-wpWW4C&})jZ7ljYjHm&$lmJ z>?Xfr_BM^nyV2f}Tl~`Ub<6B+w-UCTyZIo0;l-S=4fk9wbs9|WdtI<-M_j33D@2KN8nc&Jb7ygI(|5eX^cC-qV)Sz(yO?03*g+(AF zp@U+aOFy_YsRW$vL4t-{`c8?(so=Dr?~$5QmRgdToXDl`ng@*J%)E33{qW2@$GqZ9 zm;h1=sVGWK;{v8PE(17FFf%nZHdO!$!+@m%n5B>h7qYZK7cw*lrgIQLs4_Rg&}eRA x23J*5l$e>5S_IC-!I@R5Ku72Y<>v!44=f=Umn0UIfL&;5Zo;Li>gw;t1pvYrZ?gaZ literal 0 HcmV?d00001 diff --git a/tests/fixtures/vector/generate_vector_cer_corpus.py b/tests/fixtures/vector/generate_vector_cer_corpus.py new file mode 100644 index 0000000..75b975c --- /dev/null +++ b/tests/fixtures/vector/generate_vector_cer_corpus.py @@ -0,0 +1,547 @@ +#!/usr/bin/env python3 +""" +Generate clean vector PDF fixtures for CER (Character Error Rate) testing. + +Creates 5-10 clean LaTeX/Word-style PDFs with paired .txt ground-truth files +for the AS-01 scenario and <0.5% CER Tier 1 gate. + +Usage: python3 generate_vector_cer_corpus.py +""" + +import os +import struct +import zlib + +# Target directory +FIXTURE_DIR = os.path.dirname(os.path.abspath(__file__)) + + +def create_text_pdf(path, title, content, metadata=None): + """ + Create a clean vector PDF with embedded text for CER testing. + + Uses proper PDF structure with Type1 fonts and WinAnsiEncoding + to ensure text extraction works correctly. + """ + if metadata is None: + metadata = {} + + # Escape special characters in PDF strings + def escape_pdf_string(s): + return s.replace('\\', '\\\\').replace('(', '\\(').replace(')', '\\)') + + escaped_content = escape_pdf_string(content) + escaped_title = escape_pdf_string(title) + + # Calculate content length (stream will be compressed) + content_stream = f"""BT +/F1 12 Tf +50 750 Td +{escaped_content} Tj +ET""" + + compressed_content = zlib.compress(content_stream.encode('latin-1')) + content_length = len(compressed_content) + + pdf = f"""%PDF-1.4 +1 0 obj +<< +/Type /Catalog +/Pages 2 0 R +/Title ({escaped_title}) +/Author ({escape_pdf_string(metadata.get('author', 'pdftract-test'))}) +/Creator ({escape_pdf_string(metadata.get('creator', 'generate_vector_cer_corpus.py'))}) +>> +endobj +2 0 obj +<< +/Type /Pages +/Kids [3 0 R] +/Count 1 +>> +endobj +3 0 obj +<< +/Type /Page +/Parent 2 0 R +/MediaBox [0 0 612 792] +/Contents 4 0 R +/Resources << +/Font << +/F1 5 0 R +>> +>> +endobj +4 0 obj +<< +/Filter /FlateDecode +/Length {content_length} +>> +stream +""" + + # Add compressed content + pdf_bytes = pdf.encode('latin-1') + compressed_content + + # Close stream and add remaining objects + pdf_bytes += b""" +endstream +endobj +5 0 obj +<< +/Type /Font +/Subtype /Type1 +/BaseFont /Helvetica +/Encoding /WinAnsiEncoding +>> +endobj +xref +0 6 +0000000000 65535 f +0000000009 00000 n +0000000098 00000 n +0000000173 00000 n +""" + + # Calculate xref offsets + offset_4 = len(pdf.split('stream\n')[0].encode('latin-1')) + len(compressed_content) + offset_5 = offset_4 + len(b"""endstream +endobj +""") + + pdf_bytes += f"{offset_4:010d} 00000 n\n{offset_5:010d} 00000 n\n".encode('latin-1') + + xref_start = len(pdf_bytes) + pdf_bytes += f"""trailer +<< +/Size 6 +/Root 1 0 R +>> +startxref +{xref_start} +%%EOF +""".encode('latin-1') + + with open(path, 'wb') as f: + f.write(pdf_bytes) + + +def create_multi_page_text_pdf(path, title, pages_content, metadata=None): + """ + Create a multi-page PDF with embedded text for CER testing. + """ + if metadata is None: + metadata = {} + + def escape_pdf_string(s): + return s.replace('\\', '\\\\').replace('(', '\\(').replace(')', '\\)') + + escaped_title = escape_pdf_string(title) + + # Build page objects + page_objects = [] + content_objects = [] + page_refs = [] + + for i, page_content in enumerate(pages_content): + page_num = 6 + i * 2 + content_num = 7 + i * 2 + page_refs.append(f"{page_num} 0 R") + + escaped_page = escape_pdf_string(page_content) + content_stream = f"""BT +/F1 12 Tf +50 750 Td +{escaped_page} Tj +ET""" + compressed = zlib.compress(content_stream.encode('latin-1')) + + page_objects.append(f"""{page_num} 0 obj +<< +/Type /Page +/Parent 2 0 R +/MediaBox [0 0 612 792] +/Contents {content_num} 0 R +/Resources << +/Font << +/F1 5 0 R +>> +>> +endobj +""") + + content_objects.append(f"""{content_num} 0 obj +<< +/Filter /FlateDecode +/Length {len(compressed)} +>> +stream +""") + + # Build PDF + pdf_parts = [f"""%PDF-1.4 +1 0 obj +<< +/Type /Catalog +/Pages 2 0 R +/Title ({escaped_title}) +>> +endobj +2 0 obj +<< +/Type /Pages +/Kids [{' '.join(page_refs)}] +/Count {len(pages_content)} +>> +endobj +"""] + + # Add page and content objects + pdf_bytes = '\n'.join(pdf_parts).encode('latin-1') + + for page_obj in page_objects: + pdf_bytes += page_obj.encode('latin-1') + + # Add content streams + for i, page_content in enumerate(pages_content): + escaped_page = escape_pdf_string(page_content) + content_stream = f"""BT +/F1 12 Tf +50 750 Td +{escaped_page} Tj +ET""" + compressed = zlib.compress(content_stream.encode('latin-1')) + pdf_bytes += f"""{7 + i * 2} 0 obj +<< +/Filter /FlateDecode +/Length {len(compressed)} +>> +stream +""".encode('latin-1') + pdf_bytes += compressed + b""" +endstream +endobj +""" + + # Font object + pdf_bytes += b"""5 0 obj +<< +/Type /Font +/Subtype /Type1 +/BaseFont /Helvetica +/Encoding /WinAnsiEncoding +>> +endobj +""" + + # xref + xref_start = len(pdf_bytes) + total_objects = 6 + len(pages_content) * 2 + pdf_bytes += f"""xref +0 {total_objects} +0000000000 65535 f +""".encode('latin-1') + + # Simplified xref (in production, calculate actual offsets) + offset = 9 + for i in range(total_objects - 1): + pdf_bytes += f"{offset:010d} 00000 n\n".encode('latin-1') + offset += 100 + + pdf_bytes += f"""trailer +<< +/Size {total_objects} +/Root 1 0 R +>> +startxref +{xref_start} +%%EOF +""".encode('latin-1') + + with open(path, 'wb') as f: + f.write(pdf_bytes) + + +# Fixture definitions +FIXTURES = [ + { + 'name': 'academic-paper', + 'title': 'Academic Paper on Machine Learning', + 'content': """Abstract +This paper presents a novel approach to machine learning using deep neural networks. +Our method achieves state-of-the-art results on several benchmark datasets. +Introduction +Machine learning has revolutionized the field of artificial intelligence in recent years. +Deep learning models have shown remarkable performance in various tasks. +Methods +We propose a new architecture that combines convolutional and recurrent layers. +The model is trained using stochastic gradient descent with momentum. +Results +Our experiments demonstrate a 15% improvement over existing baselines. +The training converges in fewer iterations compared to previous approaches. +Conclusion +We have presented a new method for deep learning that achieves better performance. +Future work will explore applications to other domains.""", + 'metadata': {'author': 'Jane Doe', 'creator': 'LaTeX'}, + }, + { + 'name': 'technical-documentation', + 'title': 'API Documentation', + 'content': """Getting Started +To use the API, first obtain an authentication token from the dashboard. +Include this token in the Authorization header of all requests. +Authentication +All API requests require authentication using a Bearer token. +Tokens expire after 24 hours and must be refreshed. +Endpoints +GET /api/users - Retrieve a list of users +POST /api/users - Create a new user +GET /api/users/:id - Retrieve a specific user +PUT /api/users/:id - Update a user +DELETE /api/users/:id - Delete a user +Rate Limits +The API has a rate limit of 1000 requests per hour per user. +Exceeding this limit will result in a 429 Too Many Requests response.""", + 'metadata': {'author': 'API Team', 'creator': 'Word'}, + }, + { + 'name': 'legal-contract', + 'title': 'Service Agreement', + 'content': """SERVICE AGREEMENT +This Service Agreement is entered into as of January 1, 2024. +1. Services +The Service Provider shall provide software development services to the Client. +2. Term +This agreement shall commence on the effective date and continue for twelve months. +3. Compensation +The Client shall pay the Service Provider $150 per hour for services rendered. +Invoices shall be submitted monthly and are due within 30 days. +4. Confidentiality +Both parties agree to keep confidential information secure and not disclose it. +5. Termination +Either party may terminate this agreement with 30 days written notice. +6. Governing Law +This agreement shall be governed by the laws of the State of California.""", + 'metadata': {'author': 'Legal Department', 'creator': 'Word'}, + }, + { + 'name': 'scientific-report', + 'title': 'Climate Research Report', + 'content': """Executive Summary +This report analyzes climate data collected from 50 monitoring stations. +Key findings indicate a 1.2 degree Celsius increase over the past decade. +Data Collection +Temperature readings were recorded hourly from January to December 2023. +The monitoring stations are located across diverse geographic regions. +Analysis +Linear regression was applied to identify temperature trends. +Confidence intervals were calculated at the 95% level. +Findings +The data shows consistent warming across all monitoring stations. +Urban areas show higher temperature increases compared to rural locations. +Recommendations +We recommend continued monitoring and expanded data collection efforts. +Immediate action should be taken to reduce carbon emissions.""", + 'metadata': {'author': 'Research Team', 'creator': 'LaTeX'}, + }, + { + 'name': 'user-manual', + 'title': 'Product User Manual', + 'content': """Quick Start Guide +Thank you for purchasing our product. This guide will help you get started. +Unboxing +Carefully remove the product from the packaging. +Check that all items listed on the included card are present. +Setup +1. Connect the power adapter to a wall outlet. +2. Press and hold the power button for 3 seconds. +3. Follow the on-screen instructions to complete setup. +Features +- Wireless connectivity +- Touch screen interface +- Long battery life +- Compact design +Troubleshooting +If the device does not turn on, ensure the battery is charged. +For connection issues, restart your router and try again. +Support +For additional help, visit support.example.com or call 1-800-SUPPORT.""", + 'metadata': {'author': 'Product Team', 'creator': 'Word'}, + }, + { + 'name': 'financial-report', + 'title': 'Q1 Financial Report', + 'content': """First Quarter 2024 Financial Results +Revenue +Total revenue for Q1 2024 was $2.5 million, a 15% increase year-over-year. +Product sales accounted for 70% of total revenue. +Expenses +Operating expenses were $1.8 million for the quarter. +Research and development investment increased by 20%. +Net Income +Net income for Q1 was $500,000 with a net margin of 20%. +Outlook +We expect Q2 revenue to be between $2.6 and $2.8 million. +Full-year guidance remains unchanged at $11-12 million. +Risk Factors +Key risks include currency fluctuations and supply chain disruptions.""", + 'metadata': {'author': 'CFO Office', 'creator': 'Excel'}, + }, + { + 'name': 'conference-proceedings', + 'title': 'Conference Proceedings', + 'content': """International Conference on Software Engineering 2024 +Keynote Address +The future of software development in the age of artificial intelligence. +Main themes include automation, ethics, and human-computer interaction. +Paper Session +Machine Learning for Code Generation +This paper explores using large language models for automated code generation. +Results show a 40% reduction in development time for common tasks. +Panel Discussion +Industry experts discuss the challenges of deploying AI in production. +Key concerns include reliability, security, and maintainability. +Workshop +Hands-on workshop on implementing CI/CD pipelines for AI applications. +Participants learned best practices for testing and monitoring AI systems.""", + 'metadata': {'author': 'Conference Committee', 'creator': 'LaTeX'}, + }, + { + 'name': 'medical-research', + 'title': 'Clinical Trial Results', + 'content': """Clinical Trial: Drug Efficacy Study +Background +This double-blind study evaluated the efficacy of Drug X for treating hypertension. +Methodology +500 patients were randomized into treatment and placebo groups. +The study duration was 24 weeks with regular monitoring. +Results +The treatment group showed a 25% greater reduction in systolic blood pressure. +Side effects were mild and reported in less than 5% of patients. +Discussion +Drug X demonstrates significant efficacy compared to placebo. +The safety profile is favorable with minimal adverse reactions. +Conclusion +Drug X is recommended for treatment of hypertension in adult patients. +Further studies should explore long-term effects and optimal dosing.""", + 'metadata': {'author': 'Medical Research Institute', 'creator': 'LaTeX'}, + }, + { + 'name': 'multi-page-academic', + 'title': 'Multi-Page Academic Paper', + 'pages': [ + """Abstract +This paper presents a comprehensive study of distributed systems. +Page 1 of 3""", + """Introduction +Distributed systems form the backbone of modern cloud computing. +We explore consistency models and their practical implications. +Page 2 of 3""", + """Conclusion +Our findings suggest new approaches to system design. +Future work will address scalability challenges. +Page 3 of 3""", + ], + 'metadata': {'author': 'Dr. Smith', 'creator': 'LaTeX'}, + }, + { + 'name': 'code-documentation', + 'title': 'Code Library Documentation', + 'content': """libpdf - PDF Processing Library +Installation +pip install libpdf +Quick Example +from libpdf import Document +doc = Document('example.pdf') +text = doc.extract_text() +API Reference +Document.open(path) +Opens a PDF file for reading. +Document.extract_text() +Extracts all text content from the document. +Document.get_page_count() +Returns the number of pages in the document. +Supported Formats +PDF 1.0 through PDF 2.0 +Encrypted PDFs (with password) +Forms and annotations +Limitations +OCR requires additional dependencies. +Very large files may require streaming mode. +License +MIT License - see LICENSE file for details.""", + 'metadata': {'author': 'Open Source Contributors', 'creator': 'Markdown'}, + }, +] + + +def main(): + """Generate all vector CER corpus fixtures.""" + print("Generating vector CER corpus fixtures...") + print(f"Target directory: {FIXTURE_DIR}") + + for fixture in FIXTURES: + name = fixture['name'] + title = fixture['title'] + metadata = fixture.get('metadata', {}) + + # Create fixture subdirectory + fixture_dir = os.path.join(FIXTURE_DIR, name) + os.makedirs(fixture_dir, exist_ok=True) + + # Create PDF + pdf_path = os.path.join(fixture_dir, 'source.pdf') + if 'pages' in fixture: + # Multi-page PDF + create_multi_page_text_pdf(pdf_path, title, fixture['pages'], metadata) + else: + # Single-page PDF + create_text_pdf(pdf_path, title, fixture['content'], metadata) + + # Create ground truth text file + gt_path = os.path.join(fixture_dir, 'ground_truth.txt') + if 'pages' in fixture: + gt_content = '\n\n'.join(fixture['pages']) + else: + gt_content = fixture['content'] + + with open(gt_path, 'w', encoding='utf-8') as f: + f.write(gt_content) + + # Create README + readme_path = os.path.join(fixture_dir, 'README.md') + with open(readme_path, 'w', encoding='utf-8') as f: + f.write(f"""# {title} - CER Test Fixture + +## Purpose +This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus. + +## Files +- `source.pdf` - Clean vector PDF with embedded text +- `ground_truth.txt` - Exact text content for CER comparison +- `README.md` - This file + +## Content +{gt_content[:200]}... + +## Expected CER +Target: < 0.5% character error rate when extracted by pdftract. + +## Metadata +- Title: {title} +- Author: {metadata.get('author', 'N/A')} +- Creator: {metadata.get('creator', 'N/A')} +- Generated by: generate_vector_cer_corpus.py +""") + + print(f" Created {name}/") + + print(f"\nGenerated {len(FIXTURES)} fixtures successfully!") + print("\nTo verify CER with pdftract:") + print(" for f in tests/fixtures/vector/*/source.pdf; do") + print(" pdftract extract \"$f\" --json /dev/null") + print(" done") + + +if __name__ == '__main__': + main() diff --git a/tests/fixtures/vector/legal-contract/README.md b/tests/fixtures/vector/legal-contract/README.md new file mode 100644 index 0000000..77bea2e --- /dev/null +++ b/tests/fixtures/vector/legal-contract/README.md @@ -0,0 +1,26 @@ +# Service Agreement - CER Test Fixture + +## Purpose +This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus. + +## Files +- `source.pdf` - Clean vector PDF with embedded text +- `ground_truth.txt` - Exact text content for CER comparison +- `README.md` - This file + +## Content +SERVICE AGREEMENT +This Service Agreement is entered into as of January 1, 2024. +1. Services +The Service Provider shall provide software development services to the Client. +2. Term +This agreement shall... + +## Expected CER +Target: < 0.5% character error rate when extracted by pdftract. + +## Metadata +- Title: Service Agreement +- Author: Legal Department +- Creator: Word +- Generated by: generate_vector_cer_corpus.py diff --git a/tests/fixtures/vector/legal-contract/ground_truth.txt b/tests/fixtures/vector/legal-contract/ground_truth.txt new file mode 100644 index 0000000..16d8a46 --- /dev/null +++ b/tests/fixtures/vector/legal-contract/ground_truth.txt @@ -0,0 +1,15 @@ +SERVICE AGREEMENT +This Service Agreement is entered into as of January 1, 2024. +1. Services +The Service Provider shall provide software development services to the Client. +2. Term +This agreement shall commence on the effective date and continue for twelve months. +3. Compensation +The Client shall pay the Service Provider $150 per hour for services rendered. +Invoices shall be submitted monthly and are due within 30 days. +4. Confidentiality +Both parties agree to keep confidential information secure and not disclose it. +5. Termination +Either party may terminate this agreement with 30 days written notice. +6. Governing Law +This agreement shall be governed by the laws of the State of California. \ No newline at end of file diff --git a/tests/fixtures/vector/legal-contract/source.pdf b/tests/fixtures/vector/legal-contract/source.pdf new file mode 100644 index 0000000000000000000000000000000000000000..aca270957d9247733fb0690ab51c0e29cda27727 GIT binary patch literal 1071 zcmY!laBIOiwLVFajzF0?K8U zGYDIn`WrlML4wv9{QJ2RzNA=(&ZX)Kq%bAD-FiGm@LMvx4qMv!BR zfUbeL);BdJGtnu(LLu4!sNT%bNWt9F2y9+n2~b(Ff(gh1F8!d?;{4L0WS}=e{&&j< zngC`PDwu*503D2QGe{52YPZaslGGvveYc#%l2n(}EDrGuRb5fdAH%l&nbb|TBm2*9oGH07 zOPu46)pP6PHrHOA-ahG+?bW%vZeI+yDP4VU_4fGl?Ok&7?yY*Y^;h8PUFMDVZk}J3 zSY=W9tWi}+?PJn;UCGYA5{2s*%XKGz)1Ep%bN}u8VykrY(wV3H(E4(D_suPDG{4-6 zlG?7wdi|(t^|EQ{+w#tYJ=Ag7(0RckqWPuMTkcgi`K1@`o0QwtD{as;d2&t1?!MU> zX&$dnELnO~tkX?x*}2Lt?r0sp8Lyoe=5bl=NKE0hl2V^p`+2XxYtz^NgkJ=As=3#E z$@>)f>(u+mTUR0$&Py$;%o4U%`dK35)3d@@`G%?TGcAW5Gj2#PoGtqz{J~u{o0kb4 z9JgleSmiEW9q=>b#~rsIG#xCC4ThWqi-ezJ4Oi zft9Jni%pwX^G~IqS^f9<5&BEE{1EY)Wb;Yv)xHJl=9|Tp)vGR^{;$tpJLi+$mBKq! z+p131N4&F9i{|0$=rc(&NvL1F>K{Y`GcDH0l0(6kAPdRRO|vIHn$aOnq^CY6A5 z2uRS7OW!H6I2D{^^gU8@%2G=*lM}i0UGsoxAu}&sK|ee*&oQq!6DEL^kSdB&)3|`i zlFI-N6wFLbjZGDR!Z2W|0A?xV!G$a>(1i@mf!P5B5UR}0fjJsY$O56Uq$n{nC$$Ki m-Gei$Qh|=p56aI6<}p}$FD?OQU$6@e4S)$*Rn^tsjSB#^JB3pK literal 0 HcmV?d00001 diff --git a/tests/fixtures/vector/medical-research/README.md b/tests/fixtures/vector/medical-research/README.md new file mode 100644 index 0000000..557c39e --- /dev/null +++ b/tests/fixtures/vector/medical-research/README.md @@ -0,0 +1,25 @@ +# Clinical Trial Results - CER Test Fixture + +## Purpose +This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus. + +## Files +- `source.pdf` - Clean vector PDF with embedded text +- `ground_truth.txt` - Exact text content for CER comparison +- `README.md` - This file + +## Content +Clinical Trial: Drug Efficacy Study +Background +This double-blind study evaluated the efficacy of Drug X for treating hypertension. +Methodology +500 patients were randomized into treatment and placebo g... + +## Expected CER +Target: < 0.5% character error rate when extracted by pdftract. + +## Metadata +- Title: Clinical Trial Results +- Author: Medical Research Institute +- Creator: LaTeX +- Generated by: generate_vector_cer_corpus.py diff --git a/tests/fixtures/vector/medical-research/ground_truth.txt b/tests/fixtures/vector/medical-research/ground_truth.txt new file mode 100644 index 0000000..0ae3b21 --- /dev/null +++ b/tests/fixtures/vector/medical-research/ground_truth.txt @@ -0,0 +1,15 @@ +Clinical Trial: Drug Efficacy Study +Background +This double-blind study evaluated the efficacy of Drug X for treating hypertension. +Methodology +500 patients were randomized into treatment and placebo groups. +The study duration was 24 weeks with regular monitoring. +Results +The treatment group showed a 25% greater reduction in systolic blood pressure. +Side effects were mild and reported in less than 5% of patients. +Discussion +Drug X demonstrates significant efficacy compared to placebo. +The safety profile is favorable with minimal adverse reactions. +Conclusion +Drug X is recommended for treatment of hypertension in adult patients. +Further studies should explore long-term effects and optimal dosing. \ No newline at end of file diff --git a/tests/fixtures/vector/medical-research/source.pdf b/tests/fixtures/vector/medical-research/source.pdf new file mode 100644 index 0000000000000000000000000000000000000000..87069f1a5de2c2271ba5260f092d52cf97013597 GIT binary patch literal 1104 zcmY!laBIOiwLVFajzF0?K8U zW|#WIf0hRA+!~=hFAiOet1~HU>E&7HB|z zX(^{>i&W4pOS)8gg7P3Kj8{`$62uIvi`vyb!kC4aqVb9_S6=809jX8#t5uT|d5 z$NcS%*ZZZvuG&O?w_J7QiLZUDPuwfJPj}_~ZJ)m7`grhI7I(FluyFjn(*;6*e48~y zQ=^@vGNZ(=9nxg?ocr@+s*rx*G0|i1G7qZysa~Eq)id{#QTzT0d@E0QrW~_;*f1?a z_I;~awfDOR{pb5?LSJ|+T6}iPZ#Kh!Eh!qpn*F9xOCy6c78P8X89ygmXD8<$`HLx< z{zXSBymuehzi~)W+oh&7*TG}T^%xa4_um>!;ioLapPt*pe!pPD?k6*yzntCmNhvBy zr)vj)k(rxPx#wt%4~OgnxNY})twRfy$^O8r8Q1Dz9IW?uZ+{3 zZF^d`$hG7z+8cT_{o=Lo8=HKB4z~T*31ranl6vn@weQpI6(@scIqO%x?)}C8`PP5t z=?d(pK}i%EchJNNN(-=vhvXShO5xHEE=?){XCaWFA(y^WVsR=s4e5KN=9HzD05b-c zzH1&Zab)JDE9i%3<~imSXTk)KQdUJ#Y8n?XopKq#fr6Q-sj;a7P#6X*6~HWoJh+gh z1-g)-IWT{K078|80WjU72^j%P0jN+(QDSCJY7w|R2+piZ1v)}MC_f*V>0lY4xFoTt S1nfdX0~1RwRaIAiH!c9O^NY{` literal 0 HcmV?d00001 diff --git a/tests/fixtures/vector/multi-page-academic/README.md b/tests/fixtures/vector/multi-page-academic/README.md new file mode 100644 index 0000000..fdd9895 --- /dev/null +++ b/tests/fixtures/vector/multi-page-academic/README.md @@ -0,0 +1,27 @@ +# Multi-Page Academic Paper - CER Test Fixture + +## Purpose +This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus. + +## Files +- `source.pdf` - Clean vector PDF with embedded text +- `ground_truth.txt` - Exact text content for CER comparison +- `README.md` - This file + +## Content +Abstract +This paper presents a comprehensive study of distributed systems. +Page 1 of 3 + +Introduction +Distributed systems form the backbone of modern cloud computing. +We explore consistency models and ... + +## Expected CER +Target: < 0.5% character error rate when extracted by pdftract. + +## Metadata +- Title: Multi-Page Academic Paper +- Author: Dr. Smith +- Creator: LaTeX +- Generated by: generate_vector_cer_corpus.py diff --git a/tests/fixtures/vector/multi-page-academic/ground_truth.txt b/tests/fixtures/vector/multi-page-academic/ground_truth.txt new file mode 100644 index 0000000..4b6d916 --- /dev/null +++ b/tests/fixtures/vector/multi-page-academic/ground_truth.txt @@ -0,0 +1,13 @@ +Abstract +This paper presents a comprehensive study of distributed systems. +Page 1 of 3 + +Introduction +Distributed systems form the backbone of modern cloud computing. +We explore consistency models and their practical implications. +Page 2 of 3 + +Conclusion +Our findings suggest new approaches to system design. +Future work will address scalability challenges. +Page 3 of 3 \ No newline at end of file diff --git a/tests/fixtures/vector/multi-page-academic/source.pdf b/tests/fixtures/vector/multi-page-academic/source.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f8656d856ca510cb26a7839645b737f39355fabf GIT binary patch literal 1541 zcmY!laBIOiwLVFajzF0?K8U zPWcrI(IAt|42=}b zEsenD<&^-H6)Tv7Ea1`)N-fSWElLJ@2;@_@e4q(nhM|HfSOL)W2oHeVMg^-a$*>w6 zAXKp15EQYb`WqBhFuUC{b4pT+6!hJ45=&BDQj_yjQn~bfQuESFG87CA4MCw*TvC*p zn9Ef$N3`3J?~s82%kS!=`xp#30-IS(kJcXO)IZ|ksq2Eyg7i zUUP15Q`=eoMXFj{);KU}V^>hGnYG|W{ou_P)1IAmh`-#I>Tv4qsk^VH9hjRZ%>SuR z-ucIK?l0Q@e;HL~s|Sn|xo&_aDC}*SBxW^w`=GE8efU`tI6d`335wcOG9^GVjN_4Hp91}UvtZ*m)^_e($sinKVvX2CQ&{#HbsdT-ZPGT zhZF?d9@;Md;UBQ}*pBNr*6QtGI5eg2(FKDE%eGHHT7CL`+Sx=Y!+8nm#g>n5Z9aHs z^N%Z2HqS`co)O999g@*Kaf?$%bM*B|bK^Y%jZ3b3*|P-o8O>OEHH;%#FFt3EPz*=w z7Vdkh-=7^RZI6%fQi+v0bK3QgWaYcf@}D=GBm&tGQt!x#uPHa6f(sWGQ$)y#}u-_6tct+ zGBiLEDk)0L%tj&lMmnc9A0dQ7LEGhwqilLE-IhU%ctG^o;0AmC0 A)Bpeg literal 0 HcmV?d00001 diff --git a/tests/fixtures/vector/scientific-report/README.md b/tests/fixtures/vector/scientific-report/README.md new file mode 100644 index 0000000..53013ad --- /dev/null +++ b/tests/fixtures/vector/scientific-report/README.md @@ -0,0 +1,25 @@ +# Climate Research Report - CER Test Fixture + +## Purpose +This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus. + +## Files +- `source.pdf` - Clean vector PDF with embedded text +- `ground_truth.txt` - Exact text content for CER comparison +- `README.md` - This file + +## Content +Executive Summary +This report analyzes climate data collected from 50 monitoring stations. +Key findings indicate a 1.2 degree Celsius increase over the past decade. +Data Collection +Temperature reading... + +## Expected CER +Target: < 0.5% character error rate when extracted by pdftract. + +## Metadata +- Title: Climate Research Report +- Author: Research Team +- Creator: LaTeX +- Generated by: generate_vector_cer_corpus.py diff --git a/tests/fixtures/vector/scientific-report/ground_truth.txt b/tests/fixtures/vector/scientific-report/ground_truth.txt new file mode 100644 index 0000000..e6d5b39 --- /dev/null +++ b/tests/fixtures/vector/scientific-report/ground_truth.txt @@ -0,0 +1,15 @@ +Executive Summary +This report analyzes climate data collected from 50 monitoring stations. +Key findings indicate a 1.2 degree Celsius increase over the past decade. +Data Collection +Temperature readings were recorded hourly from January to December 2023. +The monitoring stations are located across diverse geographic regions. +Analysis +Linear regression was applied to identify temperature trends. +Confidence intervals were calculated at the 95% level. +Findings +The data shows consistent warming across all monitoring stations. +Urban areas show higher temperature increases compared to rural locations. +Recommendations +We recommend continued monitoring and expanded data collection efforts. +Immediate action should be taken to reduce carbon emissions. \ No newline at end of file diff --git a/tests/fixtures/vector/scientific-report/source.pdf b/tests/fixtures/vector/scientific-report/source.pdf new file mode 100644 index 0000000..590c9c7 --- /dev/null +++ b/tests/fixtures/vector/scientific-report/source.pdf @@ -0,0 +1,63 @@ +%PDF-1.4 +1 0 obj +<< +/Type /Catalog +/Pages 2 0 R +/Title (Climate Research Report) +/Author (Research Team) +/Creator (LaTeX) +>> +endobj +2 0 obj +<< +/Type /Pages +/Kids [3 0 R] +/Count 1 +>> +endobj +3 0 obj +<< +/Type /Page +/Parent 2 0 R +/MediaBox [0 0 612 792] +/Contents 4 0 R +/Resources << +/Font << +/F1 5 0 R +>> +>> +endobj +4 0 obj +<< +/Filter /FlateDecode +/Length 444 +>> +stream +xmRK0W̅kGJr S{ v˯g=DV<Ѽnd~3dkŌ gZ2X Â`d 9$P"$g B8:VvBkZ!De!X1=e`롳*)$X*p&qlIU |Xd)c0WzK;B:@+)gp™,.llN?2g#P:3*ںzV$gGZn,X@ + ՖEorʖϧwD~0O׌77[AyIgLq.J*$4W/ە#^U7Xx^Z#7jwªg7cR}ϬC}&BPc}l[MZX Ck94Tܯx{I}:wpM j[jrlWo]qs# +endstream +endobj +5 0 obj +<< +/Type /Font +/Subtype /Type1 +/BaseFont /Helvetica +/Encoding /WinAnsiEncoding +>> +endobj +xref +0 6 +0000000000 65535 f +0000000009 00000 n +0000000098 00000 n +0000000173 00000 n +0000000803 00000 n +0000000820 00000 n +trailer +<< +/Size 6 +/Root 1 0 R +>> +startxref +1048 +%%EOF diff --git a/tests/fixtures/vector/technical-documentation/README.md b/tests/fixtures/vector/technical-documentation/README.md new file mode 100644 index 0000000..254f6ad --- /dev/null +++ b/tests/fixtures/vector/technical-documentation/README.md @@ -0,0 +1,25 @@ +# API Documentation - CER Test Fixture + +## Purpose +This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus. + +## Files +- `source.pdf` - Clean vector PDF with embedded text +- `ground_truth.txt` - Exact text content for CER comparison +- `README.md` - This file + +## Content +Getting Started +To use the API, first obtain an authentication token from the dashboard. +Include this token in the Authorization header of all requests. +Authentication +All API requests require authent... + +## Expected CER +Target: < 0.5% character error rate when extracted by pdftract. + +## Metadata +- Title: API Documentation +- Author: API Team +- Creator: Word +- Generated by: generate_vector_cer_corpus.py diff --git a/tests/fixtures/vector/technical-documentation/ground_truth.txt b/tests/fixtures/vector/technical-documentation/ground_truth.txt new file mode 100644 index 0000000..4606d38 --- /dev/null +++ b/tests/fixtures/vector/technical-documentation/ground_truth.txt @@ -0,0 +1,15 @@ +Getting Started +To use the API, first obtain an authentication token from the dashboard. +Include this token in the Authorization header of all requests. +Authentication +All API requests require authentication using a Bearer token. +Tokens expire after 24 hours and must be refreshed. +Endpoints +GET /api/users - Retrieve a list of users +POST /api/users - Create a new user +GET /api/users/:id - Retrieve a specific user +PUT /api/users/:id - Update a user +DELETE /api/users/:id - Delete a user +Rate Limits +The API has a rate limit of 1000 requests per hour per user. +Exceeding this limit will result in a 429 Too Many Requests response. \ No newline at end of file diff --git a/tests/fixtures/vector/technical-documentation/source.pdf b/tests/fixtures/vector/technical-documentation/source.pdf new file mode 100644 index 0000000..dd4bf09 --- /dev/null +++ b/tests/fixtures/vector/technical-documentation/source.pdf @@ -0,0 +1,63 @@ +%PDF-1.4 +1 0 obj +<< +/Type /Catalog +/Pages 2 0 R +/Title (API Documentation) +/Author (API Team) +/Creator (Word) +>> +endobj +2 0 obj +<< +/Type /Pages +/Kids [3 0 R] +/Count 1 +>> +endobj +3 0 obj +<< +/Type /Page +/Parent 2 0 R +/MediaBox [0 0 612 792] +/Contents 4 0 R +/Resources << +/Font << +/F1 5 0 R +>> +>> +endobj +4 0 obj +<< +/Filter /FlateDecode +/Length 368 +>> +stream +xmN0)c$$ (띻:v>&[uܩ! G +q-R9@6AZH G5d`wAOǒ T@@ohrvF*30eH"ccB2: 5*li{>LL.~ @/1)*tqƩmK1ԶsSiwrL9@atk/慄jiͰXOIͅ +hL=so޽޶&ˬXa':N,iOHۅZqzpQi > +endobj +xref +0 6 +0000000000 65535 f +0000000009 00000 n +0000000098 00000 n +0000000173 00000 n +0000000715 00000 n +0000000732 00000 n +trailer +<< +/Size 6 +/Root 1 0 R +>> +startxref +960 +%%EOF diff --git a/tests/fixtures/vector/user-manual/README.md b/tests/fixtures/vector/user-manual/README.md new file mode 100644 index 0000000..b518ca9 --- /dev/null +++ b/tests/fixtures/vector/user-manual/README.md @@ -0,0 +1,25 @@ +# Product User Manual - CER Test Fixture + +## Purpose +This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus. + +## Files +- `source.pdf` - Clean vector PDF with embedded text +- `ground_truth.txt` - Exact text content for CER comparison +- `README.md` - This file + +## Content +Quick Start Guide +Thank you for purchasing our product. This guide will help you get started. +Unboxing +Carefully remove the product from the packaging. +Check that all items listed on the included card... + +## Expected CER +Target: < 0.5% character error rate when extracted by pdftract. + +## Metadata +- Title: Product User Manual +- Author: Product Team +- Creator: Word +- Generated by: generate_vector_cer_corpus.py diff --git a/tests/fixtures/vector/user-manual/ground_truth.txt b/tests/fixtures/vector/user-manual/ground_truth.txt new file mode 100644 index 0000000..9532555 --- /dev/null +++ b/tests/fixtures/vector/user-manual/ground_truth.txt @@ -0,0 +1,19 @@ +Quick Start Guide +Thank you for purchasing our product. This guide will help you get started. +Unboxing +Carefully remove the product from the packaging. +Check that all items listed on the included card are present. +Setup +1. Connect the power adapter to a wall outlet. +2. Press and hold the power button for 3 seconds. +3. Follow the on-screen instructions to complete setup. +Features +- Wireless connectivity +- Touch screen interface +- Long battery life +- Compact design +Troubleshooting +If the device does not turn on, ensure the battery is charged. +For connection issues, restart your router and try again. +Support +For additional help, visit support.example.com or call 1-800-SUPPORT. \ No newline at end of file diff --git a/tests/fixtures/vector/user-manual/source.pdf b/tests/fixtures/vector/user-manual/source.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f1d527f9a58ea5e386e86b7ed0470f257173c289 GIT binary patch literal 1103 zcmY!laBIOiwLVFajzF0?K8U zIthJuNy0Z3PI3DEPoTorSC zgKqcTRuHItuKkQzsC@J4Rke9rR%(T2-Kw~n&Xhd6%|MLji!q(wS-=-x z^*M3-!n4L-n}7arng8MIy+3OX&V8}KOuKgN!Obt#XZw{*_G(U=^?Km^%!J3R?XPcH>YMC=9-NRtQhpRJXqt{L_%Kjy5bX@J#JHNxKFL+s+lX{cDfN>isQf=W)`UdYa2s&vg4GW5)`}#kxK0f(301Cr^r2?3xkOvpC zv_Ka!GzVrF5J0H1Fu)Wt0%jAas*<9_%$(FBaB&cvS(OTOgnm$dJ}?);azt@SVo?d$ Qg@y(u7F?>TuKsRZ06Wx|tpET3 literal 0 HcmV?d00001