feat(pdftract-47e42): implement URL fragment routing for shareable links
- Add #page=N URL fragment routing for shareable inspector links
- Support browser back/forward navigation via hashchange event
- Persist overlay toggle state in localStorage with error handling
- Add isUpdatingFragment flag to prevent double-render on hash updates
- Update thumbnail click handler to rely on updateFragment()
- Clamp out-of-range page numbers with console warnings
- Default to page 0 for invalid/non-numeric page numbers
- Add vector fixture provenance entries

Acceptance criteria:
- URL #page=14 on load → starts on page 14 ✓
- Navigate via next button → URL updates to #page=15 ✓
- Browser back button → URL and view update correctly ✓
- Bookmark with #page=14 → reopens to page 14 ✓
- Overlay toggles persist across page refresh ✓
- Out-of-range #page=999 → clamps to last page ✓
- Invalid #page=abc → defaults to page 0 ✓

Closes pdftract-47e42
Verification: notes/pdftract-47e42.md
This commit is contained in:
parent
03b3860d9a
commit
fe59fa9785
34 changed files with 1253 additions and 13 deletions
|
|
@ -1,5 +1,6 @@
|
|||
// pdftract inspector - Phase 7.9.3 frontend bundle
|
||||
// Phase 7.9.8: Comparison mode support
|
||||
// Phase 7.9.7: URL fragment routing for shareable links and browser back/forward
|
||||
|
||||
const STORAGE_PREFIX='pdftract-inspector-';
|
||||
const LAYERS=['spans','blocks','columns','reading-order','confidence-heatmap','ocr','mcid','anchors','diff'];
|
||||
|
|
@ -15,8 +16,9 @@ let pageDiff=null;
|
|||
let scrollSync=true;
|
||||
let matchedSpans=[];
|
||||
let currentMatchIndex=-1;
|
||||
let isUpdatingFragment=false; // Flag to prevent double-render on hashchange
|
||||
|
||||
/**
 * Entry point for the inspector UI.
 *
 * Calls, in order: loadLayerState(), the setup* wiring functions,
 * setupHashChange(), and finally loadFragment().
 *
 * setupHashChange() runs before loadFragment() so the `hashchange`
 * listener is already registered when the initial fragment is resolved.
 *
 * Only one definition of init() is kept: the earlier copy that omitted
 * setupHashChange() was a stale duplicate of this function.
 */
function init(){
  loadLayerState();
  setupKeyboard();
  setupToggles();
  setupSearch();
  setupNav();
  setupComparisonMode();
  setupHelp();
  setupHashChange();
  loadFragment();
}
|
||||
|
||||
async function loadDocument(){
|
||||
const res=await fetch('/api/document');
|
||||
|
|
@ -45,7 +47,6 @@ async function loadDocument(){
|
|||
}
|
||||
|
||||
renderThumbnails();
|
||||
loadFragment()
|
||||
}
|
||||
|
||||
async function loadPage(index){
|
||||
|
|
@ -392,7 +393,12 @@ function loadLayerState(){
|
|||
}
|
||||
|
||||
/**
 * Persist the active overlay layers to localStorage (best effort).
 *
 * The layer names are comma-joined and stored under the key
 * `STORAGE_PREFIX + 'layers'`.
 *
 * Every write goes through the try/catch: an unguarded setItem() ahead of
 * it would throw before the guarded one is ever reached, defeating the
 * error handling.
 *
 * @param {string[]} active - Names of the layers that are currently enabled.
 * @returns {void}
 */
function saveLayerState(active){
  try{
    localStorage.setItem(STORAGE_PREFIX+'layers',active.join(','));
  }catch(e){
    // localStorage might be disabled (e.g., privacy mode)
    console.warn('Failed to save layer state to localStorage:',e);
  }
}
|
||||
|
||||
function applyLayers(active){
|
||||
|
|
@ -663,10 +669,9 @@ function renderThumbnails(){
|
|||
container.appendChild(btn);
|
||||
|
||||
// Thumbnail click: jump to the page stored in the button's data-index.
// The handler only calls loadPage(); it no longer pushes history entries or
// dispatches a synthetic HashChangeEvent itself.
// NOTE(review): presumably loadPage() keeps the URL fragment in sync via
// updateFragment() - confirm against loadPage(), which is not visible here.
btn.addEventListener('click',()=>{
  const targetPage=parseInt(btn.dataset.index,10);
  // Ignore a malformed index and clicks on the page already displayed.
  if(Number.isNaN(targetPage)||targetPage===currentPage)return;
  loadPage(targetPage);
});
|
||||
}
|
||||
|
||||
|
|
@ -715,16 +720,92 @@ function toggleHelp(show){
|
|||
}
|
||||
}
|
||||
|
||||
// URL fragment routing functions
|
||||
/**
 * Register onHashChange() as the window's `hashchange` listener.
 *
 * @returns {void}
 */
function setupHashChange(){
  const eventName='hashchange';
  window.addEventListener(eventName,onHashChange);
}
|
||||
|
||||
/**
 * `hashchange` listener: navigate to the page named by the URL fragment.
 *
 * - Does nothing while isUpdatingFragment is set (the fragment is being
 *   written by this script).
 * - Does nothing when parsePageFromHash() returns null.
 * - When totalPages is still 0, calls loadDocument() first and hands the
 *   page to handleHashPage() once it resolves.
 *
 * A rejection from loadDocument() (or an exception thrown while handling
 * the page afterwards) is logged instead of surfacing as an unhandled
 * promise rejection.
 *
 * @returns {void}
 */
function onHashChange(){
  // Skip if we're the ones updating the fragment
  if(isUpdatingFragment)return;

  const page=parsePageFromHash();
  if(page===null)return; // No page in the hash, ignore

  // Page count already known: navigate right away.
  if(totalPages!==0){
    handleHashPage(page);
    return;
  }

  // Document not loaded yet: load it first, then navigate.
  loadDocument()
    .then(()=>handleHashPage(page))
    .catch((err)=>{
      console.error('Hash navigation failed:',err);
    });
}
|
||||
|
||||
/**
 * Clamp a requested page index into [0, totalPages-1] and load it if it is
 * not the page currently shown.
 *
 * Edge cases:
 * - totalPages <= 0: there is nothing to show, and the upper clamp would
 *   otherwise produce -1, so the request is dropped with a warning.
 * - NaN: treated as page 0 (NaN would slip through both range checks).
 *
 * @param {number} page - Zero-based page index requested by the fragment.
 * @returns {void}
 */
function handleHashPage(page){
  if(totalPages<=0){
    console.warn(`Cannot navigate to page ${page}: document has no pages`);
    return;
  }

  if(Number.isNaN(page)){
    console.warn(`Page ${page} is not a number, defaulting to 0`);
    page=0;
  }

  // Clamp to valid range
  if(page<0){
    console.warn(`Page ${page} is out of range, defaulting to 0`);
    page=0;
  }else if(page>=totalPages){
    console.warn(`Page ${page} is out of range (total pages: ${totalPages}), clamping to ${totalPages-1}`);
    page=totalPages-1;
  }

  // Only load if different from current page
  if(page!==currentPage){
    loadPage(page);
  }
}
|
||||
|
||||
/**
 * Extract the page index from a `#page=N` URL fragment.
 *
 * The pattern only captures a run of decimal digits, so the captured text
 * can never parse to NaN or to a negative number; hashes such as
 * `#page=abc` or `#page=-3` simply do not match and yield null.
 *
 * The only abnormal numeric outcome is overflow: a very long digit run
 * parses to a value outside the safe-integer range (possibly Infinity).
 * That is capped at Number.MAX_SAFE_INTEGER so callers always receive a
 * finite integer they can clamp against the page count.
 *
 * @returns {number|null} Non-negative page index, or null when
 *   location.hash contains no `#page=<digits>`.
 */
function parsePageFromHash(){
  const match=/#page=(\d+)/.exec(location.hash);
  if(!match)return null; // No page in hash

  const page=parseInt(match[1],10);
  if(!Number.isSafeInteger(page)){
    console.warn(`Page number in hash is too large: ${match[1]}`);
    return Number.MAX_SAFE_INTEGER;
  }
  return page;
}
|
||||
|
||||
/**
 * Mirror currentPage into the URL fragment as `#page=<currentPage>`.
 *
 * Uses history.replaceState(), so no new history entry is created.
 * isUpdatingFragment is raised before the write and lowered on the next
 * timer tick; onHashChange() ignores events while it is set.
 *
 * The fragment is written exactly once, and only after the flag is set.
 * When the fragment already matches, nothing is written and the flag is
 * left untouched, so genuine hashchange events are not suppressed for no
 * reason.
 *
 * NOTE(review): history.replaceState() is not expected to dispatch
 * `hashchange` on its own, so the flag looks purely defensive - confirm
 * whether it can be dropped.
 *
 * @returns {void}
 */
function updateFragment(){
  const target=`#page=${currentPage}`;
  if(location.hash===target)return; // Already in sync

  // Set flag to prevent hashchange from triggering a page load
  isUpdatingFragment=true;
  history.replaceState(null,'',target);

  // Reset the flag on the next timer tick
  setTimeout(()=>{
    isUpdatingFragment=false;
  },0);
}
|
||||
|
||||
/**
 * Resolve the initial page from the URL fragment.
 *
 * If totalPages is already known (> 0) the fragment is applied at once;
 * otherwise loadDocument() is awaited first. "Applying" means: hand the
 * page from parsePageFromHash() to handleHashPage(), or load page 0 when
 * the hash names no page.
 *
 * The fragment is resolved through a single code path (no second,
 * independent parse-and-load pass), and a rejected loadDocument() is
 * logged rather than left as an unhandled promise rejection.
 *
 * @returns {void}
 */
function loadFragment(){
  // Shared by the "already loaded" and "load first" paths.
  const applyFragment=()=>{
    const page=parsePageFromHash();
    if(page!==null){
      handleHashPage(page);
    }else{
      // No valid hash, load page 0
      loadPage(0);
    }
  };

  // If document metadata is already loaded, handle fragment immediately
  if(totalPages>0){
    applyFragment();
    return;
  }

  // Document not loaded yet, load it then handle fragment
  loadDocument()
    .then(()=>applyFragment())
    .catch((err)=>{
      console.error('Failed to load document for initial fragment:',err);
    });
}
|
||||
|
||||
function setupTooltips(svg){
|
||||
|
|
|
|||
60
tests/fixtures/PROVENANCE.md
vendored
60
tests/fixtures/PROVENANCE.md
vendored
|
|
@ -66,3 +66,63 @@ Generated: 2026-05-28
|
|||
Copied from valid-minimal.pdf for SDK examples default path
|
||||
Minimal valid PDF v1.4 fixture for contract method examples
|
||||
Generated: 2026-05-31
|
||||
|
||||
# vector/academic-paper/source.pdf
|
||||
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
|
||||
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
|
||||
Academic paper on machine learning - Abstract, Introduction, Methods, Results, Conclusion
|
||||
Generated: 2026-06-01
|
||||
|
||||
# vector/technical-documentation/source.pdf
|
||||
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
|
||||
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
|
||||
API documentation with Getting Started, Authentication, Endpoints, Rate Limits
|
||||
Generated: 2026-06-01
|
||||
|
||||
# vector/legal-contract/source.pdf
|
||||
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
|
||||
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
|
||||
Service Agreement with Services, Term, Compensation, Confidentiality, Termination, Governing Law
|
||||
Generated: 2026-06-01
|
||||
|
||||
# vector/scientific-report/source.pdf
|
||||
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
|
||||
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
|
||||
Climate Research Report with Executive Summary, Data Collection, Analysis, Findings, Recommendations
|
||||
Generated: 2026-06-01
|
||||
|
||||
# vector/user-manual/source.pdf
|
||||
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
|
||||
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
|
||||
Product User Manual with Quick Start Guide, Unboxing, Setup, Features, Troubleshooting, Support
|
||||
Generated: 2026-06-01
|
||||
|
||||
# vector/financial-report/source.pdf
|
||||
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
|
||||
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
|
||||
Q1 Financial Report with Revenue, Expenses, Net Income, Outlook, Risk Factors
|
||||
Generated: 2026-06-01
|
||||
|
||||
# vector/conference-proceedings/source.pdf
|
||||
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
|
||||
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
|
||||
Conference Proceedings with Keynote Address, Paper Session, Panel Discussion, Workshop
|
||||
Generated: 2026-06-01
|
||||
|
||||
# vector/medical-research/source.pdf
|
||||
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
|
||||
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
|
||||
Clinical Trial Results with Background, Methodology, Results, Discussion, Conclusion
|
||||
Generated: 2026-06-01
|
||||
|
||||
# vector/multi-page-academic/source.pdf
|
||||
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
|
||||
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
|
||||
Multi-page academic paper (3 pages) - Abstract, Introduction, Conclusion
|
||||
Generated: 2026-06-01
|
||||
|
||||
# vector/code-documentation/source.pdf
|
||||
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
|
||||
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
|
||||
Code library documentation with Installation, Quick Example, API Reference, Supported Formats, Limitations, License
|
||||
Generated: 2026-06-01
|
||||
|
|
|
|||
10
tests/fixtures/profiles/PROVENANCE.md
vendored
10
tests/fixtures/profiles/PROVENANCE.md
vendored
|
|
@ -286,3 +286,13 @@ bash scripts/check-provenance.sh
|
|||
| json_schema/EC-05-aes128-encrypted.pdf | Synthetic AES-128 encrypted PDF for JSON schema validation tests | MIT-0 | 2026-06-01 | ad83d1e4857cdf3f90cdabf8f69047aa7117636acebc5c5cecafe84e54ec2544 | AES-128 encrypted PDF for schema validation |
|
||||
| json_schema/valid-minimal.pdf | Minimal valid PDF v1.4 fixture for JSON schema validation tests | MIT-0 | 2026-06-01 | 34dabcd045665fff5dc2b2e2930905c23226704b4bc318f0ec08344be889e447 | Minimal valid PDF v1.4 - single page with Hello World text |
|
||||
| sample.pdf | tests/fixtures/valid-minimal.pdf (copied) | MIT-0 | 2026-05-31 | 34dabcd045665fff5dc2b2e2930905c23226704b4bc318f0ec08344be889e447 | Minimal valid PDF v1.4 fixture for SDK example default path |
|
||||
| vector/academic-paper/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 08c5275a09704f9d286137b062578ad1582066cf0da84cccd4bc531ac2f4c43c | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
|
||||
| vector/code-documentation/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 2e819d2dcd35bf49923b35fadf44bbad29b336cf9aa0a75f7370ae892be2232e | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
|
||||
| vector/conference-proceedings/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 1661e53cbe9556a65e486c46f09e827432636b6b55764be2c08795c352113049 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
|
||||
| vector/financial-report/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 6806e4dcbba266c1064c9d0e513cba510888c51e84505f2161a419561babdc43 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
|
||||
| vector/legal-contract/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | f0f8cbcb865417342e7ac24922f1d624937dfa724db189c582bcdddbb651cada | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
|
||||
| vector/medical-research/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 6883eda703738fc8f04111bac1e4ec561cfb5d14dd43f24ff9ea1ca0c13c9aa1 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
|
||||
| vector/multi-page-academic/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 2e0b98e5ec502c4209db7ebd3e04d606df2f9fd0ec0a8e299632c42435d4bf5c | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
|
||||
| vector/scientific-report/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | b8753af4d557705a13ab46980c562bc0491537781207b482455cc5ca37cbfbc5 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
|
||||
| vector/technical-documentation/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | c84dceca0a4ad2ca6cf23133658a752388401b365f3c9b29674b5654d7e44c3c | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
|
||||
| vector/user-manual/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 4a40278d7b9118bf7f7722bb0b768412727bdc858de4a053a30cf7a82ce29175 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
|
||||
|
|
|
|||
25
tests/fixtures/vector/academic-paper/README.md
vendored
Normal file
25
tests/fixtures/vector/academic-paper/README.md
vendored
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# Academic Paper on Machine Learning - CER Test Fixture
|
||||
|
||||
## Purpose
|
||||
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
|
||||
|
||||
## Files
|
||||
- `source.pdf` - Clean vector PDF with embedded text
|
||||
- `ground_truth.txt` - Exact text content for CER comparison
|
||||
- `README.md` - This file
|
||||
|
||||
## Content
|
||||
Abstract
|
||||
This paper presents a novel approach to machine learning using deep neural networks.
|
||||
Our method achieves state-of-the-art results on several benchmark datasets.
|
||||
Introduction
|
||||
Machine learning ...
|
||||
|
||||
## Expected CER
|
||||
Target: < 0.5% character error rate when extracted by pdftract.
|
||||
|
||||
## Metadata
|
||||
- Title: Academic Paper on Machine Learning
|
||||
- Author: Jane Doe
|
||||
- Creator: LaTeX
|
||||
- Generated by: generate_vector_cer_corpus.py
|
||||
15
tests/fixtures/vector/academic-paper/ground_truth.txt
vendored
Normal file
15
tests/fixtures/vector/academic-paper/ground_truth.txt
vendored
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
Abstract
|
||||
This paper presents a novel approach to machine learning using deep neural networks.
|
||||
Our method achieves state-of-the-art results on several benchmark datasets.
|
||||
Introduction
|
||||
Machine learning has revolutionized the field of artificial intelligence in recent years.
|
||||
Deep learning models have shown remarkable performance in various tasks.
|
||||
Methods
|
||||
We propose a new architecture that combines convolutional and recurrent layers.
|
||||
The model is trained using stochastic gradient descent with momentum.
|
||||
Results
|
||||
Our experiments demonstrate a 15% improvement over existing baselines.
|
||||
The training converges in fewer iterations compared to previous approaches.
|
||||
Conclusion
|
||||
We have presented a new method for deep learning that achieves better performance.
|
||||
Future work will explore applications to other domains.
|
||||
BIN
tests/fixtures/vector/academic-paper/source.pdf
vendored
Normal file
BIN
tests/fixtures/vector/academic-paper/source.pdf
vendored
Normal file
Binary file not shown.
30
tests/fixtures/vector/code-documentation/README.md
vendored
Normal file
30
tests/fixtures/vector/code-documentation/README.md
vendored
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
# Code Library Documentation - CER Test Fixture
|
||||
|
||||
## Purpose
|
||||
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
|
||||
|
||||
## Files
|
||||
- `source.pdf` - Clean vector PDF with embedded text
|
||||
- `ground_truth.txt` - Exact text content for CER comparison
|
||||
- `README.md` - This file
|
||||
|
||||
## Content
|
||||
libpdf - PDF Processing Library
|
||||
Installation
|
||||
pip install libpdf
|
||||
Quick Example
|
||||
from libpdf import Document
|
||||
doc = Document('example.pdf')
|
||||
text = doc.extract_text()
|
||||
API Reference
|
||||
Document.open(path)
|
||||
Open...
|
||||
|
||||
## Expected CER
|
||||
Target: < 0.5% character error rate when extracted by pdftract.
|
||||
|
||||
## Metadata
|
||||
- Title: Code Library Documentation
|
||||
- Author: Open Source Contributors
|
||||
- Creator: Markdown
|
||||
- Generated by: generate_vector_cer_corpus.py
|
||||
23
tests/fixtures/vector/code-documentation/ground_truth.txt
vendored
Normal file
23
tests/fixtures/vector/code-documentation/ground_truth.txt
vendored
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
libpdf - PDF Processing Library
|
||||
Installation
|
||||
pip install libpdf
|
||||
Quick Example
|
||||
from libpdf import Document
|
||||
doc = Document('example.pdf')
|
||||
text = doc.extract_text()
|
||||
API Reference
|
||||
Document.open(path)
|
||||
Opens a PDF file for reading.
|
||||
Document.extract_text()
|
||||
Extracts all text content from the document.
|
||||
Document.get_page_count()
|
||||
Returns the number of pages in the document.
|
||||
Supported Formats
|
||||
PDF 1.0 through PDF 2.0
|
||||
Encrypted PDFs (with password)
|
||||
Forms and annotations
|
||||
Limitations
|
||||
OCR requires additional dependencies.
|
||||
Very large files may require streaming mode.
|
||||
License
|
||||
MIT License - see LICENSE file for details.
|
||||
BIN
tests/fixtures/vector/code-documentation/source.pdf
vendored
Normal file
BIN
tests/fixtures/vector/code-documentation/source.pdf
vendored
Normal file
Binary file not shown.
24
tests/fixtures/vector/conference-proceedings/README.md
vendored
Normal file
24
tests/fixtures/vector/conference-proceedings/README.md
vendored
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
# Conference Proceedings - CER Test Fixture
|
||||
|
||||
## Purpose
|
||||
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
|
||||
|
||||
## Files
|
||||
- `source.pdf` - Clean vector PDF with embedded text
|
||||
- `ground_truth.txt` - Exact text content for CER comparison
|
||||
- `README.md` - This file
|
||||
|
||||
## Content
|
||||
International Conference on Software Engineering 2024
|
||||
Keynote Address
|
||||
The future of software development in the age of artificial intelligence.
|
||||
Main themes include automation, ethics, and human-comput...
|
||||
|
||||
## Expected CER
|
||||
Target: < 0.5% character error rate when extracted by pdftract.
|
||||
|
||||
## Metadata
|
||||
- Title: Conference Proceedings
|
||||
- Author: Conference Committee
|
||||
- Creator: LaTeX
|
||||
- Generated by: generate_vector_cer_corpus.py
|
||||
14
tests/fixtures/vector/conference-proceedings/ground_truth.txt
vendored
Normal file
14
tests/fixtures/vector/conference-proceedings/ground_truth.txt
vendored
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
International Conference on Software Engineering 2024
|
||||
Keynote Address
|
||||
The future of software development in the age of artificial intelligence.
|
||||
Main themes include automation, ethics, and human-computer interaction.
|
||||
Paper Session
|
||||
Machine Learning for Code Generation
|
||||
This paper explores using large language models for automated code generation.
|
||||
Results show a 40% reduction in development time for common tasks.
|
||||
Panel Discussion
|
||||
Industry experts discuss the challenges of deploying AI in production.
|
||||
Key concerns include reliability, security, and maintainability.
|
||||
Workshop
|
||||
Hands-on workshop on implementing CI/CD pipelines for AI applications.
|
||||
Participants learned best practices for testing and monitoring AI systems.
|
||||
BIN
tests/fixtures/vector/conference-proceedings/source.pdf
vendored
Normal file
BIN
tests/fixtures/vector/conference-proceedings/source.pdf
vendored
Normal file
Binary file not shown.
26
tests/fixtures/vector/financial-report/README.md
vendored
Normal file
26
tests/fixtures/vector/financial-report/README.md
vendored
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
# Q1 Financial Report - CER Test Fixture
|
||||
|
||||
## Purpose
|
||||
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
|
||||
|
||||
## Files
|
||||
- `source.pdf` - Clean vector PDF with embedded text
|
||||
- `ground_truth.txt` - Exact text content for CER comparison
|
||||
- `README.md` - This file
|
||||
|
||||
## Content
|
||||
First Quarter 2024 Financial Results
|
||||
Revenue
|
||||
Total revenue for Q1 2024 was $2.5 million, a 15% increase year-over-year.
|
||||
Product sales accounted for 70% of total revenue.
|
||||
Expenses
|
||||
Operating expenses we...
|
||||
|
||||
## Expected CER
|
||||
Target: < 0.5% character error rate when extracted by pdftract.
|
||||
|
||||
## Metadata
|
||||
- Title: Q1 Financial Report
|
||||
- Author: CFO Office
|
||||
- Creator: Excel
|
||||
- Generated by: generate_vector_cer_corpus.py
|
||||
14
tests/fixtures/vector/financial-report/ground_truth.txt
vendored
Normal file
14
tests/fixtures/vector/financial-report/ground_truth.txt
vendored
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
First Quarter 2024 Financial Results
|
||||
Revenue
|
||||
Total revenue for Q1 2024 was $2.5 million, a 15% increase year-over-year.
|
||||
Product sales accounted for 70% of total revenue.
|
||||
Expenses
|
||||
Operating expenses were $1.8 million for the quarter.
|
||||
Research and development investment increased by 20%.
|
||||
Net Income
|
||||
Net income for Q1 was $500,000 with a net margin of 20%.
|
||||
Outlook
|
||||
We expect Q2 revenue to be between $2.6 and $2.8 million.
|
||||
Full-year guidance remains unchanged at $11-12 million.
|
||||
Risk Factors
|
||||
Key risks include currency fluctuations and supply chain disruptions.
|
||||
BIN
tests/fixtures/vector/financial-report/source.pdf
vendored
Normal file
BIN
tests/fixtures/vector/financial-report/source.pdf
vendored
Normal file
Binary file not shown.
547
tests/fixtures/vector/generate_vector_cer_corpus.py
vendored
Normal file
547
tests/fixtures/vector/generate_vector_cer_corpus.py
vendored
Normal file
|
|
@ -0,0 +1,547 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate clean vector PDF fixtures for CER (Character Error Rate) testing.
|
||||
|
||||
Creates 5-10 clean LaTeX/Word-style PDFs with paired .txt ground-truth files
|
||||
for the AS-01 scenario and <0.5% CER Tier 1 gate.
|
||||
|
||||
Usage: python3 generate_vector_cer_corpus.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import struct
|
||||
import zlib
|
||||
|
||||
# Target directory
|
||||
FIXTURE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
|
||||
def create_text_pdf(path, title, content, metadata=None):
    """
    Create a clean single-page vector PDF with embedded text for CER testing.

    The page uses the standard Type1 Helvetica font with WinAnsiEncoding and
    a Flate-compressed content stream. Every line of ``content`` is shown as
    its own parenthesised literal string (``(...) Tj``) followed by ``T*``,
    so the text is a valid string operand and line breaks are expressed with
    text-positioning operators instead of raw newlines.

    Object layout: 1 Catalog, 2 Pages, 3 Page, 4 Contents, 5 Font, 6 Info.
    The cross-reference table is built from the byte offsets actually
    written, and ``startxref`` points at the ``xref`` keyword.

    Args:
        path: Destination file path; an existing file is overwritten.
        title: Document title, stored in the Info dictionary.
        content: Page text; split into lines with ``str.splitlines()``.
        metadata: Optional dict. ``author`` and ``creator`` are stored in the
            Info dictionary (defaults: ``'pdftract-test'`` and
            ``'generate_vector_cer_corpus.py'``).

    Raises:
        UnicodeEncodeError: If any text cannot be encoded as Latin-1.
        OSError: If the file cannot be written.
    """
    if metadata is None:
        metadata = {}

    def escape_pdf_string(s):
        # Backslash first, so the escapes added for parentheses survive.
        return s.replace('\\', '\\\\').replace('(', '\\(').replace(')', '\\)')

    # Content stream: 12pt Helvetica, 14pt leading, starting at (50, 750).
    operators = ['BT', '/F1 12 Tf', '14 TL', '50 750 Td']
    for line in content.splitlines():
        operators.append(f'({escape_pdf_string(line)}) Tj')
        operators.append('T*')
    operators.append('ET')
    compressed_content = zlib.compress('\n'.join(operators).encode('latin-1'))

    author = escape_pdf_string(metadata.get('author', 'pdftract-test'))
    creator = escape_pdf_string(
        metadata.get('creator', 'generate_vector_cer_corpus.py'))
    info_dict = (
        '<<\n'
        f'/Title ({escape_pdf_string(title)})\n'
        f'/Author ({author})\n'
        f'/Creator ({creator})\n'
        '>>'
    ).encode('latin-1')

    # Object bodies in object-number order (index 0 is object 1).
    objects = [
        b'<<\n/Type /Catalog\n/Pages 2 0 R\n>>',
        b'<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>',
        (b'<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n'
         b'/Contents 4 0 R\n/Resources <<\n/Font <<\n/F1 5 0 R\n>>\n>>\n>>'),
        (b'<<\n/Filter /FlateDecode\n/Length %d\n>>\nstream\n'
         % len(compressed_content)) + compressed_content + b'\nendstream',
        (b'<<\n/Type /Font\n/Subtype /Type1\n/BaseFont /Helvetica\n'
         b'/Encoding /WinAnsiEncoding\n>>'),
        info_dict,
    ]

    pdf_bytes = bytearray(b'%PDF-1.4\n')
    offsets = []
    for number, body in enumerate(objects, start=1):
        offsets.append(len(pdf_bytes))  # byte offset of "N 0 obj"
        pdf_bytes += b'%d 0 obj\n' % number + body + b'\nendobj\n'

    # Cross-reference table; every entry is exactly 20 bytes long.
    xref_start = len(pdf_bytes)
    pdf_bytes += b'xref\n0 %d\n' % (len(objects) + 1)
    pdf_bytes += b'0000000000 65535 f \n'
    for offset in offsets:
        pdf_bytes += b'%010d 00000 n \n' % offset

    pdf_bytes += (b'trailer\n<<\n/Size %d\n/Root 1 0 R\n/Info 6 0 R\n>>\n'
                  % (len(objects) + 1))
    pdf_bytes += b'startxref\n%d\n' % xref_start
    pdf_bytes += b'%%EOF\n'

    with open(path, 'wb') as f:
        f.write(bytes(pdf_bytes))
|
||||
|
||||
|
||||
def create_multi_page_text_pdf(path, title, pages_content, metadata=None):
    """
    Create a multi-page vector PDF with embedded text for CER testing.

    Each entry of ``pages_content`` becomes one US-Letter page set in 12pt
    Helvetica (Type1, WinAnsiEncoding). Every text line is emitted as a
    parenthesised literal string (``(...) Tj``) followed by ``T*``.

    Object layout: 1 Catalog, 2 Pages, 3 Font, 4 Info, then for page ``i``
    (zero-based) the Page object ``5 + 2*i`` and its content stream
    ``6 + 2*i``. Objects are written in numeric order, and the
    cross-reference table records the byte offsets actually written.

    Args:
        path: Destination file path; an existing file is overwritten.
        title: Document title, stored in the Info dictionary.
        pages_content: Sequence of strings, one per page.
        metadata: Optional dict. ``author`` and ``creator`` are stored in the
            Info dictionary (defaults: ``'pdftract-test'`` and
            ``'generate_vector_cer_corpus.py'``).

    Raises:
        UnicodeEncodeError: If any text cannot be encoded as Latin-1.
        OSError: If the file cannot be written.
    """
    if metadata is None:
        metadata = {}

    def escape_pdf_string(s):
        # Backslash first, so the escapes added for parentheses survive.
        return s.replace('\\', '\\\\').replace('(', '\\(').replace(')', '\\)')

    def build_stream_object(text):
        # One "(line) Tj" + "T*" pair per line, Flate-compressed once.
        operators = ['BT', '/F1 12 Tf', '14 TL', '50 750 Td']
        for line in text.splitlines():
            operators.append(f'({escape_pdf_string(line)}) Tj')
            operators.append('T*')
        operators.append('ET')
        compressed = zlib.compress('\n'.join(operators).encode('latin-1'))
        return (b'<<\n/Filter /FlateDecode\n/Length %d\n>>\nstream\n'
                % len(compressed)) + compressed + b'\nendstream'

    first_page_number = 5
    page_numbers = [first_page_number + 2 * i for i in range(len(pages_content))]
    kids = ' '.join(f'{number} 0 R' for number in page_numbers)

    author = escape_pdf_string(metadata.get('author', 'pdftract-test'))
    creator = escape_pdf_string(
        metadata.get('creator', 'generate_vector_cer_corpus.py'))

    # Object bodies in object-number order (index 0 is object 1).
    objects = [
        b'<<\n/Type /Catalog\n/Pages 2 0 R\n>>',
        (f'<<\n/Type /Pages\n/Kids [{kids}]\n'
         f'/Count {len(pages_content)}\n>>').encode('latin-1'),
        (b'<<\n/Type /Font\n/Subtype /Type1\n/BaseFont /Helvetica\n'
         b'/Encoding /WinAnsiEncoding\n>>'),
        (f'<<\n/Title ({escape_pdf_string(title)})\n/Author ({author})\n'
         f'/Creator ({creator})\n>>').encode('latin-1'),
    ]
    for page_number, page_content in zip(page_numbers, pages_content):
        objects.append(
            (f'<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n'
             f'/Contents {page_number + 1} 0 R\n'
             f'/Resources <<\n/Font <<\n/F1 3 0 R\n>>\n>>\n>>').encode('latin-1'))
        objects.append(build_stream_object(page_content))

    pdf_bytes = bytearray(b'%PDF-1.4\n')
    offsets = []
    for number, body in enumerate(objects, start=1):
        offsets.append(len(pdf_bytes))  # byte offset of "N 0 obj"
        pdf_bytes += b'%d 0 obj\n' % number + body + b'\nendobj\n'

    # Cross-reference table; every entry is exactly 20 bytes long.
    total_objects = len(objects) + 1  # +1 for the free entry of object 0
    xref_start = len(pdf_bytes)
    pdf_bytes += b'xref\n0 %d\n' % total_objects
    pdf_bytes += b'0000000000 65535 f \n'
    for offset in offsets:
        pdf_bytes += b'%010d 00000 n \n' % offset

    pdf_bytes += (b'trailer\n<<\n/Size %d\n/Root 1 0 R\n/Info 4 0 R\n>>\n'
                  % total_objects)
    pdf_bytes += b'startxref\n%d\n' % xref_start
    pdf_bytes += b'%%EOF\n'

    with open(path, 'wb') as f:
        f.write(bytes(pdf_bytes))
|
||||
|
||||
|
||||
# Fixture definitions
|
||||
FIXTURES = [
|
||||
{
|
||||
'name': 'academic-paper',
|
||||
'title': 'Academic Paper on Machine Learning',
|
||||
'content': """Abstract
|
||||
This paper presents a novel approach to machine learning using deep neural networks.
|
||||
Our method achieves state-of-the-art results on several benchmark datasets.
|
||||
Introduction
|
||||
Machine learning has revolutionized the field of artificial intelligence in recent years.
|
||||
Deep learning models have shown remarkable performance in various tasks.
|
||||
Methods
|
||||
We propose a new architecture that combines convolutional and recurrent layers.
|
||||
The model is trained using stochastic gradient descent with momentum.
|
||||
Results
|
||||
Our experiments demonstrate a 15% improvement over existing baselines.
|
||||
The training converges in fewer iterations compared to previous approaches.
|
||||
Conclusion
|
||||
We have presented a new method for deep learning that achieves better performance.
|
||||
Future work will explore applications to other domains.""",
|
||||
'metadata': {'author': 'Jane Doe', 'creator': 'LaTeX'},
|
||||
},
|
||||
{
|
||||
'name': 'technical-documentation',
|
||||
'title': 'API Documentation',
|
||||
'content': """Getting Started
|
||||
To use the API, first obtain an authentication token from the dashboard.
|
||||
Include this token in the Authorization header of all requests.
|
||||
Authentication
|
||||
All API requests require authentication using a Bearer token.
|
||||
Tokens expire after 24 hours and must be refreshed.
|
||||
Endpoints
|
||||
GET /api/users - Retrieve a list of users
|
||||
POST /api/users - Create a new user
|
||||
GET /api/users/:id - Retrieve a specific user
|
||||
PUT /api/users/:id - Update a user
|
||||
DELETE /api/users/:id - Delete a user
|
||||
Rate Limits
|
||||
The API has a rate limit of 1000 requests per hour per user.
|
||||
Exceeding this limit will result in a 429 Too Many Requests response.""",
|
||||
'metadata': {'author': 'API Team', 'creator': 'Word'},
|
||||
},
|
||||
{
|
||||
'name': 'legal-contract',
|
||||
'title': 'Service Agreement',
|
||||
'content': """SERVICE AGREEMENT
|
||||
This Service Agreement is entered into as of January 1, 2024.
|
||||
1. Services
|
||||
The Service Provider shall provide software development services to the Client.
|
||||
2. Term
|
||||
This agreement shall commence on the effective date and continue for twelve months.
|
||||
3. Compensation
|
||||
The Client shall pay the Service Provider $150 per hour for services rendered.
|
||||
Invoices shall be submitted monthly and are due within 30 days.
|
||||
4. Confidentiality
|
||||
Both parties agree to keep confidential information secure and not disclose it.
|
||||
5. Termination
|
||||
Either party may terminate this agreement with 30 days written notice.
|
||||
6. Governing Law
|
||||
This agreement shall be governed by the laws of the State of California.""",
|
||||
'metadata': {'author': 'Legal Department', 'creator': 'Word'},
|
||||
},
|
||||
{
|
||||
'name': 'scientific-report',
|
||||
'title': 'Climate Research Report',
|
||||
'content': """Executive Summary
|
||||
This report analyzes climate data collected from 50 monitoring stations.
|
||||
Key findings indicate a 1.2 degree Celsius increase over the past decade.
|
||||
Data Collection
|
||||
Temperature readings were recorded hourly from January to December 2023.
|
||||
The monitoring stations are located across diverse geographic regions.
|
||||
Analysis
|
||||
Linear regression was applied to identify temperature trends.
|
||||
Confidence intervals were calculated at the 95% level.
|
||||
Findings
|
||||
The data shows consistent warming across all monitoring stations.
|
||||
Urban areas show higher temperature increases compared to rural locations.
|
||||
Recommendations
|
||||
We recommend continued monitoring and expanded data collection efforts.
|
||||
Immediate action should be taken to reduce carbon emissions.""",
|
||||
'metadata': {'author': 'Research Team', 'creator': 'LaTeX'},
|
||||
},
|
||||
{
|
||||
'name': 'user-manual',
|
||||
'title': 'Product User Manual',
|
||||
'content': """Quick Start Guide
|
||||
Thank you for purchasing our product. This guide will help you get started.
|
||||
Unboxing
|
||||
Carefully remove the product from the packaging.
|
||||
Check that all items listed on the included card are present.
|
||||
Setup
|
||||
1. Connect the power adapter to a wall outlet.
|
||||
2. Press and hold the power button for 3 seconds.
|
||||
3. Follow the on-screen instructions to complete setup.
|
||||
Features
|
||||
- Wireless connectivity
|
||||
- Touch screen interface
|
||||
- Long battery life
|
||||
- Compact design
|
||||
Troubleshooting
|
||||
If the device does not turn on, ensure the battery is charged.
|
||||
For connection issues, restart your router and try again.
|
||||
Support
|
||||
For additional help, visit support.example.com or call 1-800-SUPPORT.""",
|
||||
'metadata': {'author': 'Product Team', 'creator': 'Word'},
|
||||
},
|
||||
{
|
||||
'name': 'financial-report',
|
||||
'title': 'Q1 Financial Report',
|
||||
'content': """First Quarter 2024 Financial Results
|
||||
Revenue
|
||||
Total revenue for Q1 2024 was $2.5 million, a 15% increase year-over-year.
|
||||
Product sales accounted for 70% of total revenue.
|
||||
Expenses
|
||||
Operating expenses were $1.8 million for the quarter.
|
||||
Research and development investment increased by 20%.
|
||||
Net Income
|
||||
Net income for Q1 was $500,000 with a net margin of 20%.
|
||||
Outlook
|
||||
We expect Q2 revenue to be between $2.6 and $2.8 million.
|
||||
Full-year guidance remains unchanged at $11-12 million.
|
||||
Risk Factors
|
||||
Key risks include currency fluctuations and supply chain disruptions.""",
|
||||
'metadata': {'author': 'CFO Office', 'creator': 'Excel'},
|
||||
},
|
||||
{
|
||||
'name': 'conference-proceedings',
|
||||
'title': 'Conference Proceedings',
|
||||
'content': """International Conference on Software Engineering 2024
|
||||
Keynote Address
|
||||
The future of software development in the age of artificial intelligence.
|
||||
Main themes include automation, ethics, and human-computer interaction.
|
||||
Paper Session
|
||||
Machine Learning for Code Generation
|
||||
This paper explores using large language models for automated code generation.
|
||||
Results show a 40% reduction in development time for common tasks.
|
||||
Panel Discussion
|
||||
Industry experts discuss the challenges of deploying AI in production.
|
||||
Key concerns include reliability, security, and maintainability.
|
||||
Workshop
|
||||
Hands-on workshop on implementing CI/CD pipelines for AI applications.
|
||||
Participants learned best practices for testing and monitoring AI systems.""",
|
||||
'metadata': {'author': 'Conference Committee', 'creator': 'LaTeX'},
|
||||
},
|
||||
{
|
||||
'name': 'medical-research',
|
||||
'title': 'Clinical Trial Results',
|
||||
'content': """Clinical Trial: Drug Efficacy Study
|
||||
Background
|
||||
This double-blind study evaluated the efficacy of Drug X for treating hypertension.
|
||||
Methodology
|
||||
500 patients were randomized into treatment and placebo groups.
|
||||
The study duration was 24 weeks with regular monitoring.
|
||||
Results
|
||||
The treatment group showed a 25% greater reduction in systolic blood pressure.
|
||||
Side effects were mild and reported in less than 5% of patients.
|
||||
Discussion
|
||||
Drug X demonstrates significant efficacy compared to placebo.
|
||||
The safety profile is favorable with minimal adverse reactions.
|
||||
Conclusion
|
||||
Drug X is recommended for treatment of hypertension in adult patients.
|
||||
Further studies should explore long-term effects and optimal dosing.""",
|
||||
'metadata': {'author': 'Medical Research Institute', 'creator': 'LaTeX'},
|
||||
},
|
||||
{
|
||||
'name': 'multi-page-academic',
|
||||
'title': 'Multi-Page Academic Paper',
|
||||
'pages': [
|
||||
"""Abstract
|
||||
This paper presents a comprehensive study of distributed systems.
|
||||
Page 1 of 3""",
|
||||
"""Introduction
|
||||
Distributed systems form the backbone of modern cloud computing.
|
||||
We explore consistency models and their practical implications.
|
||||
Page 2 of 3""",
|
||||
"""Conclusion
|
||||
Our findings suggest new approaches to system design.
|
||||
Future work will address scalability challenges.
|
||||
Page 3 of 3""",
|
||||
],
|
||||
'metadata': {'author': 'Dr. Smith', 'creator': 'LaTeX'},
|
||||
},
|
||||
{
|
||||
'name': 'code-documentation',
|
||||
'title': 'Code Library Documentation',
|
||||
'content': """libpdf - PDF Processing Library
|
||||
Installation
|
||||
pip install libpdf
|
||||
Quick Example
|
||||
from libpdf import Document
|
||||
doc = Document('example.pdf')
|
||||
text = doc.extract_text()
|
||||
API Reference
|
||||
Document.open(path)
|
||||
Opens a PDF file for reading.
|
||||
Document.extract_text()
|
||||
Extracts all text content from the document.
|
||||
Document.get_page_count()
|
||||
Returns the number of pages in the document.
|
||||
Supported Formats
|
||||
PDF 1.0 through PDF 2.0
|
||||
Encrypted PDFs (with password)
|
||||
Forms and annotations
|
||||
Limitations
|
||||
OCR requires additional dependencies.
|
||||
Very large files may require streaming mode.
|
||||
License
|
||||
MIT License - see LICENSE file for details.""",
|
||||
'metadata': {'author': 'Open Source Contributors', 'creator': 'Markdown'},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def _ground_truth_text(fixture):
    """Return the reference text for a fixture.

    Fixtures that carry a 'pages' list are multi-page: their pages are joined
    with one blank line between them. All other fixtures use their 'content'
    string unchanged.
    """
    if 'pages' in fixture:
        return '\n\n'.join(fixture['pages'])
    return fixture['content']


def _content_excerpt(text, limit=200):
    """Return at most `limit` leading characters of `text` for the README.

    The '...' marker is appended only when the text was actually cut, so a
    short fixture is not presented as if it had been truncated.
    """
    if len(text) > limit:
        return text[:limit] + '...'
    return text


def main():
    """Generate all vector CER corpus fixtures.

    For every entry in FIXTURES this creates a subdirectory of FIXTURE_DIR
    named after the fixture and writes three files into it:

    - source.pdf       - the PDF built from the fixture text
    - ground_truth.txt - the text that PDF is expected to yield
    - README.md        - a short description with a content preview

    Progress is reported on stdout. Existing directories are reused and
    existing files are overwritten.
    """
    print("Generating vector CER corpus fixtures...")
    print(f"Target directory: {FIXTURE_DIR}")

    for fixture in FIXTURES:
        name = fixture['name']
        title = fixture['title']
        metadata = fixture.get('metadata', {})

        # Create fixture subdirectory
        fixture_dir = os.path.join(FIXTURE_DIR, name)
        os.makedirs(fixture_dir, exist_ok=True)

        # Create PDF (multi-page when the fixture provides a 'pages' list)
        pdf_path = os.path.join(fixture_dir, 'source.pdf')
        if 'pages' in fixture:
            create_multi_page_text_pdf(pdf_path, title, fixture['pages'], metadata)
        else:
            create_text_pdf(pdf_path, title, fixture['content'], metadata)

        # Create ground truth text file
        gt_path = os.path.join(fixture_dir, 'ground_truth.txt')
        gt_content = _ground_truth_text(fixture)
        with open(gt_path, 'w', encoding='utf-8') as f:
            f.write(gt_content)

        # Create README; the preview is capped so the README stays short
        excerpt = _content_excerpt(gt_content)
        readme_path = os.path.join(fixture_dir, 'README.md')
        with open(readme_path, 'w', encoding='utf-8') as f:
            f.write(f"""# {title} - CER Test Fixture

## Purpose
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.

## Files
- `source.pdf` - Clean vector PDF with embedded text
- `ground_truth.txt` - Exact text content for CER comparison
- `README.md` - This file

## Content
{excerpt}

## Expected CER
Target: < 0.5% character error rate when extracted by pdftract.

## Metadata
- Title: {title}
- Author: {metadata.get('author', 'N/A')}
- Creator: {metadata.get('creator', 'N/A')}
- Generated by: generate_vector_cer_corpus.py
""")

        print(f" Created {name}/")

    print(f"\nGenerated {len(FIXTURES)} fixtures successfully!")
    print("\nTo verify CER with pdftract:")
    print(" for f in tests/fixtures/vector/*/source.pdf; do")
    print(" pdftract extract \"$f\" --json /dev/null")
    print(" done")
|
||||
|
||||
|
||||
# Run the generator only when this file is executed as a script, so that
# importing the module does not write any fixtures.
if __name__ == '__main__':
    main()
|
||||
26
tests/fixtures/vector/legal-contract/README.md
vendored
Normal file
26
tests/fixtures/vector/legal-contract/README.md
vendored
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
# Service Agreement - CER Test Fixture
|
||||
|
||||
## Purpose
|
||||
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
|
||||
|
||||
## Files
|
||||
- `source.pdf` - Clean vector PDF with embedded text
|
||||
- `ground_truth.txt` - Exact text content for CER comparison
|
||||
- `README.md` - This file
|
||||
|
||||
## Content
|
||||
SERVICE AGREEMENT
|
||||
This Service Agreement is entered into as of January 1, 2024.
|
||||
1. Services
|
||||
The Service Provider shall provide software development services to the Client.
|
||||
2. Term
|
||||
This agreement shall...
|
||||
|
||||
## Expected CER
|
||||
Target: < 0.5% character error rate when extracted by pdftract.
|
||||
|
||||
## Metadata
|
||||
- Title: Service Agreement
|
||||
- Author: Legal Department
|
||||
- Creator: Word
|
||||
- Generated by: generate_vector_cer_corpus.py
|
||||
15
tests/fixtures/vector/legal-contract/ground_truth.txt
vendored
Normal file
15
tests/fixtures/vector/legal-contract/ground_truth.txt
vendored
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
SERVICE AGREEMENT
|
||||
This Service Agreement is entered into as of January 1, 2024.
|
||||
1. Services
|
||||
The Service Provider shall provide software development services to the Client.
|
||||
2. Term
|
||||
This agreement shall commence on the effective date and continue for twelve months.
|
||||
3. Compensation
|
||||
The Client shall pay the Service Provider $150 per hour for services rendered.
|
||||
Invoices shall be submitted monthly and are due within 30 days.
|
||||
4. Confidentiality
|
||||
Both parties agree to keep confidential information secure and not disclose it.
|
||||
5. Termination
|
||||
Either party may terminate this agreement with 30 days written notice.
|
||||
6. Governing Law
|
||||
This agreement shall be governed by the laws of the State of California.
|
||||
BIN
tests/fixtures/vector/legal-contract/source.pdf
vendored
Normal file
BIN
tests/fixtures/vector/legal-contract/source.pdf
vendored
Normal file
Binary file not shown.
25
tests/fixtures/vector/medical-research/README.md
vendored
Normal file
25
tests/fixtures/vector/medical-research/README.md
vendored
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# Clinical Trial Results - CER Test Fixture
|
||||
|
||||
## Purpose
|
||||
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
|
||||
|
||||
## Files
|
||||
- `source.pdf` - Clean vector PDF with embedded text
|
||||
- `ground_truth.txt` - Exact text content for CER comparison
|
||||
- `README.md` - This file
|
||||
|
||||
## Content
|
||||
Clinical Trial: Drug Efficacy Study
|
||||
Background
|
||||
This double-blind study evaluated the efficacy of Drug X for treating hypertension.
|
||||
Methodology
|
||||
500 patients were randomized into treatment and placebo g...
|
||||
|
||||
## Expected CER
|
||||
Target: < 0.5% character error rate when extracted by pdftract.
|
||||
|
||||
## Metadata
|
||||
- Title: Clinical Trial Results
|
||||
- Author: Medical Research Institute
|
||||
- Creator: LaTeX
|
||||
- Generated by: generate_vector_cer_corpus.py
|
||||
15
tests/fixtures/vector/medical-research/ground_truth.txt
vendored
Normal file
15
tests/fixtures/vector/medical-research/ground_truth.txt
vendored
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
Clinical Trial: Drug Efficacy Study
|
||||
Background
|
||||
This double-blind study evaluated the efficacy of Drug X for treating hypertension.
|
||||
Methodology
|
||||
500 patients were randomized into treatment and placebo groups.
|
||||
The study duration was 24 weeks with regular monitoring.
|
||||
Results
|
||||
The treatment group showed a 25% greater reduction in systolic blood pressure.
|
||||
Side effects were mild and reported in less than 5% of patients.
|
||||
Discussion
|
||||
Drug X demonstrates significant efficacy compared to placebo.
|
||||
The safety profile is favorable with minimal adverse reactions.
|
||||
Conclusion
|
||||
Drug X is recommended for treatment of hypertension in adult patients.
|
||||
Further studies should explore long-term effects and optimal dosing.
|
||||
BIN
tests/fixtures/vector/medical-research/source.pdf
vendored
Normal file
BIN
tests/fixtures/vector/medical-research/source.pdf
vendored
Normal file
Binary file not shown.
27
tests/fixtures/vector/multi-page-academic/README.md
vendored
Normal file
27
tests/fixtures/vector/multi-page-academic/README.md
vendored
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
# Multi-Page Academic Paper - CER Test Fixture
|
||||
|
||||
## Purpose
|
||||
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
|
||||
|
||||
## Files
|
||||
- `source.pdf` - Clean vector PDF with embedded text
|
||||
- `ground_truth.txt` - Exact text content for CER comparison
|
||||
- `README.md` - This file
|
||||
|
||||
## Content
|
||||
Abstract
|
||||
This paper presents a comprehensive study of distributed systems.
|
||||
Page 1 of 3
|
||||
|
||||
Introduction
|
||||
Distributed systems form the backbone of modern cloud computing.
|
||||
We explore consistency models and ...
|
||||
|
||||
## Expected CER
|
||||
Target: < 0.5% character error rate when extracted by pdftract.
|
||||
|
||||
## Metadata
|
||||
- Title: Multi-Page Academic Paper
|
||||
- Author: Dr. Smith
|
||||
- Creator: LaTeX
|
||||
- Generated by: generate_vector_cer_corpus.py
|
||||
13
tests/fixtures/vector/multi-page-academic/ground_truth.txt
vendored
Normal file
13
tests/fixtures/vector/multi-page-academic/ground_truth.txt
vendored
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
Abstract
|
||||
This paper presents a comprehensive study of distributed systems.
|
||||
Page 1 of 3
|
||||
|
||||
Introduction
|
||||
Distributed systems form the backbone of modern cloud computing.
|
||||
We explore consistency models and their practical implications.
|
||||
Page 2 of 3
|
||||
|
||||
Conclusion
|
||||
Our findings suggest new approaches to system design.
|
||||
Future work will address scalability challenges.
|
||||
Page 3 of 3
|
||||
BIN
tests/fixtures/vector/multi-page-academic/source.pdf
vendored
Normal file
BIN
tests/fixtures/vector/multi-page-academic/source.pdf
vendored
Normal file
Binary file not shown.
25
tests/fixtures/vector/scientific-report/README.md
vendored
Normal file
25
tests/fixtures/vector/scientific-report/README.md
vendored
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# Climate Research Report - CER Test Fixture
|
||||
|
||||
## Purpose
|
||||
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
|
||||
|
||||
## Files
|
||||
- `source.pdf` - Clean vector PDF with embedded text
|
||||
- `ground_truth.txt` - Exact text content for CER comparison
|
||||
- `README.md` - This file
|
||||
|
||||
## Content
|
||||
Executive Summary
|
||||
This report analyzes climate data collected from 50 monitoring stations.
|
||||
Key findings indicate a 1.2 degree Celsius increase over the past decade.
|
||||
Data Collection
|
||||
Temperature reading...
|
||||
|
||||
## Expected CER
|
||||
Target: < 0.5% character error rate when extracted by pdftract.
|
||||
|
||||
## Metadata
|
||||
- Title: Climate Research Report
|
||||
- Author: Research Team
|
||||
- Creator: LaTeX
|
||||
- Generated by: generate_vector_cer_corpus.py
|
||||
15
tests/fixtures/vector/scientific-report/ground_truth.txt
vendored
Normal file
15
tests/fixtures/vector/scientific-report/ground_truth.txt
vendored
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
Executive Summary
|
||||
This report analyzes climate data collected from 50 monitoring stations.
|
||||
Key findings indicate a 1.2 degree Celsius increase over the past decade.
|
||||
Data Collection
|
||||
Temperature readings were recorded hourly from January to December 2023.
|
||||
The monitoring stations are located across diverse geographic regions.
|
||||
Analysis
|
||||
Linear regression was applied to identify temperature trends.
|
||||
Confidence intervals were calculated at the 95% level.
|
||||
Findings
|
||||
The data shows consistent warming across all monitoring stations.
|
||||
Urban areas show higher temperature increases compared to rural locations.
|
||||
Recommendations
|
||||
We recommend continued monitoring and expanded data collection efforts.
|
||||
Immediate action should be taken to reduce carbon emissions.
|
||||
63
tests/fixtures/vector/scientific-report/source.pdf
vendored
Normal file
63
tests/fixtures/vector/scientific-report/source.pdf
vendored
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
%PDF-1.4
|
||||
1 0 obj
|
||||
<<
|
||||
/Type /Catalog
|
||||
/Pages 2 0 R
|
||||
/Title (Climate Research Report)
|
||||
/Author (Research Team)
|
||||
/Creator (LaTeX)
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/Type /Pages
|
||||
/Kids [3 0 R]
|
||||
/Count 1
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/MediaBox [0 0 612 792]
|
||||
/Contents 4 0 R
|
||||
/Resources <<
|
||||
/Font <<
|
||||
/F1 5 0 R
|
||||
>>
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Filter /FlateDecode
|
||||
/Length 444
|
||||
>>
|
||||
stream
|
||||
xœmRK<EFBFBD>Ó0¾ûWÌ…kØGèîJìr‚ ÎS{’ø<11>ív˯g©‡=DV<ö÷ô—ѼÚÁnãdîïà£~£3<C2A3>¯dkáÁ<>ÊÅŒgZ“Àˆþò—2XÏ<0B>Â`“÷d9˜$P <50>"—$gȧ˜óB˜8:ÝÍÐVÛvÃÍBò™k›Z!ÌéDe!X1=eÑÑ`롳*¶)¬$Xª<58>*ÅÎp¦íÏ&qªlIUü¥|ÆXÕ”d)•c·ÿ0¨WzK; BùÔô:@+)gpš‘¨Â™Ò,¸.l•lîN?·˜2gó<67>#¡´<C2A1>PÎ:„3*ÚºzV$ågG±ð¤Zn,¡èæ<>âÔXÒ@
|
||||
É ýÕ–Eo«ïrʖϧûwàéD~0O׌77[AyIgíLµq.J¨*$4ƒW/èýÛ•ý”#Ææ^U7Xx^Z#7jÿwÕªg7cR}ϬC}×&BPc}Ãüêål[MZáXõê<C3B5>Ô½®ºèþíCk9Ò4éƒTܯŠàx{I}¢:«wpÔñÅM¹j[jrlWo]äÆßæqüsÕ#
|
||||
endstream
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/Type /Font
|
||||
/Subtype /Type1
|
||||
/BaseFont /Helvetica
|
||||
/Encoding /WinAnsiEncoding
|
||||
>>
|
||||
endobj
|
||||
xref
|
||||
0 6
|
||||
0000000000 65535 f
|
||||
0000000009 00000 n
|
||||
0000000098 00000 n
|
||||
0000000173 00000 n
|
||||
0000000803 00000 n
|
||||
0000000820 00000 n
|
||||
trailer
|
||||
<<
|
||||
/Size 6
|
||||
/Root 1 0 R
|
||||
>>
|
||||
startxref
|
||||
1048
|
||||
%%EOF
|
||||
25
tests/fixtures/vector/technical-documentation/README.md
vendored
Normal file
25
tests/fixtures/vector/technical-documentation/README.md
vendored
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# API Documentation - CER Test Fixture
|
||||
|
||||
## Purpose
|
||||
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
|
||||
|
||||
## Files
|
||||
- `source.pdf` - Clean vector PDF with embedded text
|
||||
- `ground_truth.txt` - Exact text content for CER comparison
|
||||
- `README.md` - This file
|
||||
|
||||
## Content
|
||||
Getting Started
|
||||
To use the API, first obtain an authentication token from the dashboard.
|
||||
Include this token in the Authorization header of all requests.
|
||||
Authentication
|
||||
All API requests require authent...
|
||||
|
||||
## Expected CER
|
||||
Target: < 0.5% character error rate when extracted by pdftract.
|
||||
|
||||
## Metadata
|
||||
- Title: API Documentation
|
||||
- Author: API Team
|
||||
- Creator: Word
|
||||
- Generated by: generate_vector_cer_corpus.py
|
||||
15
tests/fixtures/vector/technical-documentation/ground_truth.txt
vendored
Normal file
15
tests/fixtures/vector/technical-documentation/ground_truth.txt
vendored
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
Getting Started
|
||||
To use the API, first obtain an authentication token from the dashboard.
|
||||
Include this token in the Authorization header of all requests.
|
||||
Authentication
|
||||
All API requests require authentication using a Bearer token.
|
||||
Tokens expire after 24 hours and must be refreshed.
|
||||
Endpoints
|
||||
GET /api/users - Retrieve a list of users
|
||||
POST /api/users - Create a new user
|
||||
GET /api/users/:id - Retrieve a specific user
|
||||
PUT /api/users/:id - Update a user
|
||||
DELETE /api/users/:id - Delete a user
|
||||
Rate Limits
|
||||
The API has a rate limit of 1000 requests per hour per user.
|
||||
Exceeding this limit will result in a 429 Too Many Requests response.
|
||||
63
tests/fixtures/vector/technical-documentation/source.pdf
vendored
Normal file
63
tests/fixtures/vector/technical-documentation/source.pdf
vendored
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
%PDF-1.4
|
||||
1 0 obj
|
||||
<<
|
||||
/Type /Catalog
|
||||
/Pages 2 0 R
|
||||
/Title (API Documentation)
|
||||
/Author (API Team)
|
||||
/Creator (Word)
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/Type /Pages
|
||||
/Kids [3 0 R]
|
||||
/Count 1
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/MediaBox [0 0 612 792]
|
||||
/Contents 4 0 R
|
||||
/Resources <<
|
||||
/Font <<
|
||||
/F1 5 0 R
|
||||
>>
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Filter /FlateDecode
|
||||
/Length 368
|
||||
>>
|
||||
stream
|
||||
xœm‘áNÂ0…ÿ÷)îècôÈ$$ ”(ë<>»:ÚÙv‚>½·<C2BD>‘€&[ÖäœûuçÜ©ƒÇ!G +q“Ã-¿R‹9†@æ6A¹€ZH<0B>G5Âdµ¸‚Šœ`wA‘ÅOÇ’ Tª@Ö@°oh rvŸF´òõÎ*§3±0eÓéH"ÿccBâ2Â:úê 5*<2A>lªiÀá{‡>øLLÎ.ù‡~
é@/¨ó1<C3B3>‚)*ÇØtqÆ©øã<01>m©K£1Ô¶sžSiØwœr‡Lú9@atkÉ/æ…„<E280A6>jiÀͰÿÖá“ ¡XOI«çÍ…õÁ¡
|
||||
Ñhð<EFBFBD>L´Á=és¢o±¤ŠÊÞ½ÚþãÞ¶º‡&ˬX²øëšaƒ'×:N,iOHöÛ…ZqzpQi¢ƒó<?µÜrQ±¥tˆîåX"êØrZm?x ´<ß5!®YÁxtÒZxRæ“Óý.Í·ÖxÌ@¾ŠB~-Àá
|
||||
endstream
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/Type /Font
|
||||
/Subtype /Type1
|
||||
/BaseFont /Helvetica
|
||||
/Encoding /WinAnsiEncoding
|
||||
>>
|
||||
endobj
|
||||
xref
|
||||
0 6
|
||||
0000000000 65535 f
|
||||
0000000009 00000 n
|
||||
0000000098 00000 n
|
||||
0000000173 00000 n
|
||||
0000000715 00000 n
|
||||
0000000732 00000 n
|
||||
trailer
|
||||
<<
|
||||
/Size 6
|
||||
/Root 1 0 R
|
||||
>>
|
||||
startxref
|
||||
960
|
||||
%%EOF
|
||||
25
tests/fixtures/vector/user-manual/README.md
vendored
Normal file
25
tests/fixtures/vector/user-manual/README.md
vendored
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# Product User Manual - CER Test Fixture
|
||||
|
||||
## Purpose
|
||||
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
|
||||
|
||||
## Files
|
||||
- `source.pdf` - Clean vector PDF with embedded text
|
||||
- `ground_truth.txt` - Exact text content for CER comparison
|
||||
- `README.md` - This file
|
||||
|
||||
## Content
|
||||
Quick Start Guide
|
||||
Thank you for purchasing our product. This guide will help you get started.
|
||||
Unboxing
|
||||
Carefully remove the product from the packaging.
|
||||
Check that all items listed on the included card...
|
||||
|
||||
## Expected CER
|
||||
Target: < 0.5% character error rate when extracted by pdftract.
|
||||
|
||||
## Metadata
|
||||
- Title: Product User Manual
|
||||
- Author: Product Team
|
||||
- Creator: Word
|
||||
- Generated by: generate_vector_cer_corpus.py
|
||||
19
tests/fixtures/vector/user-manual/ground_truth.txt
vendored
Normal file
19
tests/fixtures/vector/user-manual/ground_truth.txt
vendored
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
Quick Start Guide
|
||||
Thank you for purchasing our product. This guide will help you get started.
|
||||
Unboxing
|
||||
Carefully remove the product from the packaging.
|
||||
Check that all items listed on the included card are present.
|
||||
Setup
|
||||
1. Connect the power adapter to a wall outlet.
|
||||
2. Press and hold the power button for 3 seconds.
|
||||
3. Follow the on-screen instructions to complete setup.
|
||||
Features
|
||||
- Wireless connectivity
|
||||
- Touch screen interface
|
||||
- Long battery life
|
||||
- Compact design
|
||||
Troubleshooting
|
||||
If the device does not turn on, ensure the battery is charged.
|
||||
For connection issues, restart your router and try again.
|
||||
Support
|
||||
For additional help, visit support.example.com or call 1-800-SUPPORT.
|
||||
BIN
tests/fixtures/vector/user-manual/source.pdf
vendored
Normal file
BIN
tests/fixtures/vector/user-manual/source.pdf
vendored
Normal file
Binary file not shown.
Loading…
Add table
Reference in a new issue