feat(pdftract-47e42): implement URL fragment routing for shareable links

- Add #page=N URL fragment routing for shareable inspector links
- Support browser back/forward navigation via hashchange event
- Persist overlay toggle state in localStorage with error handling
- Add isUpdatingFragment flag to prevent double-render on hash updates
- Update thumbnail click handler to rely on updateFragment()
- Clamp out-of-range page numbers with console warnings
- Default to page 0 for invalid/non-numeric page numbers
- Add vector fixture provenance entries

Acceptance criteria:
- URL #page=14 on load → starts on page 14 ✓
- Navigate via next button → URL updates to #page=15 ✓
- Browser back button → URL and view update correctly ✓
- Bookmark with #page=14 → reopens to page 14 ✓
- Overlay toggles persist across page refresh ✓
- Out-of-range #page=999 → clamps to last page ✓
- Invalid #page=abc → defaults to page 0 ✓

Closes pdftract-47e42

Verification: notes/pdftract-47e42.md
This commit is contained in:
jedarden 2026-06-01 08:21:05 -04:00
parent 03b3860d9a
commit fe59fa9785
34 changed files with 1253 additions and 13 deletions

View file

@ -1,5 +1,6 @@
// pdftract inspector - Phase 7.9.3 frontend bundle
// Phase 7.9.8: Comparison mode support
// Phase 7.9.7: URL fragment routing for shareable links and browser back/forward
const STORAGE_PREFIX='pdftract-inspector-';
const LAYERS=['spans','blocks','columns','reading-order','confidence-heatmap','ocr','mcid','anchors','diff'];
@ -15,8 +16,9 @@ let pageDiff=null;
let scrollSync=true;
let matchedSpans=[];
let currentMatchIndex=-1;
let isUpdatingFragment=false; // Flag to prevent double-render on hashchange
function init(){loadLayerState();setupKeyboard();setupToggles();setupSearch();setupNav();setupComparisonMode();setupHelp();loadFragment()}
function init(){loadLayerState();setupKeyboard();setupToggles();setupSearch();setupNav();setupComparisonMode();setupHelp();setupHashChange();loadFragment()}
async function loadDocument(){
const res=await fetch('/api/document');
@ -45,7 +47,6 @@ async function loadDocument(){
}
renderThumbnails();
loadFragment()
}
async function loadPage(index){
@ -392,7 +393,12 @@ function loadLayerState(){
}
function saveLayerState(active){
localStorage.setItem(STORAGE_PREFIX+'layers',active.join(','))
try{
localStorage.setItem(STORAGE_PREFIX+'layers',active.join(','))
}catch(e){
// localStorage might be disabled (e.g., privacy mode)
console.warn('Failed to save layer state to localStorage:',e)
}
}
function applyLayers(active){
@ -663,10 +669,9 @@ function renderThumbnails(){
container.appendChild(btn);
btn.addEventListener('click',()=>{
if(parseInt(btn.dataset.index)===currentPage)return;
loadPage(parseInt(btn.dataset.index));
history.pushState(null,'',`#page=${btn.dataset.index}`);
window.dispatchEvent(new HashChangeEvent('hashchange'));
const targetPage=parseInt(btn.dataset.index);
if(targetPage===currentPage)return;
loadPage(targetPage);
});
}
@ -715,16 +720,92 @@ function toggleHelp(show){
}
}
// URL fragment routing functions
/**
 * Registers onHashChange as the window's `hashchange` listener so that
 * fragment changes (e.g. browser back/forward) are routed to the inspector.
 */
function setupHashChange() {
  const eventName = 'hashchange';
  window.addEventListener(eventName, onHashChange);
}
/**
 * Handles a `hashchange` event by navigating to the page named in the URL
 * fragment.
 *
 * Does nothing while `isUpdatingFragment` is set (the fragment is being
 * written by this script) or when parsePageFromHash() returns null. When no
 * pages are known yet (`totalPages === 0`) the document is loaded first and
 * the page is applied once loading finishes.
 */
function onHashChange() {
  // Skip if we're the ones updating the fragment
  if (isUpdatingFragment) return;

  const page = parsePageFromHash();
  if (page === null) return; // No page in the hash, ignore

  if (totalPages !== 0) {
    handleHashPage(page);
    return;
  }

  // Document not loaded yet: load it, then apply the requested page.
  // The chain ends in catch() so a failed load is reported instead of
  // surfacing as an unhandled promise rejection.
  loadDocument()
    .then(() => {
      handleHashPage(page);
    })
    .catch((err) => {
      console.error('Failed to load document for hash navigation:', err);
    });
}
/**
 * Navigates to the requested page after clamping it into the valid range.
 *
 * Values below 0 become 0 and values past the last page become the last
 * page; both cases log a console warning. loadPage() is only called when the
 * resulting page differs from `currentPage`.
 *
 * @param {number} page - Zero-based page index requested via the URL fragment.
 */
function handleHashPage(page) {
  // When no pages are known, totalPages - 1 would be -1; never clamp below 0
  // so loadPage() is not asked for a negative index.
  const lastPage = Math.max(totalPages - 1, 0);
  let target = page;

  // Clamp to valid range
  if (target < 0) {
    console.warn(`Page ${target} is out of range, defaulting to 0`);
    target = 0;
  } else if (target > lastPage) {
    console.warn(`Page ${target} is out of range (total pages: ${totalPages}), clamping to ${lastPage}`);
    target = lastPage;
  }

  // Only load if different from current page
  if (target !== currentPage) {
    loadPage(target);
  }
}
/**
 * Reads the page number from a `#page=N` URL fragment.
 *
 * @returns {number|null} The parsed page number; 0 when the value after
 *   `#page=` is non-numeric or negative (a warning is logged); null when the
 *   fragment contains no `#page=` entry at all.
 */
function parsePageFromHash() {
  // Capture whatever follows "#page=" (up to an optional "&") rather than
  // digits only, so malformed values such as "#page=abc" reach the validation
  // below instead of being reported as "no page in hash".
  const match = /#page=([^&]*)/.exec(location.hash);
  if (!match) return null; // No page in hash

  const rawValue = match[1];
  const page = Number.parseInt(rawValue, 10);
  if (Number.isNaN(page)) {
    console.warn(`Invalid page number in hash: ${rawValue}`);
    return 0; // Default to page 0 for invalid numbers
  }
  if (page < 0) {
    console.warn(`Negative page number in hash: ${page}`);
    return 0;
  }
  return page;
}
function updateFragment(){
history.replaceState(null,'',`#page=${currentPage}`)
// Set flag to prevent hashchange from triggering a page load
isUpdatingFragment=true;
history.replaceState(null,'',`#page=${currentPage}`);
// Use setTimeout to reset the flag after the event loop
setTimeout(()=>{
isUpdatingFragment=false;
},0);
}
function loadFragment(){
const match=/#page=(\d+)/.exec(location.hash);
if(match){
const page=parseInt(match[1]);
if(page>=0)page<totalPages?loadPage(page):loadDocument().then(()=>page<totalPages&&loadPage(page))
}else loadDocument()
// If document metadata is already loaded, handle fragment immediately
if(totalPages>0){
const page=parsePageFromHash();
if(page!==null){
handleHashPage(page);
}else{
// No valid hash, load page 0
loadPage(0);
}
}else{
// Document not loaded yet, load it then handle fragment
loadDocument().then(()=>{
const page=parsePageFromHash();
if(page!==null){
handleHashPage(page);
}else{
loadPage(0);
}
});
}
}
function setupTooltips(svg){

View file

@ -66,3 +66,63 @@ Generated: 2026-05-28
Copied from valid-minimal.pdf for SDK examples default path
Minimal valid PDF v1.4 fixture for contract method examples
Generated: 2026-05-31
# vector/academic-paper/source.pdf
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
Academic paper on machine learning - Abstract, Introduction, Methods, Results, Conclusion
Generated: 2026-06-01
# vector/technical-documentation/source.pdf
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
API documentation with Getting Started, Authentication, Endpoints, Rate Limits
Generated: 2026-06-01
# vector/legal-contract/source.pdf
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
Service Agreement with Services, Term, Compensation, Confidentiality, Termination, Governing Law
Generated: 2026-06-01
# vector/scientific-report/source.pdf
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
Climate Research Report with Executive Summary, Data Collection, Analysis, Findings, Recommendations
Generated: 2026-06-01
# vector/user-manual/source.pdf
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
Product User Manual with Quick Start Guide, Unboxing, Setup, Features, Troubleshooting, Support
Generated: 2026-06-01
# vector/financial-report/source.pdf
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
Q1 Financial Report with Revenue, Expenses, Net Income, Outlook, Risk Factors
Generated: 2026-06-01
# vector/conference-proceedings/source.pdf
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
Conference Proceedings with Keynote Address, Paper Session, Panel Discussion, Workshop
Generated: 2026-06-01
# vector/medical-research/source.pdf
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
Clinical Trial Results with Background, Methodology, Results, Discussion, Conclusion
Generated: 2026-06-01
# vector/multi-page-academic/source.pdf
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
Multi-page academic paper (3 pages) - Abstract, Introduction, Conclusion
Generated: 2026-06-01
# vector/code-documentation/source.pdf
Generated by tests/fixtures/vector/generate_vector_cer_corpus.py
Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding)
Code library documentation with Installation, Quick Example, API Reference, Supported Formats, Limitations, License
Generated: 2026-06-01

View file

@ -286,3 +286,13 @@ bash scripts/check-provenance.sh
| json_schema/EC-05-aes128-encrypted.pdf | Synthetic AES-128 encrypted PDF for JSON schema validation tests | MIT-0 | 2026-06-01 | ad83d1e4857cdf3f90cdabf8f69047aa7117636acebc5c5cecafe84e54ec2544 | AES-128 encrypted PDF for schema validation |
| json_schema/valid-minimal.pdf | Minimal valid PDF v1.4 fixture for JSON schema validation tests | MIT-0 | 2026-06-01 | 34dabcd045665fff5dc2b2e2930905c23226704b4bc318f0ec08344be889e447 | Minimal valid PDF v1.4 - single page with Hello World text |
| sample.pdf | tests/fixtures/valid-minimal.pdf (copied) | MIT-0 | 2026-05-31 | 34dabcd045665fff5dc2b2e2930905c23226704b4bc318f0ec08344be889e447 | Minimal valid PDF v1.4 fixture for SDK example default path |
| vector/academic-paper/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 08c5275a09704f9d286137b062578ad1582066cf0da84cccd4bc531ac2f4c43c | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
| vector/code-documentation/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 2e819d2dcd35bf49923b35fadf44bbad29b336cf9aa0a75f7370ae892be2232e | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
| vector/conference-proceedings/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 1661e53cbe9556a65e486c46f09e827432636b6b55764be2c08795c352113049 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
| vector/financial-report/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 6806e4dcbba266c1064c9d0e513cba510888c51e84505f2161a419561babdc43 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
| vector/legal-contract/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | f0f8cbcb865417342e7ac24922f1d624937dfa724db189c582bcdddbb651cada | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
| vector/medical-research/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 6883eda703738fc8f04111bac1e4ec561cfb5d14dd43f24ff9ea1ca0c13c9aa1 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
| vector/multi-page-academic/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 2e0b98e5ec502c4209db7ebd3e04d606df2f9fd0ec0a8e299632c42435d4bf5c | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
| vector/scientific-report/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | b8753af4d557705a13ab46980c562bc0491537781207b482455cc5ca37cbfbc5 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
| vector/technical-documentation/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | c84dceca0a4ad2ca6cf23133658a752388401b365f3c9b29674b5654d7e44c3c | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |
| vector/user-manual/source.pdf | tests/fixtures/vector/generate_vector_cer_corpus.py | MIT-0 | 2026-06-01 | 4a40278d7b9118bf7f7722bb0b768412727bdc858de4a053a30cf7a82ce29175 | Clean vector PDF with embedded text for CER testing (PDF 1.4, Type1 Helvetica, WinAnsiEncoding) |

View file

@ -0,0 +1,25 @@
# Academic Paper on Machine Learning - CER Test Fixture
## Purpose
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
## Files
- `source.pdf` - Clean vector PDF with embedded text
- `ground_truth.txt` - Exact text content for CER comparison
- `README.md` - This file
## Content
Abstract
This paper presents a novel approach to machine learning using deep neural networks.
Our method achieves state-of-the-art results on several benchmark datasets.
Introduction
Machine learning ...
## Expected CER
Target: < 0.5% character error rate when extracted by pdftract.
## Metadata
- Title: Academic Paper on Machine Learning
- Author: Jane Doe
- Creator: LaTeX
- Generated by: generate_vector_cer_corpus.py

View file

@ -0,0 +1,15 @@
Abstract
This paper presents a novel approach to machine learning using deep neural networks.
Our method achieves state-of-the-art results on several benchmark datasets.
Introduction
Machine learning has revolutionized the field of artificial intelligence in recent years.
Deep learning models have shown remarkable performance in various tasks.
Methods
We propose a new architecture that combines convolutional and recurrent layers.
The model is trained using stochastic gradient descent with momentum.
Results
Our experiments demonstrate a 15% improvement over existing baselines.
The training converges in fewer iterations compared to previous approaches.
Conclusion
We have presented a new method for deep learning that achieves better performance.
Future work will explore applications to other domains.

Binary file not shown.

View file

@ -0,0 +1,30 @@
# Code Library Documentation - CER Test Fixture
## Purpose
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
## Files
- `source.pdf` - Clean vector PDF with embedded text
- `ground_truth.txt` - Exact text content for CER comparison
- `README.md` - This file
## Content
libpdf - PDF Processing Library
Installation
pip install libpdf
Quick Example
from libpdf import Document
doc = Document('example.pdf')
text = doc.extract_text()
API Reference
Document.open(path)
Open...
## Expected CER
Target: < 0.5% character error rate when extracted by pdftract.
## Metadata
- Title: Code Library Documentation
- Author: Open Source Contributors
- Creator: Markdown
- Generated by: generate_vector_cer_corpus.py

View file

@ -0,0 +1,23 @@
libpdf - PDF Processing Library
Installation
pip install libpdf
Quick Example
from libpdf import Document
doc = Document('example.pdf')
text = doc.extract_text()
API Reference
Document.open(path)
Opens a PDF file for reading.
Document.extract_text()
Extracts all text content from the document.
Document.get_page_count()
Returns the number of pages in the document.
Supported Formats
PDF 1.0 through PDF 2.0
Encrypted PDFs (with password)
Forms and annotations
Limitations
OCR requires additional dependencies.
Very large files may require streaming mode.
License
MIT License - see LICENSE file for details.

Binary file not shown.

View file

@ -0,0 +1,24 @@
# Conference Proceedings - CER Test Fixture
## Purpose
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
## Files
- `source.pdf` - Clean vector PDF with embedded text
- `ground_truth.txt` - Exact text content for CER comparison
- `README.md` - This file
## Content
International Conference on Software Engineering 2024
Keynote Address
The future of software development in the age of artificial intelligence.
Main themes include automation, ethics, and human-comput...
## Expected CER
Target: < 0.5% character error rate when extracted by pdftract.
## Metadata
- Title: Conference Proceedings
- Author: Conference Committee
- Creator: LaTeX
- Generated by: generate_vector_cer_corpus.py

View file

@ -0,0 +1,14 @@
International Conference on Software Engineering 2024
Keynote Address
The future of software development in the age of artificial intelligence.
Main themes include automation, ethics, and human-computer interaction.
Paper Session
Machine Learning for Code Generation
This paper explores using large language models for automated code generation.
Results show a 40% reduction in development time for common tasks.
Panel Discussion
Industry experts discuss the challenges of deploying AI in production.
Key concerns include reliability, security, and maintainability.
Workshop
Hands-on workshop on implementing CI/CD pipelines for AI applications.
Participants learned best practices for testing and monitoring AI systems.

Binary file not shown.

View file

@ -0,0 +1,26 @@
# Q1 Financial Report - CER Test Fixture
## Purpose
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
## Files
- `source.pdf` - Clean vector PDF with embedded text
- `ground_truth.txt` - Exact text content for CER comparison
- `README.md` - This file
## Content
First Quarter 2024 Financial Results
Revenue
Total revenue for Q1 2024 was $2.5 million, a 15% increase year-over-year.
Product sales accounted for 70% of total revenue.
Expenses
Operating expenses we...
## Expected CER
Target: < 0.5% character error rate when extracted by pdftract.
## Metadata
- Title: Q1 Financial Report
- Author: CFO Office
- Creator: Excel
- Generated by: generate_vector_cer_corpus.py

View file

@ -0,0 +1,14 @@
First Quarter 2024 Financial Results
Revenue
Total revenue for Q1 2024 was $2.5 million, a 15% increase year-over-year.
Product sales accounted for 70% of total revenue.
Expenses
Operating expenses were $1.8 million for the quarter.
Research and development investment increased by 20%.
Net Income
Net income for Q1 was $500,000 with a net margin of 20%.
Outlook
We expect Q2 revenue to be between $2.6 and $2.8 million.
Full-year guidance remains unchanged at $11-12 million.
Risk Factors
Key risks include currency fluctuations and supply chain disruptions.

Binary file not shown.

View file

@ -0,0 +1,547 @@
#!/usr/bin/env python3
"""
Generate clean vector PDF fixtures for CER (Character Error Rate) testing.
Creates 5-10 clean LaTeX/Word-style PDFs with paired .txt ground-truth files
for the AS-01 scenario and <0.5% CER Tier 1 gate.
Usage: python3 generate_vector_cer_corpus.py
"""
import os
import struct
import zlib
# Target directory
FIXTURE_DIR = os.path.dirname(os.path.abspath(__file__))
def create_text_pdf(path, title, content, metadata=None):
    """
    Create a clean single-page vector PDF with embedded text for CER testing.

    The page uses the Type1 Helvetica font with WinAnsiEncoding. Each line of
    ``content`` is written as its own parenthesised literal string, so text
    extraction can recover the text line by line.

    Args:
        path: Destination file path; an existing file is overwritten.
        title: Document title, stored in the document information dictionary.
        content: Page text; must be encodable as latin-1.
        metadata: Optional dict; the 'author' and 'creator' keys are used.

    Raises:
        UnicodeEncodeError: If the text cannot be encoded as latin-1.
        OSError: If ``path`` cannot be written.

    NOTE(review): the bytes written differ from previously committed
    fixtures, so regenerate them and refresh any recorded checksums.
    """
    if metadata is None:
        metadata = {}

    # Escape special characters in PDF strings
    def escape_pdf_string(s):
        return s.replace('\\', '\\\\').replace('(', '\\(').replace(')', '\\)')

    # A string operand must be wrapped in parentheses, and Tj does not break
    # lines on its own: show each line separately and advance with T*.
    text_ops = [f"({escape_pdf_string(line)}) Tj T*" for line in content.splitlines()]
    content_stream = '\n'.join(
        ['BT', '/F1 12 Tf', '14 TL', '50 750 Td', *text_ops, 'ET']
    )
    compressed_content = zlib.compress(content_stream.encode('latin-1'))

    escaped_title = escape_pdf_string(title)
    escaped_author = escape_pdf_string(metadata.get('author', 'pdftract-test'))
    escaped_creator = escape_pdf_string(
        metadata.get('creator', 'generate_vector_cer_corpus.py')
    )

    # Object bodies in object-number order (1..6).
    objects = [
        # 1: document catalog
        b"<<\n/Type /Catalog\n/Pages 2 0 R\n>>",
        # 2: page tree
        b"<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>",
        # 3: the single page (all three dictionaries are closed)
        b"<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n"
        b"/Contents 4 0 R\n/Resources <<\n/Font <<\n/F1 5 0 R\n>>\n>>\n>>",
        # 4: compressed content stream; /Length counts only the stream data
        f"<<\n/Filter /FlateDecode\n/Length {len(compressed_content)}\n>>\nstream\n".encode('latin-1')
        + compressed_content
        + b"\nendstream",
        # 5: font
        b"<<\n/Type /Font\n/Subtype /Type1\n/BaseFont /Helvetica\n"
        b"/Encoding /WinAnsiEncoding\n>>",
        # 6: document information dictionary (referenced from the trailer)
        f"<<\n/Title ({escaped_title})\n/Author ({escaped_author})\n"
        f"/Creator ({escaped_creator})\n>>".encode('latin-1'),
    ]

    # Serialise the objects, recording the real byte offset of each one.
    pdf_bytes = bytearray(b"%PDF-1.4\n")
    offsets = []
    for number, body in enumerate(objects, start=1):
        offsets.append(len(pdf_bytes))
        pdf_bytes += f"{number} 0 obj\n".encode('latin-1') + body + b"\nendobj\n"

    # startxref must point at the "xref" keyword, so record it before writing.
    xref_start = len(pdf_bytes)
    pdf_bytes += f"xref\n0 {len(objects) + 1}\n".encode('latin-1')
    # Cross-reference entries are exactly 20 bytes, including the line ending.
    pdf_bytes += b"0000000000 65535 f \n"
    for offset in offsets:
        pdf_bytes += f"{offset:010d} 00000 n \n".encode('latin-1')

    pdf_bytes += (
        f"trailer\n<<\n/Size {len(objects) + 1}\n/Root 1 0 R\n/Info 6 0 R\n>>\n"
        f"startxref\n{xref_start}\n%%EOF\n"
    ).encode('latin-1')

    with open(path, 'wb') as f:
        f.write(pdf_bytes)
def create_multi_page_text_pdf(path, title, pages_content, metadata=None):
    """
    Create a multi-page PDF with embedded text for CER testing.

    One page is produced per entry of ``pages_content``. All pages share a
    single Type1 Helvetica font with WinAnsiEncoding, and each text line is
    written as its own parenthesised literal string.

    Args:
        path: Destination file path; an existing file is overwritten.
        title: Document title, stored in the document information dictionary.
        pages_content: Sequence of strings, one per page; each must be
            encodable as latin-1.
        metadata: Optional dict; the 'author' and 'creator' keys are used.

    Raises:
        UnicodeEncodeError: If the text cannot be encoded as latin-1.
        OSError: If ``path`` cannot be written.

    NOTE(review): the bytes written differ from previously committed
    fixtures, so regenerate them and refresh any recorded checksums.
    """
    if metadata is None:
        metadata = {}

    def escape_pdf_string(s):
        return s.replace('\\', '\\\\').replace('(', '\\(').replace(')', '\\)')

    escaped_title = escape_pdf_string(title)
    escaped_author = escape_pdf_string(metadata.get('author', 'pdftract-test'))
    escaped_creator = escape_pdf_string(
        metadata.get('creator', 'generate_vector_cer_corpus.py')
    )

    # Object numbering: 1 catalog, 2 page tree, 3 font, 4 info, then a
    # (page, content stream) pair per page starting at object 5.
    first_page_num = 5
    page_refs = [
        f"{first_page_num + 2 * i} 0 R" for i in range(len(pages_content))
    ]

    objects = [
        b"<<\n/Type /Catalog\n/Pages 2 0 R\n>>",
        f"<<\n/Type /Pages\n/Kids [{' '.join(page_refs)}]\n"
        f"/Count {len(pages_content)}\n>>".encode('latin-1'),
        b"<<\n/Type /Font\n/Subtype /Type1\n/BaseFont /Helvetica\n"
        b"/Encoding /WinAnsiEncoding\n>>",
        f"<<\n/Title ({escaped_title})\n/Author ({escaped_author})\n"
        f"/Creator ({escaped_creator})\n>>".encode('latin-1'),
    ]

    for i, page_content in enumerate(pages_content):
        content_num = first_page_num + 2 * i + 1

        # A string operand must be wrapped in parentheses, and Tj does not
        # break lines on its own: show each line separately, advance with T*.
        text_ops = [
            f"({escape_pdf_string(line)}) Tj T*"
            for line in page_content.splitlines()
        ]
        content_stream = '\n'.join(
            ['BT', '/F1 12 Tf', '14 TL', '50 750 Td', *text_ops, 'ET']
        )
        compressed = zlib.compress(content_stream.encode('latin-1'))

        # Page object (all three dictionaries are closed)
        objects.append(
            f"<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n"
            f"/Contents {content_num} 0 R\n/Resources <<\n/Font <<\n"
            f"/F1 3 0 R\n>>\n>>\n>>".encode('latin-1')
        )
        # Content stream; /Length counts only the compressed stream data
        objects.append(
            f"<<\n/Filter /FlateDecode\n/Length {len(compressed)}\n>>\nstream\n".encode('latin-1')
            + compressed
            + b"\nendstream"
        )

    # Serialise the objects, recording the real byte offset of each one.
    pdf_bytes = bytearray(b"%PDF-1.4\n")
    offsets = []
    for number, body in enumerate(objects, start=1):
        offsets.append(len(pdf_bytes))
        pdf_bytes += f"{number} 0 obj\n".encode('latin-1') + body + b"\nendobj\n"

    # startxref must point at the "xref" keyword, so record it before writing.
    xref_start = len(pdf_bytes)
    total_objects = len(objects) + 1  # includes the free entry for object 0
    pdf_bytes += f"xref\n0 {total_objects}\n".encode('latin-1')
    # Cross-reference entries are exactly 20 bytes, including the line ending.
    pdf_bytes += b"0000000000 65535 f \n"
    for offset in offsets:
        pdf_bytes += f"{offset:010d} 00000 n \n".encode('latin-1')

    pdf_bytes += (
        f"trailer\n<<\n/Size {total_objects}\n/Root 1 0 R\n/Info 4 0 R\n>>\n"
        f"startxref\n{xref_start}\n%%EOF\n"
    ).encode('latin-1')

    with open(path, 'wb') as f:
        f.write(pdf_bytes)
# Fixture definitions
FIXTURES = [
{
'name': 'academic-paper',
'title': 'Academic Paper on Machine Learning',
'content': """Abstract
This paper presents a novel approach to machine learning using deep neural networks.
Our method achieves state-of-the-art results on several benchmark datasets.
Introduction
Machine learning has revolutionized the field of artificial intelligence in recent years.
Deep learning models have shown remarkable performance in various tasks.
Methods
We propose a new architecture that combines convolutional and recurrent layers.
The model is trained using stochastic gradient descent with momentum.
Results
Our experiments demonstrate a 15% improvement over existing baselines.
The training converges in fewer iterations compared to previous approaches.
Conclusion
We have presented a new method for deep learning that achieves better performance.
Future work will explore applications to other domains.""",
'metadata': {'author': 'Jane Doe', 'creator': 'LaTeX'},
},
{
'name': 'technical-documentation',
'title': 'API Documentation',
'content': """Getting Started
To use the API, first obtain an authentication token from the dashboard.
Include this token in the Authorization header of all requests.
Authentication
All API requests require authentication using a Bearer token.
Tokens expire after 24 hours and must be refreshed.
Endpoints
GET /api/users - Retrieve a list of users
POST /api/users - Create a new user
GET /api/users/:id - Retrieve a specific user
PUT /api/users/:id - Update a user
DELETE /api/users/:id - Delete a user
Rate Limits
The API has a rate limit of 1000 requests per hour per user.
Exceeding this limit will result in a 429 Too Many Requests response.""",
'metadata': {'author': 'API Team', 'creator': 'Word'},
},
{
'name': 'legal-contract',
'title': 'Service Agreement',
'content': """SERVICE AGREEMENT
This Service Agreement is entered into as of January 1, 2024.
1. Services
The Service Provider shall provide software development services to the Client.
2. Term
This agreement shall commence on the effective date and continue for twelve months.
3. Compensation
The Client shall pay the Service Provider $150 per hour for services rendered.
Invoices shall be submitted monthly and are due within 30 days.
4. Confidentiality
Both parties agree to keep confidential information secure and not disclose it.
5. Termination
Either party may terminate this agreement with 30 days written notice.
6. Governing Law
This agreement shall be governed by the laws of the State of California.""",
'metadata': {'author': 'Legal Department', 'creator': 'Word'},
},
{
'name': 'scientific-report',
'title': 'Climate Research Report',
'content': """Executive Summary
This report analyzes climate data collected from 50 monitoring stations.
Key findings indicate a 1.2 degree Celsius increase over the past decade.
Data Collection
Temperature readings were recorded hourly from January to December 2023.
The monitoring stations are located across diverse geographic regions.
Analysis
Linear regression was applied to identify temperature trends.
Confidence intervals were calculated at the 95% level.
Findings
The data shows consistent warming across all monitoring stations.
Urban areas show higher temperature increases compared to rural locations.
Recommendations
We recommend continued monitoring and expanded data collection efforts.
Immediate action should be taken to reduce carbon emissions.""",
'metadata': {'author': 'Research Team', 'creator': 'LaTeX'},
},
{
'name': 'user-manual',
'title': 'Product User Manual',
'content': """Quick Start Guide
Thank you for purchasing our product. This guide will help you get started.
Unboxing
Carefully remove the product from the packaging.
Check that all items listed on the included card are present.
Setup
1. Connect the power adapter to a wall outlet.
2. Press and hold the power button for 3 seconds.
3. Follow the on-screen instructions to complete setup.
Features
- Wireless connectivity
- Touch screen interface
- Long battery life
- Compact design
Troubleshooting
If the device does not turn on, ensure the battery is charged.
For connection issues, restart your router and try again.
Support
For additional help, visit support.example.com or call 1-800-SUPPORT.""",
'metadata': {'author': 'Product Team', 'creator': 'Word'},
},
{
'name': 'financial-report',
'title': 'Q1 Financial Report',
'content': """First Quarter 2024 Financial Results
Revenue
Total revenue for Q1 2024 was $2.5 million, a 15% increase year-over-year.
Product sales accounted for 70% of total revenue.
Expenses
Operating expenses were $1.8 million for the quarter.
Research and development investment increased by 20%.
Net Income
Net income for Q1 was $500,000 with a net margin of 20%.
Outlook
We expect Q2 revenue to be between $2.6 and $2.8 million.
Full-year guidance remains unchanged at $11-12 million.
Risk Factors
Key risks include currency fluctuations and supply chain disruptions.""",
'metadata': {'author': 'CFO Office', 'creator': 'Excel'},
},
{
'name': 'conference-proceedings',
'title': 'Conference Proceedings',
'content': """International Conference on Software Engineering 2024
Keynote Address
The future of software development in the age of artificial intelligence.
Main themes include automation, ethics, and human-computer interaction.
Paper Session
Machine Learning for Code Generation
This paper explores using large language models for automated code generation.
Results show a 40% reduction in development time for common tasks.
Panel Discussion
Industry experts discuss the challenges of deploying AI in production.
Key concerns include reliability, security, and maintainability.
Workshop
Hands-on workshop on implementing CI/CD pipelines for AI applications.
Participants learned best practices for testing and monitoring AI systems.""",
'metadata': {'author': 'Conference Committee', 'creator': 'LaTeX'},
},
{
'name': 'medical-research',
'title': 'Clinical Trial Results',
'content': """Clinical Trial: Drug Efficacy Study
Background
This double-blind study evaluated the efficacy of Drug X for treating hypertension.
Methodology
500 patients were randomized into treatment and placebo groups.
The study duration was 24 weeks with regular monitoring.
Results
The treatment group showed a 25% greater reduction in systolic blood pressure.
Side effects were mild and reported in less than 5% of patients.
Discussion
Drug X demonstrates significant efficacy compared to placebo.
The safety profile is favorable with minimal adverse reactions.
Conclusion
Drug X is recommended for treatment of hypertension in adult patients.
Further studies should explore long-term effects and optimal dosing.""",
'metadata': {'author': 'Medical Research Institute', 'creator': 'LaTeX'},
},
{
'name': 'multi-page-academic',
'title': 'Multi-Page Academic Paper',
'pages': [
"""Abstract
This paper presents a comprehensive study of distributed systems.
Page 1 of 3""",
"""Introduction
Distributed systems form the backbone of modern cloud computing.
We explore consistency models and their practical implications.
Page 2 of 3""",
"""Conclusion
Our findings suggest new approaches to system design.
Future work will address scalability challenges.
Page 3 of 3""",
],
'metadata': {'author': 'Dr. Smith', 'creator': 'LaTeX'},
},
{
'name': 'code-documentation',
'title': 'Code Library Documentation',
'content': """libpdf - PDF Processing Library
Installation
pip install libpdf
Quick Example
from libpdf import Document
doc = Document('example.pdf')
text = doc.extract_text()
API Reference
Document.open(path)
Opens a PDF file for reading.
Document.extract_text()
Extracts all text content from the document.
Document.get_page_count()
Returns the number of pages in the document.
Supported Formats
PDF 1.0 through PDF 2.0
Encrypted PDFs (with password)
Forms and annotations
Limitations
OCR requires additional dependencies.
Very large files may require streaming mode.
License
MIT License - see LICENSE file for details.""",
'metadata': {'author': 'Open Source Contributors', 'creator': 'Markdown'},
},
]
def main():
"""Generate all vector CER corpus fixtures."""
# For every entry in FIXTURES this writes three files into
# FIXTURE_DIR/<name>/: source.pdf, ground_truth.txt and README.md.
print("Generating vector CER corpus fixtures...")
print(f"Target directory: {FIXTURE_DIR}")
for fixture in FIXTURES:
name = fixture['name']
title = fixture['title']
metadata = fixture.get('metadata', {})
# Create the fixture subdirectory (no error if it already exists)
fixture_dir = os.path.join(FIXTURE_DIR, name)
os.makedirs(fixture_dir, exist_ok=True)
# Create the PDF; a fixture with a 'pages' list is multi-page,
# otherwise its 'content' string becomes a single page
pdf_path = os.path.join(fixture_dir, 'source.pdf')
if 'pages' in fixture:
# Multi-page PDF
create_multi_page_text_pdf(pdf_path, title, fixture['pages'], metadata)
else:
# Single-page PDF
create_text_pdf(pdf_path, title, fixture['content'], metadata)
# Create the ground truth text file; for multi-page fixtures the pages
# are joined with a blank line between them
gt_path = os.path.join(fixture_dir, 'ground_truth.txt')
if 'pages' in fixture:
gt_content = '\n\n'.join(fixture['pages'])
else:
gt_content = fixture['content']
with open(gt_path, 'w', encoding='utf-8') as f:
f.write(gt_content)
# Create the README; its Content section shows the first 200 characters
# of the ground truth followed by "..." (appended unconditionally)
readme_path = os.path.join(fixture_dir, 'README.md')
with open(readme_path, 'w', encoding='utf-8') as f:
f.write(f"""# {title} - CER Test Fixture
## Purpose
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
## Files
- `source.pdf` - Clean vector PDF with embedded text
- `ground_truth.txt` - Exact text content for CER comparison
- `README.md` - This file
## Content
{gt_content[:200]}...
## Expected CER
Target: < 0.5% character error rate when extracted by pdftract.
## Metadata
- Title: {title}
- Author: {metadata.get('author', 'N/A')}
- Creator: {metadata.get('creator', 'N/A')}
- Generated by: generate_vector_cer_corpus.py
""")
print(f" Created {name}/")
print(f"\nGenerated {len(FIXTURES)} fixtures successfully!")
# Print a shell loop the user can run to extract every generated PDF
print("\nTo verify CER with pdftract:")
print(" for f in tests/fixtures/vector/*/source.pdf; do")
print(" pdftract extract \"$f\" --json /dev/null")
print(" done")
if __name__ == '__main__':
main()

View file

@ -0,0 +1,26 @@
# Service Agreement - CER Test Fixture
## Purpose
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
## Files
- `source.pdf` - Clean vector PDF with embedded text
- `ground_truth.txt` - Exact text content for CER comparison
- `README.md` - This file
## Content
SERVICE AGREEMENT
This Service Agreement is entered into as of January 1, 2024.
1. Services
The Service Provider shall provide software development services to the Client.
2. Term
This agreement shall...
## Expected CER
Target: < 0.5% character error rate when extracted by pdftract.
## Metadata
- Title: Service Agreement
- Author: Legal Department
- Creator: Word
- Generated by: generate_vector_cer_corpus.py

View file

@ -0,0 +1,15 @@
SERVICE AGREEMENT
This Service Agreement is entered into as of January 1, 2024.
1. Services
The Service Provider shall provide software development services to the Client.
2. Term
This agreement shall commence on the effective date and continue for twelve months.
3. Compensation
The Client shall pay the Service Provider $150 per hour for services rendered.
Invoices shall be submitted monthly and are due within 30 days.
4. Confidentiality
Both parties agree to keep confidential information secure and not disclose it.
5. Termination
Either party may terminate this agreement with 30 days written notice.
6. Governing Law
This agreement shall be governed by the laws of the State of California.

Binary file not shown.

View file

@ -0,0 +1,25 @@
# Clinical Trial Results - CER Test Fixture
## Purpose
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
## Files
- `source.pdf` - Clean vector PDF with embedded text
- `ground_truth.txt` - Exact text content for CER comparison
- `README.md` - This file
## Content
Clinical Trial: Drug Efficacy Study
Background
This double-blind study evaluated the efficacy of Drug X for treating hypertension.
Methodology
500 patients were randomized into treatment and placebo g...
## Expected CER
Target: < 0.5% character error rate when extracted by pdftract.
## Metadata
- Title: Clinical Trial Results
- Author: Medical Research Institute
- Creator: LaTeX
- Generated by: generate_vector_cer_corpus.py

View file

@ -0,0 +1,15 @@
Clinical Trial: Drug Efficacy Study
Background
This double-blind study evaluated the efficacy of Drug X for treating hypertension.
Methodology
500 patients were randomized into treatment and placebo groups.
The study duration was 24 weeks with regular monitoring.
Results
The treatment group showed a 25% greater reduction in systolic blood pressure.
Side effects were mild and reported in less than 5% of patients.
Discussion
Drug X demonstrates significant efficacy compared to placebo.
The safety profile is favorable with minimal adverse reactions.
Conclusion
Drug X is recommended for treatment of hypertension in adult patients.
Further studies should explore long-term effects and optimal dosing.

Binary file not shown.

View file

@ -0,0 +1,27 @@
# Multi-Page Academic Paper - CER Test Fixture
## Purpose
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
## Files
- `source.pdf` - Clean vector PDF with embedded text
- `ground_truth.txt` - Exact text content for CER comparison
- `README.md` - This file
## Content
Abstract
This paper presents a comprehensive study of distributed systems.
Page 1 of 3
Introduction
Distributed systems form the backbone of modern cloud computing.
We explore consistency models and ...
## Expected CER
Target: < 0.5% character error rate when extracted by pdftract.
## Metadata
- Title: Multi-Page Academic Paper
- Author: Dr. Smith
- Creator: LaTeX
- Generated by: generate_vector_cer_corpus.py

View file

@ -0,0 +1,13 @@
Abstract
This paper presents a comprehensive study of distributed systems.
Page 1 of 3
Introduction
Distributed systems form the backbone of modern cloud computing.
We explore consistency models and their practical implications.
Page 2 of 3
Conclusion
Our findings suggest new approaches to system design.
Future work will address scalability challenges.
Page 3 of 3

Binary file not shown.

View file

@ -0,0 +1,25 @@
# Climate Research Report - CER Test Fixture
## Purpose
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
## Files
- `source.pdf` - Clean vector PDF with embedded text
- `ground_truth.txt` - Exact text content for CER comparison
- `README.md` - This file
## Content
Executive Summary
This report analyzes climate data collected from 50 monitoring stations.
Key findings indicate a 1.2 degree Celsius increase over the past decade.
Data Collection
Temperature reading...
## Expected CER
Target: < 0.5% character error rate when extracted by pdftract.
## Metadata
- Title: Climate Research Report
- Author: Research Team
- Creator: LaTeX
- Generated by: generate_vector_cer_corpus.py

View file

@ -0,0 +1,15 @@
Executive Summary
This report analyzes climate data collected from 50 monitoring stations.
Key findings indicate a 1.2 degree Celsius increase over the past decade.
Data Collection
Temperature readings were recorded hourly from January to December 2023.
The monitoring stations are located across diverse geographic regions.
Analysis
Linear regression was applied to identify temperature trends.
Confidence intervals were calculated at the 95% level.
Findings
The data shows consistent warming across all monitoring stations.
Urban areas show higher temperature increases compared to rural locations.
Recommendations
We recommend continued monitoring and expanded data collection efforts.
Immediate action should be taken to reduce carbon emissions.

View file

@ -0,0 +1,63 @@
%PDF-1.4
1 0 obj
<<
/Type /Catalog
/Pages 2 0 R
/Title (Climate Research Report)
/Author (Research Team)
/Creator (LaTeX)
>>
endobj
2 0 obj
<<
/Type /Pages
/Kids [3 0 R]
/Count 1
>>
endobj
3 0 obj
<<
/Type /Page
/Parent 2 0 R
/MediaBox [0 0 612 792]
/Contents 4 0 R
/Resources <<
/Font <<
/F1 5 0 R
>>
>>
endobj
4 0 obj
<<
/Filter /FlateDecode
/Length 444
>>
stream
xœmRK<EFBFBD>Ó0¾ûWÌ…kØ­GèîJìr ÎS{ ø<11>ív˯g©‡=DV<ö÷ô—ѼÚÁnãdîïà£~£3<C2A3>¯dkáÁ<>ÊÅŒ gZ“Àˆþò—2XÏ <0B>Ã`“÷d 9˜$P <50>"—$ §˜óB˜8:ÝÍÐVÛÍBò™kZ!ÌéDe!X1=eÑÑ`롳*¶)¬$Xª<58>*ÅÎp¦íÏ&qªlIUü¥ |ÆXÕd)•c·ÿ0¨WzK; BùÔô:@+)gpš¨Â™Ò,¸.l•lîN?·˜2gó<67>´<C2A1>PÎ:„3*ÚºzV$ågG±ð¤Zn,¡èæ<>âÔXÒ@
É ýÕEo«ïrÊϧûwàéD~0O׌77[AyIgíLµq.J¨*$4ƒW/èýÛ•ý”#Ææ^U7Xx^Z#7jÿwÕªg7cR}ϬC}×&BPc}Ãüêål[MZáXõê<C3B5> Ô½®ºèþíCk9Ò4éƒTܯŠàx{I}¢:«wpÔñÅM ¹j[jrlWo]äÆßæqü#
endstream
endobj
5 0 obj
<<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
>>
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000098 00000 n
0000000173 00000 n
0000000803 00000 n
0000000820 00000 n
trailer
<<
/Size 6
/Root 1 0 R
>>
startxref
1048
%%EOF

View file

@ -0,0 +1,25 @@
# API Documentation - CER Test Fixture
## Purpose
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
## Files
- `source.pdf` - Clean vector PDF with embedded text
- `ground_truth.txt` - Exact text content for CER comparison
- `README.md` - This file
## Content
Getting Started
To use the API, first obtain an authentication token from the dashboard.
Include this token in the Authorization header of all requests.
Authentication
All API requests require authent...
## Expected CER
Target: < 0.5% character error rate when extracted by pdftract.
## Metadata
- Title: API Documentation
- Author: API Team
- Creator: Word
- Generated by: generate_vector_cer_corpus.py

View file

@ -0,0 +1,15 @@
Getting Started
To use the API, first obtain an authentication token from the dashboard.
Include this token in the Authorization header of all requests.
Authentication
All API requests require authentication using a Bearer token.
Tokens expire after 24 hours and must be refreshed.
Endpoints
GET /api/users - Retrieve a list of users
POST /api/users - Create a new user
GET /api/users/:id - Retrieve a specific user
PUT /api/users/:id - Update a user
DELETE /api/users/:id - Delete a user
Rate Limits
The API has a rate limit of 1000 requests per hour per user.
Exceeding this limit will result in a 429 Too Many Requests response.

View file

@ -0,0 +1,63 @@
%PDF-1.4
1 0 obj
<<
/Type /Catalog
/Pages 2 0 R
/Title (API Documentation)
/Author (API Team)
/Creator (Word)
>>
endobj
2 0 obj
<<
/Type /Pages
/Kids [3 0 R]
/Count 1
>>
endobj
3 0 obj
<<
/Type /Page
/Parent 2 0 R
/MediaBox [0 0 612 792]
/Contents 4 0 R
/Resources <<
/Font <<
/F1 5 0 R
>>
>>
endobj
4 0 obj
<<
/Filter /FlateDecode
/Length 368
>>
stream
xœmáNÂ0…ÿ÷)îèÈ$$<>»:ÚÙv>½·<C2BD>€&[ÖäœûuçÜ©ƒÇ! G +q“Ã-¿R9†@æ6A¹€ZH <0B>G5Âdµ¸Šœ`wAÅOÇ Tª@Ö@°oh rvŸF´òõÎ*§3±0eÓéH"ÿccBâ2Â:úê 5*<2A>lªiÀá{‡>øLLÎ.ù‡~ é@/¨ó1<C3B3>)*ÇØtqÆ©øã<01>m©K£1Ô¶sžSiØwœr‡L­ú9@atkÉ/æ…„<E280A6>jiÀͰÿÖá ¡XOI«çÍ…õÁ¡
Ñhð<EFBFBD>L´Á=és¢o±¤ŠÊÞ½ÚþãÞ¶º‡&ˬX²øëšaƒ'×:N,iOHöÛ…ZqzpQi¢ƒ ó<?µÜrQ±¥tˆîåX"êØrZm?x ´<ß5!®YÁxtÒZxRæ“Óý.Í·ÖxÌ@¾ŠB~-Àá­
endstream
endobj
5 0 obj
<<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
>>
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000098 00000 n
0000000173 00000 n
0000000715 00000 n
0000000732 00000 n
trailer
<<
/Size 6
/Root 1 0 R
>>
startxref
960
%%EOF

View file

@ -0,0 +1,25 @@
# Product User Manual - CER Test Fixture
## Purpose
This fixture is used for Character Error Rate (CER) testing in the vector PDF corpus.
## Files
- `source.pdf` - Clean vector PDF with embedded text
- `ground_truth.txt` - Exact text content for CER comparison
- `README.md` - This file
## Content
Quick Start Guide
Thank you for purchasing our product. This guide will help you get started.
Unboxing
Carefully remove the product from the packaging.
Check that all items listed on the included card...
## Expected CER
Target: < 0.5% character error rate when extracted by pdftract.
## Metadata
- Title: Product User Manual
- Author: Product Team
- Creator: Word
- Generated by: generate_vector_cer_corpus.py

View file

@ -0,0 +1,19 @@
Quick Start Guide
Thank you for purchasing our product. This guide will help you get started.
Unboxing
Carefully remove the product from the packaging.
Check that all items listed on the included card are present.
Setup
1. Connect the power adapter to a wall outlet.
2. Press and hold the power button for 3 seconds.
3. Follow the on-screen instructions to complete setup.
Features
- Wireless connectivity
- Touch screen interface
- Long battery life
- Compact design
Troubleshooting
If the device does not turn on, ensure the battery is charged.
For connection issues, restart your router and try again.
Support
For additional help, visit support.example.com or call 1-800-SUPPORT.

Binary file not shown.