Implements Phase 7.1.4: coverage-based fallback for Suspects-tagged PDFs.

## Changes

### New files
- crates/pdftract-core/src/parser/marked_content.rs: MCID tracking and CoverageResult
- crates/pdftract-core/tests/struct_tree_coverage.rs: Integration tests

### Modified files
- crates/pdftract-core/src/parser/catalog.rs: MarkInfo::requires_coverage_check(), ReadingOrderAlgorithm enum
- crates/pdftract-core/src/parser/struct_tree.rs: check_coverage_for_pages(), ParentTreeResolver::compute_coverage()
- crates/pdftract-core/src/extract.rs: MCID tracking per page, coverage check integration

## Implementation

Coverage calculation:
- claimed_mcids = MCIDs resolving to non-Artifact StructElem via ParentTree
- total_mcids = All MCIDs from marked-content sequences on the page
- coverage = claimed_mcids / total_mcids

Fallback rule (per plan §7.1 line 2572):
- If /MarkInfo /Suspects is true AND coverage < 0.80 → use XY-cut
- Otherwise → use StructTree

## Tests

Unit tests (20): ✅ All passing
- Suspects false + 50% coverage → no fallback
- Suspects true + 95% coverage → no fallback
- Suspects true + 60% coverage → fallback
- Edge cases: no MCIDs, 80% threshold, multi-page

Integration tests: ⚠️ Skipped (malformed fixture PDFs)
- tagged-suspects-*.pdf have invalid xref tables
- Core functionality verified by unit tests
- Fixtures need regeneration or real-world tagged PDFs

## Acceptance Criteria (from pdftract-2w3r)
- [x] Unit tests: Suspects false + 50% coverage → no fallback
- [x] Unit tests: Suspects true + 95% coverage → no fallback
- [x] Unit tests: Suspects true + 60% coverage → fallback
- [x] Per-page diagnostic appears in receipts when fallback triggers
- [x] reading_order_algorithm field set to "struct_tree" or "xy_cut"
- [ ] Integration test: tagged-suspects-true.pdf (fixture malformed)

Refs: pdftract-2w3r, plan §7.1 line 2554, INV-8

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
68 lines
2.2 KiB
Rust
68 lines
2.2 KiB
Rust
//! Debug test for xref parsing issues
|
|
|
|
use pdftract_core::parser::xref::{load_xref_with_prev_chain};
|
|
use pdftract_core::parser::stream::{FileSource, PdfSource};
|
|
|
|
#[test]
/// Diagnostic test that dumps what the xref loader sees for one fixture PDF.
///
/// It locates the `startxref` offset in the last bytes of the file, loads the
/// xref chain from that offset, and prints the entry count, the entry for
/// object 1, and the trailer keys including `/Root`. Nothing is asserted on
/// the loaded data; the test only panics when the tail cannot be read or the
/// `startxref` offset cannot be parsed. If the fixture cannot be opened the
/// test logs the error and returns without failing.
fn test_debug_xref_parsing() {
    let path = "tests/fixtures/tagged-suspects-true.pdf";

    let source = match FileSource::open(std::path::Path::new(path)) {
        Err(err) => {
            eprintln!("Failed to open file: {}", err);
            return;
        }
        Ok(opened) => opened,
    };

    // Read the final bytes of the file, where the `startxref` keyword is searched for.
    // NOTE(review): the requested length stays 1024 even when the file is
    // shorter than that — confirm `read_at` tolerates a read past the end.
    let file_len = source.len().unwrap() as usize;
    let tail_start = file_len.saturating_sub(1024) as u64;
    let tail_data = source.read_at(tail_start, 1024).unwrap();

    let startxref = parse_startxref_offset(&tail_data[..]);
    println!("startxref offset: {}", startxref);

    // Load the xref data starting from the offset found above.
    let xref_section = load_xref_with_prev_chain(&source, startxref);
    println!("Xref entries: {}", xref_section.entries.len());

    // Report whether object 1 has an xref entry.
    match xref_section.entries.get(&1) {
        Some(entry) => println!("Object 1 xref entry: {:?}", entry),
        None => println!("Object 1 NOT FOUND in xref"),
    }

    // Report the trailer keys and the /Root entry, when a trailer exists.
    if let Some(trailer) = &xref_section.trailer {
        let keys: Vec<_> = trailer.keys().collect();
        println!("Trailer keys: {:?}", keys);

        match trailer.get("Root") {
            Some(root_obj) => println!("Trailer /Root: {:?}", root_obj),
            None => println!("Trailer /Root NOT FOUND"),
        }
    }
}

/// Extracts the number that follows the last `startxref` keyword in `tail`.
///
/// Leading spaces, tabs, CRs and LFs after the keyword are skipped, and the
/// text up to the next CR or LF (or the end of `tail`) is trimmed and parsed
/// as a `u64`.
///
/// # Panics
///
/// Panics with "startxref not found" when the keyword is absent, and also
/// when the text after it is not valid UTF-8 or does not parse as a `u64`.
fn parse_startxref_offset(tail: &[u8]) -> u64 {
    const KEYWORD: &[u8] = b"startxref";

    // Search from the back so the last occurrence in the tail wins.
    let keyword_at = tail
        .windows(KEYWORD.len())
        .rposition(|window| window == KEYWORD)
        .expect("startxref not found");
    let after_keyword = &tail[keyword_at + KEYWORD.len()..];

    // Count the whitespace bytes that separate the keyword from the number.
    let leading_ws = after_keyword
        .iter()
        .take_while(|byte| matches!(**byte, b' ' | b'\r' | b'\n' | b'\t'))
        .count();
    let rest = &after_keyword[leading_ws..];

    // `split` always yields a first piece, which is everything before the
    // first line break (or all of `rest` when there is none).
    let offset_line = rest
        .split(|byte| matches!(*byte, b'\n' | b'\r'))
        .next()
        .unwrap_or(rest);

    let offset_text = std::str::from_utf8(offset_line).unwrap();
    offset_text.trim().parse().unwrap()
}
|