pdftract/crates/pdftract-core/tests/test_xref_debug.rs
jedarden e6bf3dd290 feat(pdftract-3s2i): implement Phase 5.5.2 validation filter
Implement per-word validation filter for assisted-OCR BrokenVector path.

Changes:
- Add SpanSource::OcrAssisted variant to hybrid.rs
- Add Span::ocr_assisted() helper method
- Implement validate_ocr_with_position_hints() in ocr.rs
  - 5pt distance threshold for position validation
  - 0.4 confidence cap for rejected words
  - Linear scan for nearest-neighbor lookup
- Add unit tests for validation filter

Closes: pdftract-3s2i

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-24 04:57:17 -04:00

73 lines
2.2 KiB
Rust

//! Debug test for xref parsing issues
use pdftract_core::parser::stream::{FileSource, PdfSource};
use pdftract_core::parser::xref::load_xref_with_prev_chain;
/// Debug aid: finds the `startxref` offset near the end of a fixture PDF,
/// loads the xref data from that offset, and prints what was found.
///
/// The test returns early without failing when the fixture cannot be opened.
/// It panics when the tail read fails, when the `startxref` keyword is not in
/// the bytes read, or when the text after the keyword does not parse as a `u64`.
#[test]
fn test_debug_xref_parsing() {
    // Keyword that precedes the byte offset of the last xref section.
    const KEYWORD: &[u8] = b"startxref";

    let fixture = "tests/fixtures/tagged-suspects-true.pdf";
    let source = match FileSource::open(std::path::Path::new(fixture)) {
        Err(e) => {
            // A missing fixture is reported but does not fail the test.
            eprintln!("Failed to open file: {}", e);
            return;
        }
        Ok(opened) => opened,
    };

    // Request 1024 bytes starting 1024 bytes before the end of the file
    // (the start position saturates at 0 for shorter files).
    let total_len = source.len().unwrap() as usize;
    let tail_start = total_len.saturating_sub(1024) as u64;
    let tail = source.read_at(tail_start, 1024).unwrap();

    // Search backwards so the last occurrence of the keyword wins.
    let keyword_at = tail
        .windows(KEYWORD.len())
        .rposition(|chunk| chunk == KEYWORD)
        .expect("startxref not found");

    // Everything after the keyword: optional whitespace, then the offset.
    let after_keyword = &tail[keyword_at + KEYWORD.len()..];
    let leading_ws = after_keyword
        .iter()
        .take_while(|&&b| matches!(b, b' ' | b'\r' | b'\n' | b'\t'))
        .count();
    let from_offset = &after_keyword[leading_ws..];

    // The offset runs up to the next line break (or the end of the data).
    let line_len = from_offset
        .iter()
        .take_while(|&&b| b != b'\n' && b != b'\r')
        .count();
    let offset_text = std::str::from_utf8(&from_offset[..line_len]).unwrap();
    let startxref = offset_text.trim().parse::<u64>().unwrap();
    println!("startxref offset: {}", startxref);

    // Load the xref data starting from the offset just parsed.
    let xref_section = load_xref_with_prev_chain(&source, startxref);
    println!("Xref entries: {}", xref_section.entries.len());

    // Report whether object 1 has an xref entry.
    match xref_section.entries.get(&1) {
        Some(entry) => println!("Object 1 xref entry: {:?}", entry),
        None => println!("Object 1 NOT FOUND in xref"),
    }

    // Report the trailer keys and its /Root entry; nothing is printed when
    // there is no trailer.
    if let Some(trailer) = &xref_section.trailer {
        println!("Trailer keys: {:?}", trailer.keys().collect::<Vec<_>>());
        match trailer.get("Root") {
            Some(root_obj) => println!("Trailer /Root: {:?}", root_obj),
            None => println!("Trailer /Root NOT FOUND"),
        }
    }
}