The indent trigger was using `.abs()`, which fired on both increased indent (non-indented → indented) AND decreased indent (indented → non-indented). This caused drop-cap-style paragraphs (indented first line, flush-left continuation) to be incorrectly split into two blocks. Per plan Phase 4.4 heuristic #2, an indent change should trigger only when the current line is MORE indented (further to the right, larger x0) than the block average, i.e., when a new paragraph starts after non-indented text. It should NOT trigger for decreased indent (first line indented, rest flush-left). Fix: remove `.abs()` and only check whether `line_x0 - block_avg_x0 > threshold`. Tests: test_indented_first_line_new_block: PASS (non-indented → indented splits); test_indented_first_line_of_paragraph_not_split: PASS (drop cap stays together); all 179 line module tests: PASS.
118 lines
4.4 KiB
Rust
118 lines
4.4 KiB
Rust
//! Debug test for fingerprint content stream resolution.
|
|
|
|
use pdftract_core::document::parse_pdf_file;
|
|
use pdftract_core::fingerprint::{compute_fingerprint, ContentStreamData, FingerprintInput, PageFingerprintData};
|
|
use pdftract_core::parser::xref::XrefResolver;
|
|
|
|
/// Debug aid: parses the `content_edit_one_glyph/v1.pdf` fixture and prints what the
/// resolver returns for the catalog's pages reference and for each page content stream.
///
/// Nothing is asserted. The test panics only when `CARGO_MANIFEST_DIR` is unset or when
/// `parse_pdf_file` returns an error for the fixture.
#[test]
fn debug_content_stream_resolution() {
    // The fixture path is built from the directory two levels above the crate manifest;
    // when that ancestor does not exist, the manifest directory itself is used instead.
    let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").unwrap();
    let crate_root = std::path::Path::new(&manifest_dir);
    let fixtures_root = crate_root.ancestors().nth(2).unwrap_or(crate_root);
    let fixture_path =
        fixtures_root.join("tests/fingerprint/fixtures/content_edit_one_glyph/v1.pdf");

    println!("DEBUG: fixture_path = {:?}", fixture_path);
    println!("DEBUG: file exists = {:?}", fixture_path.exists());

    // Load the document; everything below only inspects the parsed result.
    let parsed = parse_pdf_file(&fixture_path);
    let (fingerprint, catalog, pages, resolver) = parsed.expect("Failed to parse PDF");

    println!("Fingerprint from parse_pdf_file: {}", fingerprint);
    println!("Number of pages: {}", pages.len());
    println!("Catalog pages_ref: {:?}", catalog.pages_ref);

    // Resolve the catalog's pages reference and report which kind of object came back.
    println!("=== Resolving catalog.pages_ref ===");
    match resolver.resolve(catalog.pages_ref) {
        Err(err) => println!(" -> ERROR: {:?}", err),
        Ok(resolved) => {
            let kind = std::mem::discriminant(&resolved);
            println!(" -> Discriminant: {:?}", kind);
            match resolved.as_dict() {
                Some(dict) => {
                    println!(" -> IS DICT!");
                    // Only the first ten entries are listed to keep the output short.
                    dict.iter().take(10).for_each(|(key, value)| {
                        println!(" {} -> {:?}", key, std::mem::discriminant(value));
                    });
                }
                None if resolved.is_null() => println!(" -> IS NULL (stub resolver)"),
                None => {}
            }
        }
    }

    // Walk every page and try to resolve each of its content stream references.
    for (page_idx, page) in pages.iter().enumerate() {
        println!("=== Page {} ===", page_idx);
        println!("Content streams: {}", page.contents.len());

        for (stream_idx, content_ref) in page.contents.iter().copied().enumerate() {
            println!(" Stream {} = {:?}", stream_idx, content_ref);

            // Resolution is attempted WITHOUT a source (expected to yield Null).
            println!(" Resolve WITHOUT source:");
            match resolver.resolve(content_ref) {
                Err(err) => println!(" -> ERROR: {:?}", err),
                Ok(resolved) => {
                    let kind = std::mem::discriminant(&resolved);
                    println!(" -> Discriminant: {:?}", kind);
                    match resolved.as_stream() {
                        Some(stream) => {
                            println!(" -> IS STREAM! Length: {:?}", stream.dict.get("/Length"));
                            // Show each dictionary key together with the kind of its value.
                            let entry_kinds = stream
                                .dict
                                .iter()
                                .map(|(k, v)| (k, std::mem::discriminant(v)))
                                .collect::<Vec<_>>();
                            println!(" -> Dict: {:?}", entry_kinds);
                        }
                        None if resolved.is_null() => println!(" -> IS NULL (stub resolver)"),
                        None => {}
                    }
                }
            }
        }

        println!("MediaBox: {:?}", page.media_box);
        println!("Rotate: {}", page.rotate);
    }
}
|
|
|
|
/// Checks that two single-page inputs whose direct content streams differ by one glyph
/// (`Hello World` vs `Hello Worl`) produce different fingerprints.
///
/// Both inputs are built by the same local closure, so the content bytes are the only
/// thing that differs between them.
#[test]
fn debug_direct_content_stream_hash() {
    let resolver = XrefResolver::new();

    // Builds a one-page fingerprint input around the given raw content stream bytes.
    // Every other field is fixed: no resources, a 612x792 media box, no crop box,
    // no rotation, untagged, and default catalog flags.
    let single_page_input = |content: &[u8]| FingerprintInput {
        page_count: 1,
        pages: vec![PageFingerprintData {
            content_streams: vec![ContentStreamData::Direct(content.to_vec())],
            resources: None,
            media_box: [0.0, 0.0, 612.0, 792.0],
            crop_box: None,
            rotate: 0,
        }],
        struct_tree_root_ref: None,
        is_tagged: false,
        catalog_flags: Default::default(),
    };

    // Test with direct content streams (no source needed)
    let input_v1 = single_page_input(b"BT /F1 12 Tf 50 700 Td (Hello World) Tj ET");
    let input_v2 = single_page_input(b"BT /F1 12 Tf 50 700 Td (Hello Worl) Tj ET");

    // NOTE(review): the third argument is presumably the optional source; `None` is
    // passed because the content is supplied directly. Confirm against the signature.
    let fp_v1 = compute_fingerprint(&input_v1, &resolver, None);
    let fp_v2 = compute_fingerprint(&input_v2, &resolver, None);

    println!("Direct content v1 fingerprint: {}", fp_v1);
    println!("Direct content v2 fingerprint: {}", fp_v2);

    assert_ne!(fp_v1, fp_v2, "Different direct content streams must produce different fingerprints");
}
|