pdftract/crates/pdftract-core/tests/struct_tree_coverage.rs
jedarden 68fbbba816 fix(pdftract-4pnmd): build.rs doc comment format string parsing
- Fix format! macro parsing issue in build.rs by extracting doc comment
- Move doc comment with example code outside format! string
- Add verification note for pdftract-4pnmd documenting fallback implementation

Files modified:
- crates/pdftract-core/build.rs: Extract doc comment to fix format! parsing
- notes/pdftract-4pnmd.md: Add verification note

The non-Range server fallback implementation is already complete:
- download_to_temp_and_mmap function downloads entire file to temp
- TempMmapSource wrapper keeps temp file alive
- Fallback logic integrated in open_source and open_remote
- Diagnostics REMOTE_NO_RANGE_SUPPORT and REMOTE_INSUFFICIENT_DISK emitted
- Ureq handles gzip decompression transparently

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 14:36:45 -04:00

232 lines
7.7 KiB
Rust

//! Integration tests for Phase 7.1.4: StructTree coverage check and XY-cut fallback.
//!
//! These tests verify the full extraction pipeline with /MarkInfo /Suspects flag
//! and the coverage-based fallback to XY-cut reading order.
//!
//! Acceptance criteria from pdftract-2w3r:
//! - PDF with Suspects true falls back to XY-cut, reading_order_algorithm = "xy_cut"
//! - Unit tests: Suspects false + 50% coverage -> no fallback
//! - Unit tests: Suspects true + 95% coverage -> no fallback
//! - Unit tests: Suspects true + 60% coverage -> fallback
//! - Per-page diagnostic appears in receipts when fallback triggers
//! - Integration: full pipeline test on tagged-suspects-true.pdf fixture produces expected reading order
use pdftract_core::extract::extract_pdf;
use pdftract_core::options::ExtractionOptions;
use std::path::PathBuf;
/// Locate a test fixture by file name.
///
/// Candidate locations are probed in order and the first one that exists is returned:
///
/// 1. `tests/fixtures/<name>` relative to the current directory,
/// 2. `../../tests/fixtures/<name>` relative to the current directory,
/// 3. `$CARGO_MANIFEST_DIR/../../tests/fixtures/<name>` (only when the variable is set).
///
/// When no candidate exists, the locations that were tried are reported on stderr and
/// the first candidate is returned unchanged. This helper deliberately does not panic
/// in that case: the tests calling it check `exists()` on the returned path and skip
/// themselves when the fixture is absent, which a panic here would make unreachable.
fn get_fixture_path(fixture_name: &str) -> PathBuf {
    let mut candidates = vec![
        PathBuf::from("tests/fixtures").join(fixture_name),
        PathBuf::from("../../tests/fixtures").join(fixture_name),
    ];
    if let Ok(manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") {
        candidates.push(
            PathBuf::from(manifest_dir)
                .join("../../tests/fixtures")
                .join(fixture_name),
        );
    }

    // First existing candidate wins; order of the remaining entries is irrelevant
    // once we return, so `swap_remove` is fine for moving the path out.
    if let Some(index) = candidates.iter().position(|path| path.exists()) {
        return candidates.swap_remove(index);
    }

    // Nothing found: keep the diagnostic, but let the caller decide whether a
    // missing fixture is fatal.
    eprintln!("Fixture {} not found. Tried:", fixture_name);
    for (number, path) in candidates.iter().enumerate() {
        eprintln!("  {}. {}", number + 1, path.display());
    }
    candidates.swap_remove(0)
}
#[test]
fn test_suspects_true_fallback_to_xy_cut() {
    // Full-pipeline check for the acceptance criterion:
    //   "PDF with Suspects true falls back to XY-cut, reading_order_algorithm = 'xy_cut'"
    //
    // The tagged-suspects-true.pdf fixture is expected to contain:
    //   - /MarkInfo /Suspects true
    //   - a StructTree with coverage < 80% (e.g., 60%)
    let pdf_path = get_fixture_path("tagged-suspects-true.pdf");

    // Guard: when the resolved path is not on disk, print a hint on how to
    // generate the fixture and return without running the pipeline.
    if !pdf_path.exists() {
        println!("WARNING: Fixture tagged-suspects-true.pdf not found, skipping integration test");
        println!("To create this fixture, run: cargo run --manifest-path=tests/fixtures/Cargo.toml --bin generate_suspects_fixture");
        return;
    }

    let opts = ExtractionOptions {
        receipts: pdftract_core::options::ReceiptsMode::Off,
        max_parallel_pages: 1,
        memory_budget_mb: 512,
        full_render: false,
        ocr_dpi_override: None,
        ocr_language: vec!["eng".to_string()],
        markdown_anchors: false,
        max_decompress_bytes: 512 * 1024 * 1024,
        output: Default::default(),
        pages: None,
        password: None,
        http_headers: None,
    };

    // Any extraction error fails the test immediately.
    let extracted = extract_pdf(&pdf_path, &opts)
        .unwrap_or_else(|e| panic!("Extraction failed: {}", e));

    // Suspects + low coverage must switch the reading order to XY-cut.
    let algorithm = extracted
        .metadata
        .reading_order_algorithm
        .expect("reading_order_algorithm should be set");
    assert_eq!(
        algorithm,
        "xy_cut",
        "Expected reading_order_algorithm='xy_cut' for Suspects true with low coverage, got '{}'",
        algorithm
    );

    println!(
        "Integration test passed: reading_order_algorithm = '{}'",
        algorithm
    );
}
#[test]
fn test_suspects_false_trusts_tree() {
    // With Suspects false the StructTree is trusted even when its coverage is low.
    //
    // The tagged-suspects-false.pdf fixture is expected to contain:
    //   - /MarkInfo /Suspects false
    //   - a StructTree with coverage < 80%
    // Expected outcome: reading_order_algorithm = "struct_tree"
    let pdf_path = get_fixture_path("tagged-suspects-false.pdf");

    // Guard: when the resolved path is not on disk, warn and return early.
    if !pdf_path.exists() {
        println!("WARNING: Fixture tagged-suspects-false.pdf not found, skipping integration test");
        return;
    }

    let opts = ExtractionOptions {
        receipts: pdftract_core::options::ReceiptsMode::Off,
        max_parallel_pages: 1,
        memory_budget_mb: 512,
        full_render: false,
        ocr_dpi_override: None,
        ocr_language: vec!["eng".to_string()],
        markdown_anchors: false,
        max_decompress_bytes: 512 * 1024 * 1024,
        output: Default::default(),
        pages: None,
        password: None,
        http_headers: None,
    };

    // Any extraction error fails the test immediately.
    let extracted = extract_pdf(&pdf_path, &opts)
        .unwrap_or_else(|e| panic!("Extraction failed: {}", e));

    // Low coverage alone must not trigger the fallback.
    let algorithm = extracted
        .metadata
        .reading_order_algorithm
        .expect("reading_order_algorithm should be set");
    assert_eq!(
        algorithm, "struct_tree",
        "Expected reading_order_algorithm='struct_tree' for Suspects false, got '{}'",
        algorithm
    );

    println!(
        "Integration test passed: reading_order_algorithm = '{}'",
        algorithm
    );
}
#[test]
fn test_suspects_true_high_coverage_no_fallback() {
    // Suspects true combined with high coverage (>= 80%) must NOT trigger the fallback.
    //
    // The tagged-suspects-true-high-coverage.pdf fixture is expected to contain:
    //   - /MarkInfo /Suspects true
    //   - a StructTree with coverage >= 80%
    // Expected outcome: reading_order_algorithm = "struct_tree"
    let pdf_path = get_fixture_path("tagged-suspects-true-high-coverage.pdf");

    // Guard: when the resolved path is not on disk, warn and return early.
    if !pdf_path.exists() {
        println!("WARNING: Fixture tagged-suspects-true-high-coverage.pdf not found, skipping integration test");
        return;
    }

    let opts = ExtractionOptions {
        receipts: pdftract_core::options::ReceiptsMode::Off,
        max_parallel_pages: 1,
        memory_budget_mb: 512,
        full_render: false,
        ocr_dpi_override: None,
        ocr_language: vec!["eng".to_string()],
        markdown_anchors: false,
        max_decompress_bytes: 512 * 1024 * 1024,
        output: Default::default(),
        pages: None,
        password: None,
        http_headers: None,
    };

    // Any extraction error fails the test immediately.
    let extracted = extract_pdf(&pdf_path, &opts)
        .unwrap_or_else(|e| panic!("Extraction failed: {}", e));

    // High coverage keeps the StructTree-based reading order.
    let algorithm = extracted
        .metadata
        .reading_order_algorithm
        .expect("reading_order_algorithm should be set");
    assert_eq!(
        algorithm, "struct_tree",
        "Expected reading_order_algorithm='struct_tree' for high coverage, got '{}'",
        algorithm
    );

    println!(
        "Integration test passed: reading_order_algorithm = '{}'",
        algorithm
    );
}