Implement per-word validation filter for assisted-OCR BrokenVector path. Changes: - Add SpanSource::OcrAssisted variant to hybrid.rs - Add Span::ocr_assisted() helper method - Implement validate_ocr_with_position_hints() in ocr.rs - 5pt distance threshold for position validation - 0.4 confidence cap for rejected words - Linear scan for nearest-neighbor lookup - Add unit tests for validation filter Closes: pdftract-3s2i Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
490 lines
16 KiB
Rust
490 lines
16 KiB
Rust
//! Page classification fixture tests.
//!
//! This module tests the page classification system against the 4 critical
//! fixtures in tests/fixtures/page_class/:
//! - vector_pure: Pure text PDF (born-digital)
//! - scanned_single: Image-only PDF (scanned)
//! - brokenvector_pdfa: PDF/A with invisible text over image
//! - hybrid_header_body: Text header + scanned body (hybrid)
//!
//! Acceptance criteria (from plan.md Phase 5.1):
//! - All 4 fixtures classify correctly
//! - Confidence >= confidence_min for each fixture
//! - Reproducibility: classifying the same fixture twice produces identical JSON output
|
|
|
|
use std::fs;
|
|
use std::path::{Path, PathBuf};
|
|
|
|
/// Relative path of the directory that holds the page classification fixtures.
const FIXTURE_DIR: &'static str = "tests/fixtures/page_class";
|
|
|
|
/// Expected classification from fixture's expected.json
|
|
#[derive(Debug, serde::Deserialize)]
|
|
struct ExpectedClassification {
|
|
/// Expected page class
|
|
class: String,
|
|
/// Minimum confidence threshold
|
|
confidence_min: f32,
|
|
/// For Hybrid: array of cell indices, null for non-hybrid
|
|
hybrid_cells: Option<Vec<usize>>,
|
|
}
|
|
|
|
/// Page classification fixture
|
|
struct Fixture {
|
|
/// Fixture name (directory name)
|
|
name: String,
|
|
/// Path to source PDF
|
|
pdf_path: PathBuf,
|
|
/// Expected classification
|
|
expected: ExpectedClassification,
|
|
}
|
|
|
|
/// Get the fixture directory path, handling both workspace and crate test locations
|
|
fn get_fixture_dir() -> PathBuf {
|
|
// Try workspace root first (when running from workspace)
|
|
let workspace_path = Path::new(FIXTURE_DIR);
|
|
if workspace_path.exists() {
|
|
return workspace_path.to_path_buf();
|
|
}
|
|
|
|
// Try from crate directory (when running from crate tests)
|
|
let crate_path = Path::new("../../tests/fixtures/page_class");
|
|
if crate_path.exists() {
|
|
return crate_path.to_path_buf();
|
|
}
|
|
|
|
// Try using CARGO_MANIFEST_DIR
|
|
if let Ok(manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") {
|
|
let from_manifest = PathBuf::from(manifest_dir).join("../../tests/fixtures/page_class");
|
|
if from_manifest.exists() {
|
|
return from_manifest;
|
|
}
|
|
}
|
|
|
|
// Fallback: panic with helpful message
|
|
panic!(
|
|
"Fixture directory not found. Tried:\n 1. {}\n 2. {}\n 3. $CARGO_MANIFEST_DIR/../../tests/fixtures/page_class",
|
|
workspace_path.display(),
|
|
crate_path.display()
|
|
);
|
|
}
|
|
|
|
/// Discover all page classification fixtures
|
|
fn discover_fixtures() -> Vec<Fixture> {
|
|
let fixtures_base = get_fixture_dir();
|
|
let mut fixtures = Vec::new();
|
|
|
|
let entries = fs::read_dir(fixtures_base)
|
|
.unwrap_or_else(|e| panic!("Failed to read fixture directory {}: {e}", FIXTURE_DIR));
|
|
|
|
for entry in entries {
|
|
let entry = entry.expect("Failed to read directory entry");
|
|
let path = entry.path();
|
|
|
|
// Skip non-directories
|
|
if !path.is_dir() {
|
|
continue;
|
|
}
|
|
|
|
let name = path
|
|
.file_name()
|
|
.expect("No file name")
|
|
.to_string_lossy()
|
|
.to_string();
|
|
|
|
let pdf_path = path.join("source.pdf");
|
|
let expected_path = path.join("expected.json");
|
|
|
|
// Skip if required files are missing
|
|
if !pdf_path.exists() {
|
|
eprintln!("WARNING: Missing source.pdf in {name}");
|
|
continue;
|
|
}
|
|
if !expected_path.exists() {
|
|
eprintln!("WARNING: Missing expected.json in {name}");
|
|
continue;
|
|
}
|
|
|
|
// Read expected.json
|
|
let expected_json = fs::read_to_string(&expected_path)
|
|
.unwrap_or_else(|e| panic!("Failed to read expected.json in {name}: {e}"));
|
|
let expected: ExpectedClassification = serde_json::from_str(&expected_json)
|
|
.unwrap_or_else(|e| panic!("Failed to parse expected.json in {name}: {e}"));
|
|
|
|
fixtures.push(Fixture {
|
|
name,
|
|
pdf_path,
|
|
expected,
|
|
});
|
|
}
|
|
|
|
// Sort for deterministic order
|
|
fixtures.sort_by(|a, b| a.name.cmp(&b.name));
|
|
|
|
fixtures
|
|
}
|
|
|
|
/// Create a mock PageContext for a fixture based on its expected classification.
|
|
///
|
|
/// This is a simplified implementation that creates the appropriate PageContext
|
|
/// to trigger the expected classification. In a full integration test, this would
|
|
/// parse the actual PDF and analyze its content streams.
|
|
fn create_page_context_for_fixture(fixture: &Fixture) -> pdftract_core::classify::PageContext {
|
|
use pdftract_core::classify::{CellData, PageContext};
|
|
|
|
match fixture.expected.class.as_str() {
|
|
"Vector" => {
|
|
// Pure vector: high text ops, high char validity, no images
|
|
let mut ctx = PageContext::new();
|
|
ctx.text_op_count = 500;
|
|
ctx.raw_char_count = 3000;
|
|
ctx.valid_char_count = 2900;
|
|
ctx.invisible_text_count = 0;
|
|
ctx.replacement_char_count = 50;
|
|
ctx.image_coverage = 0.0;
|
|
ctx.has_full_page_image = false;
|
|
ctx.has_visible_text = true;
|
|
ctx.density_ratio = 0.95;
|
|
ctx.width = 612.0;
|
|
ctx.height = 792.0;
|
|
ctx.rotation = 0;
|
|
ctx.grid_cells = None;
|
|
ctx
|
|
}
|
|
"Scanned" => {
|
|
// Scanned: no text ops, high image coverage
|
|
let mut ctx = PageContext::new();
|
|
ctx.text_op_count = 0;
|
|
ctx.raw_char_count = 0;
|
|
ctx.valid_char_count = 0;
|
|
ctx.invisible_text_count = 0;
|
|
ctx.replacement_char_count = 0;
|
|
ctx.image_coverage = 0.95;
|
|
ctx.has_full_page_image = true;
|
|
ctx.has_visible_text = false;
|
|
ctx.density_ratio = 0.0;
|
|
ctx.width = 612.0;
|
|
ctx.height = 792.0;
|
|
ctx.rotation = 0;
|
|
ctx.grid_cells = None;
|
|
ctx
|
|
}
|
|
"BrokenVector" => {
|
|
// BrokenVector: invisible text + full-page image
|
|
let mut ctx = PageContext::new();
|
|
ctx.text_op_count = 100;
|
|
ctx.raw_char_count = 1000;
|
|
ctx.valid_char_count = 1000;
|
|
ctx.invisible_text_count = 100; // All text is Tr=3
|
|
ctx.replacement_char_count = 0;
|
|
ctx.image_coverage = 0.95;
|
|
ctx.has_full_page_image = true;
|
|
ctx.has_visible_text = false;
|
|
ctx.density_ratio = 0.30;
|
|
ctx.width = 612.0;
|
|
ctx.height = 792.0;
|
|
ctx.rotation = 0;
|
|
ctx.grid_cells = None;
|
|
ctx
|
|
}
|
|
"Hybrid" => {
|
|
// Hybrid: text header + scanned body (grid-based detection)
|
|
let mut ctx = PageContext::new();
|
|
ctx.text_op_count = 200;
|
|
ctx.raw_char_count = 1500;
|
|
ctx.valid_char_count = 1400;
|
|
ctx.invisible_text_count = 0;
|
|
ctx.replacement_char_count = 50;
|
|
ctx.image_coverage = 0.70;
|
|
ctx.has_full_page_image = false;
|
|
ctx.has_visible_text = true;
|
|
ctx.density_ratio = 0.50;
|
|
ctx.width = 612.0;
|
|
ctx.height = 792.0;
|
|
ctx.rotation = 0;
|
|
|
|
// Set up grid cells: top 2 rows vector, bottom 6 rows scanned
|
|
let cells: [CellData; 64] = std::array::from_fn(|i| {
|
|
let row = i / 8;
|
|
if row < 2 {
|
|
// Vector cells (text header)
|
|
CellData {
|
|
text_op_count: 15,
|
|
image_coverage: 0.05,
|
|
char_validity: 0.95,
|
|
}
|
|
} else {
|
|
// Scanned cells (body)
|
|
CellData {
|
|
text_op_count: 0,
|
|
image_coverage: 0.90,
|
|
char_validity: 0.0,
|
|
}
|
|
}
|
|
});
|
|
ctx.grid_cells = Some(cells);
|
|
|
|
ctx
|
|
}
|
|
_ => {
|
|
panic!("Unknown expected class: {}", fixture.expected.class);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Convert PageClass enum to string for comparison
|
|
fn page_class_to_string(class: pdftract_core::classify::PageClass) -> String {
|
|
match class {
|
|
pdftract_core::classify::PageClass::Vector => "Vector".to_string(),
|
|
pdftract_core::classify::PageClass::Scanned => "Scanned".to_string(),
|
|
pdftract_core::classify::PageClass::Hybrid => "Hybrid".to_string(),
|
|
pdftract_core::classify::PageClass::BrokenVector => "BrokenVector".to_string(),
|
|
}
|
|
}
|
|
|
|
/// Classify every discovered fixture and compare the result with its `expected.json`.
///
/// Per fixture this checks the class label, the minimum confidence, and the
/// `hybrid_cells` contract: present and equal to the expected set for Hybrid,
/// absent for every other class.
#[test]
fn test_page_classification_fixtures() {
    let fixtures = discover_fixtures();
    let fixture_count = fixtures.len();

    assert!(
        fixture_count >= 4,
        "Expected at least 4 fixtures, found {}",
        fixture_count
    );

    println!("Testing {} page classification fixtures:", fixture_count);

    for fixture in fixtures.iter() {
        let expected = &fixture.expected;
        println!(" - {}", fixture.name);

        // Build the mock context and run the classifier on it.
        let ctx = create_page_context_for_fixture(fixture);
        let result = pdftract_core::classify::classify_page(&ctx);
        let actual_class = page_class_to_string(result.class);

        // The class label must match.
        assert_eq!(
            actual_class, expected.class,
            "Fixture '{}' classified as {:?}, expected {}",
            fixture.name, result.class, expected.class
        );

        // The confidence must reach the fixture's threshold.
        assert!(
            result.confidence >= expected.confidence_min,
            "Fixture '{}' confidence {} below threshold {}",
            fixture.name,
            result.confidence,
            expected.confidence_min
        );

        // Non-Hybrid classifications should not have hybrid_cells.
        if expected.class != "Hybrid" {
            assert!(
                result.hybrid_cells.is_none(),
                "Fixture '{}' (non-Hybrid) has unexpected hybrid_cells: {:?}",
                fixture.name,
                result.hybrid_cells
            );
            continue;
        }

        // Hybrid: hybrid_cells must be present and equal to the expected set.
        assert!(
            result.hybrid_cells.is_some(),
            "Fixture '{}' expected hybrid_cells to be present, but got None",
            fixture.name
        );
        let expected_cells: std::collections::BTreeSet<usize> = expected
            .hybrid_cells
            .as_deref()
            .expect("Hybrid fixture must have hybrid_cells array")
            .iter()
            .copied()
            .collect();
        assert_eq!(
            result.hybrid_cells.as_ref().unwrap(),
            &expected_cells,
            "Fixture '{}' hybrid_cells mismatch",
            fixture.name
        );
    }

    println!("All fixtures passed!");
}
|
|
|
|
/// Reproducibility gate: classifying the same context twice must serialize to
/// identical pretty-printed JSON for every discovered fixture.
#[test]
fn test_page_classification_reproducibility() {
    let fixtures = discover_fixtures();

    for fixture in fixtures.iter() {
        let ctx = create_page_context_for_fixture(fixture);

        // Two independent classifications of the very same context.
        let first = pdftract_core::classify::classify_page(&ctx);
        let second = pdftract_core::classify::classify_page(&ctx);

        let first_json =
            serde_json::to_string_pretty(&first).expect("Failed to serialize result1");
        let second_json =
            serde_json::to_string_pretty(&second).expect("Failed to serialize result2");

        // The serialized forms must be byte-identical.
        assert_eq!(
            first_json, second_json,
            "Fixture '{}' produced different JSON on second classification\nFirst: {}\nSecond: {}",
            fixture.name, first_json, second_json
        );
    }

    println!(
        "Reproducibility check passed for {} fixtures",
        fixtures.len()
    );
}
|
|
|
|
/// Test that every fixture PDF exists and is non-empty, and that the combined size
/// of all fixture PDFs stays below 1 MB (1_000_000 bytes).
///
/// The printed total uses the same decimal megabyte as the asserted limit, so the
/// summary line and a failing assertion cannot disagree.
#[test]
fn test_fixture_files_exist_and_size() {
    // Single source of truth for the limit: used by both the summary and the assertion.
    const MAX_TOTAL_BYTES: u64 = 1_000_000;

    let fixtures = discover_fixtures();
    let mut total_size = 0u64;

    for fixture in &fixtures {
        // Check PDF exists
        assert!(
            fixture.pdf_path.exists(),
            "Fixture '{}' PDF not found: {}",
            fixture.name,
            fixture.pdf_path.display()
        );

        // Check PDF is not empty
        let metadata = fixture
            .pdf_path
            .metadata()
            .expect("Failed to get PDF metadata");
        assert!(
            metadata.len() > 0,
            "Fixture '{}' PDF is empty",
            fixture.name
        );

        total_size += metadata.len();

        println!(" {}: {} bytes", fixture.name, metadata.len());
    }

    println!(
        "Total fixture size: {} bytes ({} MB)",
        total_size,
        total_size as f64 / MAX_TOTAL_BYTES as f64
    );

    // Check total size < 1 MB
    assert!(
        total_size < MAX_TOTAL_BYTES,
        "Total fixture size {} bytes exceeds 1 MB limit",
        total_size
    );
}
|
|
|
|
/// Sanity-check every parsed `expected.json`: `confidence_min` must lie in
/// `[0.0, 1.0]` and `class` must be one of the four known class names.
#[test]
fn test_expected_json_validity() {
    // Class names accepted in expected.json.
    const VALID_CLASSES: [&str; 4] = ["Vector", "Scanned", "Hybrid", "BrokenVector"];

    let fixtures = discover_fixtures();

    for fixture in fixtures.iter() {
        let expected = &fixture.expected;

        // confidence_min must be a valid confidence value.
        assert!(
            (0.0..=1.0).contains(&expected.confidence_min),
            "Fixture '{}' has invalid confidence_min: {}",
            fixture.name,
            expected.confidence_min
        );

        // class must be one of the known names.
        assert!(
            VALID_CLASSES.iter().any(|known| expected.class == *known),
            "Fixture '{}' has invalid class: {}",
            fixture.name,
            expected.class
        );
    }

    println!("All expected.json files are valid");
}
|
|
|
|
/// Test that the reproducibility gate fails on an intentional perturbation.
///
/// A vector-style context is classified twice, the confidence of the second result is
/// nudged by 0.01, and the same `assert_eq!` used by the reproducibility test is run
/// under `catch_unwind`. The test passes only if that assertion panics and its message
/// carries the full diff: both labels and both serialized results.
#[test]
fn test_reproducibility_gate_with_perturbation() {
    use pdftract_core::classify::{classify_page, PageContext};

    // Create a page context for a vector page
    let mut ctx = PageContext::new();
    ctx.text_op_count = 500;
    ctx.raw_char_count = 3000;
    ctx.valid_char_count = 2900;
    ctx.image_coverage = 0.0;
    ctx.density_ratio = 0.95;
    ctx.has_visible_text = true;

    // Classify twice
    let result1 = classify_page(&ctx);
    let mut result2 = classify_page(&ctx);

    // Intentionally perturb the confidence
    result2.confidence += 0.01;

    // Serialize both results to JSON
    let json1 = serde_json::to_string_pretty(&result1).expect("Failed to serialize result1");
    let json2 = serde_json::to_string_pretty(&result2).expect("Failed to serialize result2");

    // This should fail because we perturbed the confidence. The closure only borrows
    // the JSON strings, so they remain available for the checks below.
    let outcome = std::panic::catch_unwind(|| {
        assert_eq!(
            json1, json2,
            "Reproducibility gate should fail on perturbation\nFirst: {}\nSecond: {}",
            json1, json2
        );
    });

    // Verify the gate did panic (it caught the perturbation)
    let panic_payload = match outcome {
        Err(payload) => payload,
        Ok(()) => panic!("Reproducibility gate should have failed on perturbation"),
    };

    // Panic payloads are usually a `String` (formatted message) or a `&str` (literal).
    let panic_msg = panic_payload
        .downcast_ref::<String>()
        .cloned()
        .or_else(|| {
            panic_payload
                .downcast_ref::<&str>()
                .map(|s| (*s).to_string())
        })
        .unwrap_or_else(|| "Unknown panic message".to_string());

    // Require the whole diff, not just any one marker: both labels must be present...
    assert!(
        panic_msg.contains("First:") && panic_msg.contains("Second:"),
        "Panic message should contain diff information, got: {}",
        panic_msg
    );
    // ...and so must both serialized results.
    assert!(
        panic_msg.contains(json1.as_str()) && panic_msg.contains(json2.as_str()),
        "Panic message should contain diff information, got: {}",
        panic_msg
    );
}
|