The indent trigger was using .abs() which fired on both increased indent (non-indented → indented) AND decreased indent (indented → non-indented). This caused drop-cap style paragraphs (indented first line, flush-left continuation) to incorrectly split into two blocks. Per plan Phase 4.4 heuristic #2, indent change should only trigger when the current line is MORE indented (to the right, larger x0) than the block average - i.e., a new paragraph starting after non-indented text. It should NOT trigger for decreased indent (first line indented, rest flush-left). Fix: Remove .abs() and only check if line_x0 - block_avg_x0 > threshold. Tests: - test_indented_first_line_new_block: PASS (non-indented → indented splits) - test_indented_first_line_of_paragraph_not_split: PASS (drop cap stays together) - All 179 line module tests: PASS
248 lines
8.5 KiB
Rust
248 lines
8.5 KiB
Rust
//! Unicode recovery tests for the Phase 2.2–2.5 no-ToUnicode corpus.
//!
//! Tests Unicode recovery from PDFs without ToUnicode CMaps, exercising:
//! - Level 2: AGL (Adobe Glyph List) fallback lookup
//! - Level 3: SHA-256 font program fingerprint matching
//! - Level 4: Glyph shape recognition (glyph-shapes.json DB)
//!
//! Reference: Plan section Phase 2.2-2.5, lines 263-2450
//! Acceptance criteria: ≥90% recovery rate on this corpus (Tier 1 CI gate)
|
||
|
||
use pdftract_core::document::PdfExtractor;
|
||
use std::path::Path;
|
||
use std::fs;
|
||
|
||
/// One entry of the no-ToUnicode corpus: a PDF plus the text it should yield.
///
/// All fields are `'static` string slices so the fixture table can be written
/// as plain literals.
struct EncodingFixture {
    /// Short identifier for the fixture, used in log and panic messages.
    name: &'static str,
    /// Location of the PDF to extract from, as handed to the extractor.
    pdf_path: &'static str,
    /// Location of the text file holding the expected extraction result.
    truth_path: &'static str,
    /// Human-readable note on what the fixture is meant to exercise.
    description: &'static str,
}
|
||
|
||
/// Calculate the character error rate (CER) between extracted text and ground truth.
///
/// CER = (substitutions + insertions + deletions) / ground_truth_length, where the
/// numerator is the Levenshtein edit distance counted in Unicode scalar values
/// (`char`s), not bytes.
///
/// Returns 0.0 if both strings are identical. When `ground_truth` is empty the
/// denominator is taken as 1, so the result equals the number of extracted
/// characters. The result can exceed 1.0 when many insertions are needed.
fn calculate_cer(extracted: &str, ground_truth: &str) -> f64 {
    if extracted == ground_truth {
        return 0.0;
    }

    let extract_chars: Vec<char> = extracted.chars().collect();
    let truth_chars: Vec<char> = ground_truth.chars().collect();
    let truth_len = truth_chars.len();

    // Two-row Levenshtein. `prev[j]` is the distance between the extracted prefix
    // processed so far and the first `j` ground-truth characters; the initial row
    // (empty extracted prefix) is therefore 0, 1, 2, ..., truth_len.
    let mut prev: Vec<usize> = (0..=truth_len).collect();
    let mut curr: Vec<usize> = vec![0; truth_len + 1];

    for (i, &extract_ch) in extract_chars.iter().enumerate() {
        // Matching `i + 1` extracted characters against an empty truth prefix
        // costs `i + 1` deletions.
        curr[0] = i + 1;

        for (j, &truth_ch) in truth_chars.iter().enumerate() {
            // Each candidate is computed in full before taking the minimum.
            // Writing `a + cost.min(b).min(c)` would bind `.min` to `cost` alone
            // and silently drop the insertion/deletion candidates.
            let substitution = prev[j] + usize::from(extract_ch != truth_ch);
            let deletion = prev[j + 1] + 1;
            let insertion = curr[j] + 1;
            curr[j + 1] = substitution.min(deletion).min(insertion);
        }

        std::mem::swap(&mut prev, &mut curr);
    }

    // After the final swap `prev` holds the row for the full extracted string.
    let edits = prev[truth_len];
    edits as f64 / truth_len.max(1) as f64
}
|
||
|
||
/// Calculate Unicode recovery rate.
|
||
///
|
||
/// Recovery rate = 1.0 - CER, clamped to [0, 1].
|
||
/// A recovery rate of 1.0 means perfect extraction.
|
||
/// A recovery rate of 0.9 means ≥90% of characters were recovered correctly.
|
||
fn calculate_recovery_rate(extracted: &str, ground_truth: &str) -> f64 {
|
||
let cer = calculate_cer(extracted, ground_truth);
|
||
(1.0 - cer).max(0.0).min(1.0)
|
||
}
|
||
|
||
/// Get all encoding fixtures with their configuration.
|
||
fn get_fixtures() -> Vec<EncodingFixture> {
|
||
vec![
|
||
EncodingFixture {
|
||
name: "no-mapping",
|
||
pdf_path: "../../tests/fixtures/encoding/no-mapping.pdf",
|
||
truth_path: "../../tests/fixtures/encoding/no-mapping.txt",
|
||
description: "PDF with no ToUnicode, no standard encoding (worst case)",
|
||
},
|
||
EncodingFixture {
|
||
name: "agl-only",
|
||
pdf_path: "../../tests/fixtures/encoding/agl-only.pdf",
|
||
truth_path: "../../tests/fixtures/encoding/agl-only.txt",
|
||
description: "PDF with AGL glyph names only (Level 2 recovery)",
|
||
},
|
||
EncodingFixture {
|
||
name: "fingerprint-match",
|
||
pdf_path: "../../tests/fixtures/encoding/fingerprint-match.pdf",
|
||
truth_path: "../../tests/fixtures/encoding/fingerprint-match.txt",
|
||
description: "PDF with embedded font for fingerprint matching (Level 3)",
|
||
},
|
||
EncodingFixture {
|
||
name: "shape-match",
|
||
pdf_path: "../../tests/fixtures/encoding/shape-match.pdf",
|
||
truth_path: "../../tests/fixtures/encoding/shape-match.txt",
|
||
description: "PDF with subset font for shape recognition (Level 4)",
|
||
},
|
||
]
|
||
}
|
||
|
||
/// Test a single encoding fixture and return recovery metrics.
|
||
fn test_encoding_fixture(fixture: &EncodingFixture) -> Result<FixtureResult, Box<dyn std::error::Error>> {
|
||
let pdf_path = Path::new(fixture.pdf_path);
|
||
|
||
// Open the PDF
|
||
let mut extractor = PdfExtractor::open(pdf_path)
|
||
.map_err(|e| format!("Failed to open PDF: {}", e))?;
|
||
|
||
// Materialize pages for extraction
|
||
extractor.materialize_pages()
|
||
.map_err(|e| format!("Failed to materialize pages: {}", e))?;
|
||
|
||
// Extract text from first page (all fixtures have single pages)
|
||
let page_extraction = extractor.extract_page(0)
|
||
.map_err(|e| format!("Failed to extract page: {}", e))?;
|
||
|
||
// Concatenate text from all blocks
|
||
let extracted_text: String = page_extraction.blocks
|
||
.iter()
|
||
.map(|block| block.text.as_str())
|
||
.collect::<Vec<&str>>()
|
||
.join("");
|
||
|
||
let ground_truth = fs::read_to_string(fixture.truth_path)
|
||
.map_err(|e| format!("Failed to read ground truth: {}", e))?;
|
||
|
||
let cer = calculate_cer(&extracted_text, &ground_truth);
|
||
let recovery_rate = calculate_recovery_rate(&extracted_text, &ground_truth);
|
||
|
||
Ok(FixtureResult {
|
||
name: fixture.name,
|
||
extracted: extracted_text,
|
||
ground_truth,
|
||
cer,
|
||
recovery_rate,
|
||
})
|
||
}
|
||
|
||
/// Outcome of running extraction against one encoding fixture.
#[derive(Debug)]
struct FixtureResult {
    /// Name of the fixture this result belongs to.
    name: &'static str,
    /// Text produced by the extractor.
    extracted: String,
    /// Expected text, as read from the fixture's ground-truth file.
    ground_truth: String,
    /// Character error rate of `extracted` relative to `ground_truth`.
    cer: f64,
    /// Recovery rate derived from the CER, in the range [0, 1].
    recovery_rate: f64,
}
|
||
|
||
/// Smoke test for the worst-case fixture (first entry of the fixture list).
///
/// no-mapping.pdf has custom glyph names that don't map to AGL, so the current
/// implementation may emit U+FFFD or recover via shape recognition. For now this
/// only checks that extraction completes and the metrics are within bounds.
#[test]
fn test_no_mapping_fixture() {
    let fixtures = get_fixtures();
    let outcome = test_encoding_fixture(&fixtures[0]).unwrap();

    assert!(outcome.cer >= 0.0, "CER should be non-negative");
    assert!(outcome.recovery_rate <= 1.0, "Recovery rate should be ≤ 1.0");
}
|
||
|
||
/// AGL glyph-name lookup must recover the AGL-only fixture's text exactly.
///
/// The fixture is the second entry of the fixture list; its expected text is
/// "Hello\nWorld".
#[test]
fn test_agl_only_fixture() {
    let fixtures = get_fixtures();
    let result = test_encoding_fixture(&fixtures[1]).unwrap();

    // Leading/trailing whitespace (for example a final newline in the truth file)
    // is not part of what this test checks. All three assertions therefore use the
    // same trimmed strings; previously only the equality check trimmed, so the
    // metric checks could fail on text the equality check had just accepted.
    let extracted = result.extracted.trim();
    let ground_truth = result.ground_truth.trim();

    assert_eq!(extracted, ground_truth,
        "AGL-only fixture should recover text correctly via glyph name mapping");
    assert_eq!(calculate_cer(extracted, ground_truth), 0.0,
        "CER should be 0 for perfect match");
    assert_eq!(calculate_recovery_rate(extracted, ground_truth), 1.0,
        "Recovery rate should be 1.0 for perfect match");
}
|
||
|
||
/// Smoke test for the fingerprint-match fixture (third entry of the fixture list).
///
/// Fingerprint matching should recover "Test" if the font is in the DB. This is
/// currently a placeholder - the actual fingerprint DB is populated in Phase 2.2 -
/// so only a sanity bound on the CER is asserted.
#[test]
fn test_fingerprint_match_fixture() {
    let fixtures = get_fixtures();
    let outcome = test_encoding_fixture(&fixtures[2]).unwrap();

    assert!(outcome.cer >= 0.0, "CER should be non-negative");
}
|
||
|
||
/// Smoke test for the shape-match fixture (fourth entry of the fixture list).
///
/// Shape matching should recover "Shape" if glyphs are in the shape DB. This is
/// currently a placeholder - the shape DB is populated in Phase 2.5 - so only a
/// sanity bound on the CER is asserted.
#[test]
fn test_shape_match_fixture() {
    let fixtures = get_fixtures();
    let outcome = test_encoding_fixture(&fixtures[3]).unwrap();

    assert!(outcome.cer >= 0.0, "CER should be non-negative");
}
|
||
|
||
/// Every fixture must have both its PDF and its ground-truth file on disk.
///
/// For each fixture the PDF is checked first, then the ground-truth file.
#[test]
fn test_all_encoding_fixtures_exist() {
    for fixture in get_fixtures().iter() {
        let pdf_present = Path::new(fixture.pdf_path).exists();
        assert!(pdf_present,
            "Encoding fixture PDF should exist: {}", fixture.pdf_path);

        let truth_present = Path::new(fixture.truth_path).exists();
        assert!(truth_present,
            "Encoding fixture ground truth should exist: {}", fixture.truth_path);
    }
}
|
||
|
||
/// Overall recovery rate for the entire corpus.
///
/// The Phase 2 exit gate requires ≥90% recovery rate on this corpus. The corpus
/// rate is the unweighted mean of the per-fixture recovery rates: every fixture
/// counts equally regardless of how much text it contains.
///
/// Any fixture that fails to run aborts the test with a panic naming the fixture.
#[test]
fn test_corpus_recovery_rate() {
    let fixtures = get_fixtures();
    let mut total_recovery = 0.0;
    let mut fixture_count = 0usize;

    for fixture in &fixtures {
        let result = test_encoding_fixture(fixture)
            .unwrap_or_else(|e| panic!("Fixture {} failed: {}", fixture.name, e));

        total_recovery += result.recovery_rate;
        fixture_count += 1;
        println!(
            "Fixture {}: recovery_rate={:.2}, cer={:.2}",
            result.name, result.recovery_rate, result.cer
        );
    }

    // An empty corpus reports 0.0 rather than dividing by zero.
    let avg_recovery = if fixture_count > 0 {
        total_recovery / fixture_count as f64
    } else {
        0.0
    };

    println!("Average corpus recovery rate: {:.2}%", avg_recovery * 100.0);

    // TODO: Enable the ≥90% gate once Phase 2.2–2.5 are fully implemented
    // For now, this test verifies the corpus is structured correctly
    // assert!(avg_recovery >= 0.9,
    //     "Corpus recovery rate should be ≥90%, got {:.2}%", avg_recovery * 100.0);

    assert!(avg_recovery >= 0.0, "Recovery rate should be non-negative");
    assert!(avg_recovery <= 1.0, "Recovery rate should be ≤ 1.0");
}
|