feat(pdftract-5qj50): implement mojibake detection and repair via encoding_rs

Implements Phase 4.7 Correction Pipeline step 3: mojibake detection
and repair for UTF-8 bytes misinterpreted as Latin-1/windows-1252.

Changes:
- Add layout::correction module with detect_and_repair_mojibake function
- Implement CorrectableText trait for mutable text access
- Add trait implementations for hybrid::Span and schema::SpanJson
- Make encoding_rs a non-optional dependency (was cjk-gated)
- Detection heuristic: 2+ occurrences of telltale sequences (Ã©, Ã¨, â€™, etc.)
- Re-decode via encoding_rs::WINDOWS_1252 when detected
- Accept repair only if readability score improves by >0.05 epsilon
- Fast-path pass-through for ASCII-only and clean UTF-8 text

Closes: pdftract-5qj50

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-05-24 17:01:53 -04:00
parent b1b7840d9a
commit d84f8da3a4
6 changed files with 489 additions and 2 deletions

1
Cargo.lock generated
View file

@ -2419,6 +2419,7 @@ dependencies = [
"phf_codegen",
"proptest",
"quick-xml",
"rand 0.8.6",
"rayon",
"regex",
"schemars 1.2.1",

View file

@ -38,7 +38,7 @@ tempfile = "3.10"
tracing = { workspace = true }
dashmap = "6.1"
smallvec = "1.13"
encoding_rs = { version = "0.8", optional = true }
encoding_rs = "0.8"
quick-xml = { version = "0.36", optional = true }
serde_yaml = { version = "0.9", optional = true }
@ -54,7 +54,7 @@ profiles = ["dep:serde_yaml"] # Enable extraction profiles (Phase 7.10)
proptest = []
fuzzing = [] # Enable cfg(fuzzing) for fuzz harnesses
shape-db = [] # Enable glyph shape database (Level 4 encoding fallback)
cjk = ["dep:encoding_rs"] # Enable CJK text extraction via predefined CMap registry (~1.2 MB binary size increase)
cjk = [] # Enable CJK text extraction via predefined CMap registry (~1.2 MB binary size increase)
[dev-dependencies]
chrono = "0.4"

View file

@ -23,6 +23,7 @@
//! IoU = area(A ∩ B) / area(A ∪ B)
use crate::classify::{CellIndex, PageClass, PageClassification};
use crate::layout::correction::CorrectableText;
use image::{GrayImage, ImageBuffer, Luma};
use std::collections::BTreeSet;
@ -111,6 +112,16 @@ impl Span {
}
}
/// Lets the correction pipeline read and rewrite a span's `text` field.
impl CorrectableText for Span {
    /// Borrow the span text for inspection.
    fn text(&self) -> &str {
        self.text.as_str()
    }

    /// Borrow the span text mutably so a correction can overwrite it in place.
    fn text_mut(&mut self) -> &mut String {
        &mut self.text
    }
}
/// Compute the Intersection over Union (IoU) of two bounding boxes.
///
/// IoU = area(A ∩ B) / area(A ∪ B)

View file

@ -0,0 +1,461 @@
//! Text correction pipeline (Phase 4.7).
//!
//! This module implements the correction pipeline applied to extracted text
//! before readability scoring. Corrections include:
//! - Mojibake detection and repair (UTF-8 bytes misread as Latin-1/windows-1252)
//!
//! # Mojibake Detection
//!
//! Mojibake occurs when UTF-8 encoded bytes are mistakenly decoded as
//! Latin-1/windows-1252, resulting in garbled output like "cafÃ©" instead of
//! "café". This module detects such patterns and attempts to recover the
//! original text by reversing the windows-1252 mis-decoding.
use encoding_rs::WINDOWS_1252;
/// Access to the text payload of a span-like value.
///
/// The correction pipeline is generic over this trait, so any span
/// representation that stores its content in a `String` can be inspected
/// and rewritten without the pipeline knowing the concrete type.
pub trait CorrectableText {
    /// Borrow the current text content.
    fn text(&self) -> &str;

    /// Borrow the text content mutably so it can be replaced or edited.
    fn text_mut(&mut self) -> &mut String;
}
/// Detect and repair mojibake in span text.
///
/// Mojibake of the kind handled here arises when UTF-8 encoded bytes were
/// decoded as windows-1252 somewhere upstream, so `é` (bytes `C3 A9`) shows
/// up as `Ã©` and `’` (bytes `E2 80 99`) shows up as `â€™`. When the span's
/// text contains enough such telltale sequences, this function reverses the
/// mis-decoding and keeps the result only if the scorer rates it as clearly
/// more readable.
///
/// # Arguments
///
/// * `span` - Mutable reference to a span with text to check/repair
/// * `scorer` - Callback that computes a readability score for text; higher
///   means more readable
///
/// # Returns
///
/// `true` if the span text was replaced with the repaired text, `false`
/// otherwise (in which case the span is left untouched).
///
/// # Detection Heuristic
///
/// Delegated to `contains_mojibake_indicators`, which requires at least two
/// telltale sequences before a repair is attempted.
///
/// # Correction Process
///
/// 1. Encode the current text as windows-1252, recovering the raw bytes that
///    were originally mis-decoded. If any character has no windows-1252
///    representation the text cannot be pure mojibake and is left alone.
/// 2. Decode those bytes as UTF-8. If they are not valid UTF-8 the text is
///    left alone.
/// 3. Score the original first, then the candidate.
/// 4. If `candidate_score > original_score + 0.05`: accept the replacement.
///
/// # Epsilon Threshold
///
/// The 0.05 epsilon prevents scorer noise from triggering a rewrite; the
/// comparison is strict, so an improvement of exactly 0.05 is rejected.
///
/// # Invariants
///
/// - **INV**: The span is only modified when the repair improves readability
///   (false-positive safety).
/// - **INV**: Empty, ASCII-only, or indicator-free text passes through
///   unchanged and the scorer is never called for it.
/// - **INV**: The assumed wrong decoding is windows-1252, not pure Latin-1
///   (covers smart quotes and other Microsoft-specific code points).
///
/// # Examples
///
/// ```
/// use pdftract_core::layout::correction::{detect_and_repair_mojibake, CorrectableText};
///
/// struct Snippet(String);
///
/// impl CorrectableText for Snippet {
///     fn text_mut(&mut self) -> &mut String {
///         &mut self.0
///     }
///     fn text(&self) -> &str {
///         &self.0
///     }
/// }
///
/// // Mock scorer: text still containing U+00C3 (the mojibake lead) reads poorly.
/// let scorer = |s: &str| if s.contains('\u{00c3}') { 0.3_f32 } else { 0.9_f32 };
///
/// // Clean UTF-8 text: nothing to detect, left untouched.
/// let mut clean = Snippet("caf\u{00e9} cr\u{00e8}me".to_owned());
/// assert!(!detect_and_repair_mojibake(&mut clean, scorer));
/// assert_eq!(clean.text(), "caf\u{00e9} cr\u{00e8}me");
///
/// // Mis-decoded "café crème" (two indicators): detected and repaired.
/// let mut garbled = Snippet("caf\u{00c3}\u{00a9} cr\u{00c3}\u{00a8}me".to_owned());
/// assert!(detect_and_repair_mojibake(&mut garbled, scorer));
/// assert_eq!(garbled.text(), "caf\u{00e9} cr\u{00e8}me");
/// ```
pub fn detect_and_repair_mojibake<T, F>(span: &mut T, scorer: F) -> bool
where
    T: CorrectableText,
    F: Fn(&str) -> f32,
{
    /// Minimum readability gain required before the repaired text is accepted.
    const EPSILON: f32 = 0.05;

    // Everything that borrows from `span` lives in this block so the mutable
    // borrow below is unencumbered.
    let repaired = {
        let text = span.text();

        // Fast-path: empty or ASCII-only text cannot be mojibake.
        if text.is_empty() || text.is_ascii() {
            return false;
        }

        // Detection heuristic: require telltale mis-decoded sequences.
        if !contains_mojibake_indicators(text) {
            return false;
        }

        // Undo the mis-decoding: windows-1252 encoding yields the original
        // raw bytes. Unmappable characters (e.g. CJK) would be replaced by
        // numeric character references, which would corrupt the text.
        let (raw_bytes, _, had_unmappable) = WINDOWS_1252.encode(text);
        if had_unmappable {
            return false;
        }

        // The raw bytes must form valid UTF-8 for the repair to be meaningful.
        let candidate = match std::str::from_utf8(&raw_bytes) {
            Ok(decoded) => decoded,
            Err(_) => return false,
        };

        // Score both versions (original first, then candidate).
        let original_score = scorer(text);
        let candidate_score = scorer(candidate);

        // Accept the replacement only if the score improves by more than epsilon.
        if candidate_score > original_score + EPSILON {
            candidate.to_owned()
        } else {
            return false;
        }
    };

    *span.text_mut() = repaired;
    true
}
/// Check if text contains mojibake indicator sequences.
///
/// Returns `true` once at least 2 adjacent character pairs match a telltale
/// pattern. The threshold reduces false positives on legitimate text.
///
/// # Indicator Patterns
///
/// All patterns are expressed as escaped code points so that they survive
/// any re-encoding of this source file:
///
/// - `Ã` (U+00C3) followed by one of `¡ ¢ £ § ¨ © ª ® ± ³ ´ º »` or a soft
///   hyphen (U+00AD): UTF-8 encoded `á â ã ç è é ê î ñ ó ô ú û í` decoded as
///   windows-1252.
/// - `â` (U+00E2) followed by `€` (U+20AC): the common two-character prefix
///   of every mis-decoded U+20xx punctuation mark (curly quotes, dashes,
///   daggers, ...), so the specific third character need not be checked.
/// - `Â` (U+00C2) followed by any non-ASCII character: mis-decoded NBSP and
///   other U+00A0..U+00BF symbols.
fn contains_mojibake_indicators(text: &str) -> bool {
    /// Number of indicator pairs required before text is flagged.
    const MIN_HITS: usize = 2;

    /// Whether two adjacent characters form a telltale mis-decoded sequence.
    fn is_indicator_pair(lead: char, trail: char) -> bool {
        match lead {
            '\u{00C3}' => matches!(
                trail,
                '\u{00A1}'
                    | '\u{00A2}'
                    | '\u{00A3}'
                    | '\u{00A7}'
                    | '\u{00A8}'
                    | '\u{00A9}'
                    | '\u{00AA}'
                    | '\u{00AD}'
                    | '\u{00AE}'
                    | '\u{00B1}'
                    | '\u{00B3}'
                    | '\u{00B4}'
                    | '\u{00BA}'
                    | '\u{00BB}'
            ),
            '\u{00E2}' => trail == '\u{20AC}',
            '\u{00C2}' => !trail.is_ascii(),
            _ => false,
        }
    }

    // Walk adjacent (overlapping) character pairs lazily and stop as soon as
    // the MIN_HITS-th match is seen; no intermediate buffers are allocated.
    text.chars()
        .zip(text.chars().skip(1))
        .filter(|&(lead, trail)| is_indicator_pair(lead, trail))
        .nth(MIN_HITS - 1)
        .is_some()
}
/// Test implementation of `CorrectableText` for unit tests.
///
/// A minimal wrapper around an owned `String`; it is only compiled when
/// `cfg(test)` is active.
#[cfg(test)]
pub struct TestCorrectable {
/// Text content read and overwritten by the correction routines under test.
text: String,
}
#[cfg(test)]
impl TestCorrectable {
    /// Build a test span from anything convertible into a `String`.
    pub fn new(text: impl Into<String>) -> Self {
        let text = text.into();
        Self { text }
    }

    /// Current text content of the test span.
    pub fn text(&self) -> &str {
        self.text.as_str()
    }
}
/// Hooks the test span into the correction pipeline.
#[cfg(test)]
impl CorrectableText for TestCorrectable {
    /// Borrow the wrapped text for inspection.
    fn text(&self) -> &str {
        self.text.as_str()
    }

    /// Borrow the wrapped text mutably so a repair can overwrite it.
    fn text_mut(&mut self) -> &mut String {
        &mut self.text
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Fixture conventions: every mojibake input is written with escaped code
    // points so the test data cannot be altered by re-encoding this file.
    //   é  (U+00E9, UTF-8 C3 A9)    -> "\u{00c3}\u{00a9}"          (Ã©)
    //   è  (U+00E8, UTF-8 C3 A8)    -> "\u{00c3}\u{00a8}"          (Ã¨)
    //   ’  (U+2019, UTF-8 E2 80 99) -> "\u{00e2}\u{20ac}\u{2122}"  (â€™)
    //   —  (U+2014, UTF-8 E2 80 94) -> "\u{00e2}\u{20ac}\u{201d}"  (â€”)
    //   NBSP (U+00A0, UTF-8 C2 A0)  -> "\u{00c2}\u{00a0}"          (Â + NBSP)

    /// Mock scorer: 0.3 while a known mojibake sequence is present, 0.9 otherwise.
    fn simple_scorer(text: &str) -> f32 {
        const GARBLED: [&str; 3] = [
            "\u{00c3}\u{00a9}",         // mis-decoded é
            "\u{00c3}\u{00a8}",         // mis-decoded è
            "\u{00e2}\u{20ac}\u{2122}", // mis-decoded ’
        ];
        if GARBLED.iter().any(|seq| text.contains(*seq)) {
            0.3
        } else {
            0.9
        }
    }

    /// Run the repair on `input` and return `(was_repaired, resulting_text)`.
    fn repair(input: &str, scorer: impl Fn(&str) -> f32) -> (bool, String) {
        let mut span = TestCorrectable::new(input);
        let repaired = detect_and_repair_mojibake(&mut span, scorer);
        (repaired, span.text().to_owned())
    }

    #[test]
    fn test_clean_utf8_no_change() {
        // Clean UTF-8 text: no mojibake sequences.
        let (repaired, text) = repair("caf\u{00e9}", simple_scorer);
        assert!(!repaired);
        assert_eq!(text, "caf\u{00e9}");
    }

    #[test]
    fn test_ascii_only_no_change() {
        // ASCII-only text cannot be mojibake.
        let (repaired, text) = repair("hello world", simple_scorer);
        assert!(!repaired);
        assert_eq!(text, "hello world");
    }

    #[test]
    fn test_empty_string_no_change() {
        let (repaired, text) = repair("", simple_scorer);
        assert!(!repaired);
        assert_eq!(text, "");
    }

    #[test]
    fn test_mojibake_detected_and_repaired() {
        // Mis-decoded "café crème": two indicators, so the repair is attempted.
        let (repaired, text) = repair("caf\u{00c3}\u{00a9} cr\u{00c3}\u{00a8}me", simple_scorer);
        assert!(repaired);
        assert_eq!(text, "caf\u{00e9} cr\u{00e8}me");
    }

    #[test]
    fn test_mojibake_multiple_indicators() {
        // Mis-decoded "café résumé": three occurrences of the same indicator.
        let (repaired, text) = repair(
            "caf\u{00c3}\u{00a9} r\u{00c3}\u{00a9}sum\u{00c3}\u{00a9}",
            simple_scorer,
        );
        assert!(repaired);
        assert_eq!(text, "caf\u{00e9} r\u{00e9}sum\u{00e9}");
    }

    #[test]
    fn test_mojibake_single_indicator_threshold() {
        // A single indicator is below the detection threshold of 2.
        let input = "caf\u{00c3}\u{00a9}sandbar";
        let (repaired, text) = repair(input, simple_scorer);
        assert!(!repaired);
        assert_eq!(text, input);
    }

    #[test]
    fn test_smart_quote_mojibake() {
        // Mis-decoded "café n’est pas fermé".
        let (repaired, text) = repair(
            "caf\u{00c3}\u{00a9} n\u{00e2}\u{20ac}\u{2122}est pas ferm\u{00c3}\u{00a9}",
            simple_scorer,
        );
        assert!(repaired);
        assert_eq!(text, "caf\u{00e9} n\u{2019}est pas ferm\u{00e9}");
    }

    #[test]
    fn test_em_dash_mojibake() {
        // Mis-decoded "café—thé": byte 0x94 is U+201D in windows-1252.
        let (repaired, text) = repair(
            "caf\u{00c3}\u{00a9}\u{00e2}\u{20ac}\u{201d}th\u{00c3}\u{00a9}",
            simple_scorer,
        );
        assert!(repaired);
        assert_eq!(text, "caf\u{00e9}\u{2014}th\u{00e9}");
    }

    #[test]
    fn test_replacement_rejected_if_score_doesnt_improve() {
        // Indicators present, but both versions score the same: keep the original.
        let input = "caf\u{00c3}\u{00a9} cr\u{00c3}\u{00a8}me";
        let (repaired, text) = repair(input, |_| 0.5);
        assert!(!repaired);
        assert_eq!(text, input);
    }

    #[test]
    fn test_epsilon_threshold_prevents_noise() {
        // Candidate is only 0.04 better, which is below the 0.05 epsilon.
        let input = "caf\u{00c3}\u{00a9} cr\u{00c3}\u{00a8}me";
        let (repaired, text) = repair(input, |s| {
            if s.contains('\u{00c3}') {
                0.7
            } else {
                0.74
            }
        });
        assert!(!repaired);
        assert_eq!(text, input);
    }

    #[test]
    fn test_asian_text_unaffected() {
        // No Latin-1 indicators: pass-through.
        let input = "こんにちは世界";
        let (repaired, text) = repair(input, simple_scorer);
        assert!(!repaired);
        assert_eq!(text, input);
    }

    #[test]
    fn test_unmappable_characters_block_repair() {
        // Indicators are present, but CJK characters have no windows-1252
        // encoding, so the text cannot be pure mojibake and must be kept.
        let input = "caf\u{00c3}\u{00a9} cr\u{00c3}\u{00a8}me 世界";
        let (repaired, text) = repair(input, |s| {
            if s.contains('\u{00c3}') {
                0.3
            } else {
                0.9
            }
        });
        assert!(!repaired);
        assert_eq!(text, input);
    }

    #[test]
    fn test_invalid_utf8_after_reencoding_is_rejected() {
        // The trailing "Ãx" re-encodes to bytes C3 78, which is not valid UTF-8.
        let input = "\u{00c3}\u{00a9}t\u{00c3}\u{00a9} \u{00c3}x";
        let (repaired, text) = repair(input, simple_scorer);
        assert!(!repaired);
        assert_eq!(text, input);
    }

    #[test]
    fn test_windows1252_specific() {
        // Mis-decoded "l’été": the third byte of ’ (0x99) maps to U+2122 only
        // in windows-1252, not in pure Latin-1.
        let (repaired, text) = repair(
            "l\u{00e2}\u{20ac}\u{2122}\u{00c3}\u{00a9}t\u{00c3}\u{00a9}",
            simple_scorer,
        );
        assert!(repaired);
        assert_eq!(text, "l\u{2019}\u{00e9}t\u{00e9}");
    }

    #[test]
    fn test_mixed_ascii_and_clean_accents_no_change() {
        // Correctly decoded accents mixed with ASCII must not be touched.
        let input = "The word is caf\u{00e9} and r\u{00e9}sum\u{00e9}";
        let (repaired, text) = repair(input, simple_scorer);
        assert!(!repaired);
        assert_eq!(text, input);
    }

    #[test]
    fn test_nbsp_indicator() {
        // Mis-decoded NBSP: U+00C2 followed by U+00A0, twice.
        let (repaired, text) = repair("a\u{00c2}\u{00a0}b\u{00c2}\u{00a0}c", |s| {
            if s.contains('\u{00c2}') {
                0.3
            } else {
                0.9
            }
        });
        assert!(repaired);
        assert_eq!(text, "a\u{00a0}b\u{00a0}c");
    }

    #[test]
    fn test_multiple_mojibake_patterns() {
        // Two different indicator kinds: curly apostrophe + accent.
        let (repaired, text) = repair(
            "don\u{00e2}\u{20ac}\u{2122}t drink caf\u{00c3}\u{00a9}",
            simple_scorer,
        );
        assert!(repaired);
        assert_eq!(text, "don\u{2019}t drink caf\u{00e9}");
    }

    #[test]
    fn test_exact_epsilon_boundary() {
        // The candidate scores exactly `original + 0.05`, computed with the
        // same f32 addition the implementation performs, so the strict
        // inequality must reject it.
        let input = "caf\u{00c3}\u{00a9} cr\u{00c3}\u{00a8}me";
        let (repaired, text) = repair(input, |s| {
            if s.contains('\u{00c3}') {
                0.25_f32
            } else {
                0.25_f32 + 0.05_f32
            }
        });
        assert!(!repaired);
        assert_eq!(text, input);
    }

    #[test]
    fn test_just_above_epsilon() {
        // 0.751 > 0.70 + 0.05, so the replacement happens.
        let (repaired, text) = repair("caf\u{00c3}\u{00a9} cr\u{00c3}\u{00a8}me", |s| {
            if s.contains('\u{00c3}') {
                0.70
            } else {
                0.751
            }
        });
        assert!(repaired);
        assert_eq!(text, "caf\u{00e9} cr\u{00e8}me");
    }
}

View file

@ -7,6 +7,7 @@
//! - Line formation (line.rs)
//! - Readability aggregation (readability.rs)
//! - English wordlist for dict coverage scoring (wordlist.rs)
//! - Text correction pipeline (correction.rs)
//!
//! Phase 4 organizes extracted text into semantic blocks (paragraphs,
//! headings, figures, captions, etc.) based on spatial and font metrics.
@ -14,6 +15,7 @@
pub mod caption;
pub mod code;
pub mod columns;
pub mod correction;
pub mod line;
pub mod readability;
pub mod wordlist;
@ -24,6 +26,7 @@ pub use code::{
is_monospace_span, MonospaceSpan,
};
pub use columns::{assign_columns_to_lines, assign_columns_to_spans, Column};
pub use correction::detect_and_repair_mojibake;
pub use line::{
cluster_spans_into_lines, compute_baseline, group_lines_into_blocks, union_bboxes, BlockInput,
HasBBox, HasFontSize, Line, LineDirection, LineMetadata,

View file

@ -21,6 +21,7 @@ use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use serde_json::json;
use crate::layout::correction::CorrectableText;
use crate::receipts::Receipt;
use crate::signature::Signature;
@ -76,6 +77,16 @@ pub struct SpanJson {
pub column: Option<u32>,
}
/// Lets the correction pipeline read and rewrite a JSON span's `text` field.
impl CorrectableText for SpanJson {
    /// Borrow the span text for inspection.
    fn text(&self) -> &str {
        self.text.as_str()
    }

    /// Borrow the span text mutably so a correction can overwrite it in place.
    fn text_mut(&mut self) -> &mut String {
        &mut self.text
    }
}
/// JSON representation of a structural block.
///
/// A block is a higher-level semantic unit composed of one or more