Collects in-progress work across forms (Ch/Tx field handling, value_text edge cases), layout corrections, stream parser fixes, conformance test expansion, security audit test (TH-08), stream-decoder bomb fixture, debug examples reorganization under examples/debug/, sdk module scaffold, xtask CLI enhancements, and provenance entries for new fixtures. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1991 lines
74 KiB
Rust
1991 lines
74 KiB
Rust
//! Text correction pipeline (Phase 4.7).
|
||
//!
|
||
//! This module implements the correction pipeline applied to extracted text
|
||
//! before readability scoring. Corrections include:
|
||
//! - Mojibake detection and repair (UTF-8 bytes misread as Latin-1 / windows-1252)
|
||
//! - Hyphenation repair (end-of-line hyphen joined with next line)
|
||
//! - Word-break normalization (zero-width characters stripped or preserved per script)
|
||
//!
|
||
//! # Mojibake Detection
|
||
//!
|
||
//! Mojibake occurs when UTF-8 encoded bytes are incorrectly decoded as
//! Latin-1 / windows-1252, resulting in garbled output like "cafÃ©" instead
//! of "café". This module detects such patterns and attempts to recover the
//! original text by round-tripping it through windows-1252.
|
||
|
||
use encoding_rs::WINDOWS_1252;
|
||
|
||
use crate::font::UnicodeSource;
|
||
use crate::glyph::Glyph;
|
||
use crate::layout::line::{Block, Line, LineMetadata};
|
||
use crate::span::Span;
|
||
|
||
/// Coarse Unicode script classification used by word-break normalization.
///
/// Membership is decided from codepoint ranges only. The category selects
/// whether zero-width joiner / non-joiner characters are kept (complex
/// scripts, where they are orthographic) or removed (Latin and unknown
/// text, where they are noise).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Script {
    /// Arabic script - requires ZWNJ/ZWJ for correct rendering
    Arabic,
    /// Hebrew script - may use ZWNJ/ZWJ
    Hebrew,
    /// Devanagari (Hindi, Marathi, Nepali, Sanskrit) - requires ZWNJ/ZWJ for conjuncts
    Devanagari,
    /// Bengali script - requires ZWNJ/ZWJ for conjuncts
    Bengali,
    /// Other Indic scripts (Gurmukhi, Gujarati, Tamil, Telugu, Kannada, Malayalam, Odia)
    Indic,
    /// Thai script - may use ZWNJ/ZWJ
    Thai,
    /// Lao script
    Lao,
    /// Tibetan script
    Tibetan,
    /// Myanmar (Burmese) script
    Myanmar,
    /// Khmer script
    Khmer,
    /// Sinhala script
    Sinhala,
    /// Latin and other simple scripts - ZWNJ/ZWJ are noise
    Latin,
    /// Unknown script - default to strip (safe default)
    Unknown,
}

impl Script {
    /// Reports whether ZWNJ/ZWJ carry orthographic meaning for this script.
    ///
    /// Returns `true` for every complex-script variant and `false` for
    /// [`Script::Latin`] and [`Script::Unknown`]. The match is exhaustive on
    /// purpose: adding a variant forces an explicit decision here.
    pub fn preserves_joiners(self) -> bool {
        match self {
            Self::Arabic
            | Self::Hebrew
            | Self::Devanagari
            | Self::Bengali
            | Self::Indic
            | Self::Thai
            | Self::Lao
            | Self::Tibetan
            | Self::Myanmar
            | Self::Khmer
            | Self::Sinhala => true,
            Self::Latin | Self::Unknown => false,
        }
    }
}
|
||
|
||
/// Detect the dominant script from text content.
|
||
///
|
||
/// Scans the text and returns the first script category that matches
|
||
/// a significant number of characters. Returns `Script::Latin` for
|
||
/// ASCII/Latin text and `Script::Unknown` for empty text.
|
||
///
|
||
/// # Detection Priority
|
||
///
|
||
/// Scripts are checked in priority order (Arabic first, then Devanagari, etc.).
|
||
/// The first script with >=3 matching characters is returned. If no script
|
||
/// reaches the threshold, returns `Script::Latin` as a safe default.
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// ```
|
||
/// use pdftract_core::layout::correction::detect_script;
|
||
///
|
||
/// assert_eq!(detect_script("Hello world"), Script::Latin);
|
||
/// assert_eq!(detect_script("مرحبا"), Script::Arabic);
|
||
/// assert_eq!(detect_script("नमस्ते"), Script::Devanagari);
|
||
/// assert_eq!(detect_script(""), Script::Unknown);
|
||
/// ```
|
||
pub fn detect_script(text: &str) -> Script {
|
||
if text.is_empty() {
|
||
return Script::Unknown;
|
||
}
|
||
|
||
let mut arabic_count = 0;
|
||
let mut hebrew_count = 0;
|
||
let mut devanagari_count = 0;
|
||
let mut bengali_count = 0;
|
||
let mut indic_count = 0;
|
||
let mut thai_count = 0;
|
||
let mut lao_count = 0;
|
||
let mut tibetan_count = 0;
|
||
let mut myanmar_count = 0;
|
||
let mut khmer_count = 0;
|
||
let mut sinhala_count = 0;
|
||
|
||
for c in text.chars() {
|
||
let cp = c as u32;
|
||
match cp {
|
||
// Arabic: U+0600..U+06FF, U+0750..U+077F, U+08A0..U+08FF
|
||
0x0600..=0x06FF | 0x0750..=0x077F | 0x08A0..=0x08FF => arabic_count += 1,
|
||
// Hebrew: U+0590..U+05FF
|
||
0x0590..=0x05FF => hebrew_count += 1,
|
||
// Devanagari: U+0900..U+097F
|
||
0x0900..=0x097F => devanagari_count += 1,
|
||
// Bengali: U+0980..U+09FF
|
||
0x0980..=0x09FF => bengali_count += 1,
|
||
// Other Indic scripts:
|
||
// Gurmukhi: U+0A00..U+0A7F
|
||
// Gujarati: U+0A80..U+0AFF
|
||
// Tamil: U+0B80..U+0BFF
|
||
// Telugu: U+0C00..U+0C7F
|
||
// Kannada: U+0C80..U+0CFF
|
||
// Malayalam: U+0D00..U+0D7F
|
||
// Odia: U+0B00..U+0B7F
|
||
0x0A00..=0x0A7F | 0x0A80..=0x0AFF | 0x0B00..=0x0B7F | 0x0B80..=0x0BFF |
|
||
0x0C00..=0x0C7F | 0x0C80..=0x0CFF | 0x0D00..=0x0D7F => indic_count += 1,
|
||
// Thai: U+0E00..U+0E7F
|
||
0x0E00..=0x0E7F => thai_count += 1,
|
||
// Lao: U+0E80..U+0EFF
|
||
0x0E80..=0x0EFF => lao_count += 1,
|
||
// Tibetan: U+0F00..U+0FFF
|
||
0x0F00..=0x0FFF => tibetan_count += 1,
|
||
// Myanmar: U+1000..U+109F
|
||
0x1000..=0x109F => myanmar_count += 1,
|
||
// Khmer: U+1780..U+17FF
|
||
0x1780..=0x17FF => khmer_count += 1,
|
||
// Sinhala: U+0D80..U+0DFF
|
||
0x0D80..=0x0DFF => sinhala_count += 1,
|
||
_ => {}
|
||
}
|
||
}
|
||
|
||
const THRESHOLD: usize = 3;
|
||
|
||
if arabic_count >= THRESHOLD {
|
||
return Script::Arabic;
|
||
}
|
||
if hebrew_count >= THRESHOLD {
|
||
return Script::Hebrew;
|
||
}
|
||
if devanagari_count >= THRESHOLD {
|
||
return Script::Devanagari;
|
||
}
|
||
if bengali_count >= THRESHOLD {
|
||
return Script::Bengali;
|
||
}
|
||
if indic_count >= THRESHOLD {
|
||
return Script::Indic;
|
||
}
|
||
if thai_count >= THRESHOLD {
|
||
return Script::Thai;
|
||
}
|
||
if lao_count >= THRESHOLD {
|
||
return Script::Lao;
|
||
}
|
||
if tibetan_count >= THRESHOLD {
|
||
return Script::Tibetan;
|
||
}
|
||
if myanmar_count >= THRESHOLD {
|
||
return Script::Myanmar;
|
||
}
|
||
if khmer_count >= THRESHOLD {
|
||
return Script::Khmer;
|
||
}
|
||
if sinhala_count >= THRESHOLD {
|
||
return Script::Sinhala;
|
||
}
|
||
|
||
// Default to Latin for ASCII or undetected scripts
|
||
Script::Latin
|
||
}
|
||
|
||
/// Normalize word-break characters in span text based on script hint.
|
||
///
|
||
/// Strips zero-width formatting characters that are noise in extracted text:
|
||
/// - **U+200B** (zero-width space): ALWAYS stripped (never content)
|
||
/// - **U+FEFF** (zero-width no-break space / BOM): ALWAYS stripped (never content)
|
||
/// - **U+200C** (zero-width non-joiner): stripped unless script requires it
|
||
/// - **U+200D** (zero-width joiner): stripped unless script requires it
|
||
///
|
||
/// The script_hint determines whether ZWNJ/ZWJ are preserved:
|
||
/// - **Arabic, Hebrew, Indic, Thai, Lao, Tibetan, Myanmar, Khmer, Sinhala**:
|
||
/// ZWNJ/ZWJ are preserved (they control ligature/conjunct formation)
|
||
/// - **Latin or Unknown**: All four characters are stripped
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `span` - Mutable reference to the span to normalize
|
||
/// * `script_hint` - Optional script hint; if None, detects from span text
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// Count of characters stripped (u32).
|
||
///
|
||
/// # Invariants
|
||
///
|
||
/// - **INV**: U+200B and U+FEFF are NEVER content; always stripped regardless of script.
|
||
/// - **INV**: U+200C/U+200D are content in Arabic/Indic; stripping breaks rendering.
|
||
/// - **INV**: When script_hint is None, script is detected from the span's own text.
|
||
/// - **INV**: For unknown-script text, default to strip (safer for Latin output).
|
||
///
|
||
/// # Performance
|
||
///
|
||
/// O(n) where n is the length of the span text. Uses `String::retain` with
|
||
/// a closure that checks the script hint once.
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// ```
|
||
/// use pdftract_core::layout::correction::{normalize_word_breaks, Script};
|
||
/// use pdftract_core::span::Span;
|
||
/// use std::sync::Arc;
|
||
///
|
||
/// // Latin text: all zero-width chars stripped
|
||
/// let mut span = Span::empty();
|
||
/// span.text = String::from("auto\u{200B}mation");
|
||
/// let count = normalize_word_breaks(&mut span, Some(Script::Latin));
|
||
/// assert_eq!(count, 1);
|
||
/// assert_eq!(span.text, "automation");
|
||
///
|
||
/// // Arabic text: ZWNJ/ZWJ preserved
|
||
/// let mut span = Span::empty();
|
||
/// span.text = String::from("\u{0627}\u{200C}\u{200D}"); // alef + ZWNJ + ZWJ
|
||
/// let count = normalize_word_breaks(&mut span, Some(Script::Arabic));
|
||
/// assert_eq!(count, 0);
|
||
/// assert_eq!(span.text, "\u{0627}\u{200C}\u{200D}");
|
||
///
|
||
/// // Unknown script: all stripped (safe default)
|
||
/// let mut span = Span::empty();
|
||
/// span.text = String::from("test\u{200C}\u{200D}");
|
||
/// let count = normalize_word_breaks(&mut span, None);
|
||
/// assert_eq!(count, 2);
|
||
/// assert_eq!(span.text, "test");
|
||
/// ```
|
||
pub fn normalize_word_breaks(span: &mut Span, script_hint: Option<Script>) -> u32 {
|
||
let script = script_hint.unwrap_or_else(|| detect_script(&span.text));
|
||
let preserve_joiners = script.preserves_joiners();
|
||
|
||
let original_len = span.text.len();
|
||
|
||
span.text.retain(|c| {
|
||
match c {
|
||
// U+200B zero-width space: ALWAYS strip
|
||
'\u{200B}' => false,
|
||
// U+FEFF BOM: ALWAYS strip
|
||
'\u{FEFF}' => false,
|
||
// U+200C ZWNJ: strip unless script requires it
|
||
'\u{200C}' => preserve_joiners,
|
||
// U+200D ZWJ: strip unless script requires it
|
||
'\u{200D}' => preserve_joiners,
|
||
// All other characters: keep
|
||
_ => true,
|
||
}
|
||
});
|
||
|
||
// Return count of stripped characters by byte length difference
|
||
(original_len - span.text.len()) as u32
|
||
}
|
||
|
||
/// Access to the text of a span-like value for the correction pipeline.
///
/// Implementing this trait lets correction passes read and rewrite the text
/// of any span representation without knowing its concrete type.
pub trait CorrectableText {
    /// Borrow the text content for reading.
    fn text(&self) -> &str;

    /// Borrow the text content for in-place modification.
    fn text_mut(&mut self) -> &mut String;
}
|
||
|
||
/// Detect and repair mojibake in span text.
|
||
///
|
||
/// Scans the span's text for sequences characteristic of Latin-1 bytes interpreted
|
||
/// as UTF-8 (e.g., `é` for `é`, `’` for `'`). If detected, attempts to
|
||
/// re-decode via `encoding_rs` (treat the bytes as windows-1252/Latin-1) and
|
||
/// accepts the re-decoded text if the scorer reports a higher readability score.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `span` - Mutable reference to a span with text to check/repair
|
||
/// * `scorer` - Callback that computes a readability score for text [0.0, 1.0]
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// `true` if the span text was replaced with re-decoded text, `false` otherwise.
|
||
///
|
||
/// # Detection Heuristic
|
||
///
|
||
/// Checks for at least 2 occurrences of any telltale 2-char sequences:
|
||
/// - `é` `è` `à ` `î` `ô` `û` `â` `ç` `ñ` (common French/Spanish chars)
|
||
/// - `’` `â€"` `“` `â€` (smart quotes / em-dash from Windows-1252)
|
||
/// - `Â` followed by a non-ASCII char (NBSP and similar)
|
||
///
|
||
/// # Correction Process
|
||
///
|
||
/// 1. Encode the current text as UTF-8 bytes
|
||
/// 2. Decode those bytes as windows-1252 (the actual encoding)
|
||
/// 3. Score both original and candidate text
|
||
/// 4. If `candidate_score > original_score + 0.05`: accept the replacement
|
||
///
|
||
/// # Epsilon Threshold
|
||
///
|
||
/// The 0.05 epsilon prevents noise from triggering unnecessary re-decoding.
|
||
/// Only readability improvements greater than 5% are accepted.
|
||
///
|
||
/// # Invariants
|
||
///
|
||
/// - **INV**: Re-decoding is REVERTED if it doesn't improve readability (false-positive safety).
|
||
/// - **INV**: A clean ASCII or pure UTF-8 span (no Ã/â sequences) passes through unchanged.
|
||
/// - **INV**: The encoding is windows-1252, not pure Latin-1 (covers smart quotes and Microsoft-isms).
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// ```
|
||
/// use pdftract_core::layout::correction::{detect_and_repair_mojibake, TestCorrectable};
|
||
///
|
||
/// // Clean UTF-8 text: no detection
|
||
/// let mut span = TestCorrectable::new("café");
|
||
/// let repaired = detect_and_repair_mojibake(&mut span, |s| simple_score(s));
|
||
/// assert!(!repaired);
|
||
/// assert_eq!(span.text(), "café");
|
||
///
|
||
/// // Mojibake: detected and repaired
|
||
/// let mut span = TestCorrectable::new("café");
|
||
/// let repaired = detect_and_repair_mojibake(&mut span, |s| {
|
||
/// // Mock scorer that prefers corrected text
|
||
/// if s.contains("é") { 0.3 } else { 0.9 }
|
||
/// });
|
||
/// assert!(repaired);
|
||
/// assert_eq!(span.text(), "café");
|
||
/// ```
|
||
pub fn detect_and_repair_mojibake<T, F>(span: &mut T, scorer: F) -> bool
|
||
where
|
||
T: CorrectableText,
|
||
F: Fn(&str) -> f32,
|
||
{
|
||
let text = span.text();
|
||
|
||
// Fast-path: empty or ASCII-only text cannot be mojibake
|
||
if text.is_empty() || text.is_ascii() {
|
||
return false;
|
||
}
|
||
|
||
// Detection heuristic: check for telltale Latin-1-as-UTF-8 sequences
|
||
if !contains_mojibake_indicators(text) {
|
||
return false;
|
||
}
|
||
|
||
// Attempt re-decoding: encode as UTF-8, then decode as windows-1252
|
||
let utf8_bytes = text.as_bytes();
|
||
let (candidate, _) = WINDOWS_1252.decode_without_bom_handling(utf8_bytes);
|
||
|
||
// Score both versions
|
||
let original_score = scorer(text);
|
||
let candidate_score = scorer(&candidate);
|
||
|
||
// Accept replacement only if score improves by > epsilon
|
||
const EPSILON: f32 = 0.05;
|
||
if candidate_score > original_score + EPSILON {
|
||
*span.text_mut() = candidate.to_string();
|
||
true
|
||
} else {
|
||
false
|
||
}
|
||
}
|
||
|
||
/// Check if text contains mojibake indicator sequences.
|
||
///
|
||
/// Returns true if at least 2 occurrences of any telltale 2-char patterns
|
||
/// are found. The threshold reduces false positives on legitimate text.
|
||
///
|
||
/// # Indicator Patterns
|
||
///
|
||
/// - `é` `è` `ê` `î` `ô` `û` `â` `ç` `ñ` - Latin-1 vowels with diacritics
|
||
/// - `’` `â€"` `“` `â€` - Smart quotes and dashes from Windows-1252
|
||
/// - `Â` followed by non-ASCII - NBSP and related
|
||
fn contains_mojibake_indicators(text: &str) -> bool {
|
||
const INDICATORS: &[&str] = &[
|
||
// Latin-1 vowels with diacritics (common French/Spanish/Portuguese)
|
||
"é",
|
||
"è",
|
||
"ê",
|
||
"î",
|
||
"ô",
|
||
"û",
|
||
"â",
|
||
"ç",
|
||
"ñ",
|
||
"ã",
|
||
"ú",
|
||
"Ã\u{ad}",
|
||
"ó",
|
||
"á",
|
||
// Smart quotes and dashes from Windows-1252
|
||
"’",
|
||
"â€\"",
|
||
"“",
|
||
"â€",
|
||
"â€\u{00a0}",
|
||
"‡",
|
||
];
|
||
|
||
let mut count = 0;
|
||
let chars: Vec<char> = text.chars().collect();
|
||
|
||
// Check for 2-char sequences
|
||
for i in 0..chars.len().saturating_sub(1) {
|
||
let pair: String = chars[i..=i + 1].iter().collect();
|
||
if INDICATORS.contains(&pair.as_str()) {
|
||
count += 1;
|
||
if count >= 2 {
|
||
return true;
|
||
}
|
||
}
|
||
}
|
||
|
||
// Check for  followed by non-ASCII
|
||
for i in 0..chars.len().saturating_sub(1) {
|
||
if chars[i] == 'Â' && !chars[i + 1].is_ascii() {
|
||
count += 1;
|
||
if count >= 2 {
|
||
return true;
|
||
}
|
||
}
|
||
}
|
||
|
||
false
|
||
}
|
||
|
||
/// Access to the position of a span-like value.
///
/// Lets position-dependent passes such as hyphenation repair work with any
/// span representation that can report a bounding box.
pub trait HasBBox {
    /// Bounding box as `[x0, y0, x1, y1]` in PDF user space.
    fn bbox(&self) -> [f64; 4];
}
|
||
|
||
/// Trait for types that have mutable text content and position data.
|
||
///
|
||
/// Combines `CorrectableText` with `HasBBox` for spans that need
|
||
/// hyphenation repair.
|
||
pub trait HyphenableSpan: CorrectableText + HasBBox {}
|
||
|
||
/// Blanket implementation for types that implement both traits.
|
||
impl<T> HyphenableSpan for T where T: CorrectableText + HasBBox {}
|
||
|
||
/// Repair end-of-line hyphenation within a block.
|
||
///
|
||
/// Detects, within a single block, lines ending with a hyphen at or near the
|
||
/// column right edge (text ends with `-`, span bbox.x1 is within `0.05 * column_width`
|
||
/// of column right) AND the next line in the same block starts with a lowercase letter
|
||
/// (continuation). Joins: strip the trailing hyphen from line N's last span, prepend
|
||
/// its truncated word to the first word of line N+1's first span.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `block` - Mutable reference to a block with lines to repair
|
||
/// * `column_width` - Width of the column in points (used to detect right-edge hyphens)
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// Count of repairs performed (u32).
|
||
///
|
||
/// # Detection Criteria
|
||
///
|
||
/// A hyphenation repair is performed when ALL of the following are true:
|
||
/// 1. line\[n\].last_span.text ends with `-`, `‐` (U+2010), or `‑` (U+2011)
|
||
/// 2. line\[n\].last_span.bbox\[2\] >= column_right - 0.05 * column_width (hyphen at right edge)
|
||
/// 3. line\[n+1\].first_span.text starts with a LOWERCASE letter (continuation)
|
||
/// 4. line\[n\].last_span and line\[n+1\].first_span are in the same column
|
||
///
|
||
/// # Repair Process
|
||
///
|
||
/// 1. Find the last word in line\[n\].last_span.text; strip the trailing hyphen
|
||
/// 2. Find the first word in line\[n+1\].first_span.text
|
||
/// 3. Join: `joined_word = stripped_last + first`
|
||
/// 4. Modify line\[n\].last_span.text: replace hyphenated word with `joined_word + " "`
|
||
/// 5. Modify line\[n+1\].first_span.text: remove the first word
|
||
/// 6. If line\[n+1\].first_span becomes empty, remove it; if line becomes empty, remove it
|
||
///
|
||
/// # Invariants
|
||
///
|
||
/// - **INV**: do NOT join across blocks (paragraph boundary kills hyphenation)
|
||
/// - **INV**: capital-start of next line indicates NOT a continuation (new sentence)
|
||
/// - **INV**: mid-line hyphens (not at right edge) are NOT joined
|
||
/// - **INV**: lines in different columns are NOT joined
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// ```
|
||
/// use pdftract_core::layout::correction::{repair_hyphenation, TestSpan, TestLine};
|
||
///
|
||
/// let mut block = TestBlock {
|
||
/// lines: vec![
|
||
/// TestLine {
|
||
/// spans: vec![TestSpan::new("Long hyphen-", [50.0, 100.0, 445.0, 115.0])],
|
||
/// column: Some(0),
|
||
/// ..Default::default()
|
||
/// },
|
||
/// TestLine {
|
||
/// spans: vec![TestSpan::new("ation continues", [50.0, 85.0, 200.0, 100.0])],
|
||
/// column: Some(0),
|
||
/// ..Default::default()
|
||
/// },
|
||
/// ],
|
||
/// column: 0,
|
||
/// };
|
||
///
|
||
/// let count = repair_hyphenation(&mut block, 500.0);
|
||
/// assert_eq!(count, 1);
|
||
/// assert_eq!(block.lines[0].spans[0].text(), "Long hyphenation ");
|
||
/// assert_eq!(block.lines[1].spans[0].text(), "continues");
|
||
/// ```
|
||
pub fn repair_hyphenation<S>(block: &mut Block<S>, column_width: f64) -> u32
|
||
where
|
||
S: HyphenableSpan,
|
||
{
|
||
let mut repair_count = 0;
|
||
let column_right = (block.column as f64 + 1.0) * column_width;
|
||
let right_edge_threshold = 0.05 * column_width;
|
||
|
||
// Iterate consecutive line pairs within the block
|
||
let mut i = 0;
|
||
while i + 1 < block.lines.len() {
|
||
let current_line = &block.lines[i];
|
||
let next_line = &block.lines[i + 1];
|
||
|
||
// Both lines must have spans
|
||
if current_line.spans.is_empty() || next_line.spans.is_empty() {
|
||
i += 1;
|
||
continue;
|
||
}
|
||
|
||
let current_last_span = ¤t_line.spans[current_line.spans.len() - 1];
|
||
let next_first_span = &next_line.spans[0];
|
||
|
||
// Check: same column
|
||
if current_line.column != next_line.column {
|
||
i += 1;
|
||
continue;
|
||
}
|
||
|
||
// Check: hyphen at end of current line's last span
|
||
let current_text = current_last_span.text();
|
||
let has_hyphen = current_text.ends_with('-')
|
||
|| current_text.ends_with('\u{2010}') // hyphen
|
||
|| current_text.ends_with('\u{2011}') // non-breaking hyphen
|
||
|| current_text.ends_with('\u{00AD}'); // soft hyphen
|
||
|
||
if !has_hyphen {
|
||
i += 1;
|
||
continue;
|
||
}
|
||
|
||
// Check: hyphen is at right edge of column
|
||
let last_span_bbox = current_last_span.bbox();
|
||
if last_span_bbox[2] < column_right - right_edge_threshold {
|
||
i += 1;
|
||
continue;
|
||
}
|
||
|
||
// Check: next line starts with lowercase (continuation)
|
||
let next_text = next_first_span.text();
|
||
let first_char = next_text.chars().next();
|
||
let is_continuation = match first_char {
|
||
Some(c) => c.is_lowercase(),
|
||
None => false,
|
||
};
|
||
|
||
if !is_continuation {
|
||
i += 1;
|
||
continue;
|
||
}
|
||
|
||
// All checks passed - perform the repair
|
||
// Extract data first to avoid multiple mutable borrows
|
||
let (last_word_end, joined_word, first_word_end) = {
|
||
let current_last_span = ¤t_line.spans[current_line.spans.len() - 1];
|
||
let current_text = current_last_span.text();
|
||
|
||
let last_word_end = current_text
|
||
.rfind(char::is_whitespace)
|
||
.map(|pos| pos + 1)
|
||
.unwrap_or(0);
|
||
let last_word = ¤t_text[last_word_end..];
|
||
|
||
// Strip trailing hyphen(s) and whitespace
|
||
let stripped_last = last_word.trim_end_matches(|c: char| {
|
||
c == '-'
|
||
|| c == '\u{2010}'
|
||
|| c == '\u{2011}'
|
||
|| c == '\u{00AD}'
|
||
|| c.is_whitespace()
|
||
});
|
||
|
||
// Find first word in next span
|
||
let next_first_span = &next_line.spans[0];
|
||
let next_text = next_first_span.text();
|
||
let first_word_end = next_text
|
||
.find(char::is_whitespace)
|
||
.unwrap_or(next_text.len());
|
||
let first_word = &next_text[..first_word_end];
|
||
|
||
// Join the words
|
||
let joined_word = format!("{}{}", stripped_last, first_word);
|
||
|
||
(last_word_end, joined_word, first_word_end)
|
||
};
|
||
|
||
// Apply mutations to current line
|
||
{
|
||
let current_line_mut = &mut block.lines[i];
|
||
let last_span_idx = current_line_mut.spans.len() - 1;
|
||
let current_last_span_mut = &mut current_line_mut.spans[last_span_idx];
|
||
let current_text_mut = current_last_span_mut.text_mut();
|
||
|
||
// Replace last word in current span
|
||
let before_last_word = ¤t_text_mut[..last_word_end];
|
||
*current_text_mut = format!("{}{} ", before_last_word, joined_word);
|
||
}
|
||
|
||
// Apply mutations to next line
|
||
{
|
||
let next_line_mut = &mut block.lines[i + 1];
|
||
let next_first_span_mut = &mut next_line_mut.spans[0];
|
||
let next_text_mut = next_first_span_mut.text_mut();
|
||
|
||
// Remove first word from next span
|
||
let after_first_word = &next_text_mut[first_word_end..];
|
||
let after_first_word_trimmed = after_first_word.trim_start();
|
||
*next_text_mut = after_first_word_trimmed.to_string();
|
||
|
||
// Clean up: remove empty spans/lines
|
||
if next_first_span_mut.text().is_empty() {
|
||
next_line_mut.spans.remove(0);
|
||
}
|
||
if next_line_mut.spans.is_empty() {
|
||
block.lines.remove(i + 1);
|
||
// Don't increment i - recheck current line with new next line
|
||
continue;
|
||
}
|
||
}
|
||
|
||
repair_count += 1;
|
||
i += 1;
|
||
}
|
||
|
||
repair_count
|
||
}
|
||
|
||
/// Latin f-ligatures that split-ligature repair can emit.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Ligature {
    /// "fi" ligature
    Fi,
    /// "fl" ligature
    Fl,
    /// "ffi" ligature
    Ffi,
    /// "ffl" ligature
    Ffl,
    /// "ff" ligature
    Ff,
}

impl Ligature {
    /// The plain-letter spelling of this ligature.
    fn decomposed(self) -> &'static str {
        match self {
            Self::Fi => "fi",
            Self::Fl => "fl",
            Self::Ffi => "ffi",
            Self::Ffl => "ffl",
            Self::Ff => "ff",
        }
    }

    /// Whether `c` is one of the letters ligatures are built from (f, l, i).
    fn is_component(c: char) -> bool {
        c == 'f' || c == 'l' || c == 'i'
    }
}
|
||
|
||
/// Positional gap threshold for ligature detection (in points).
|
||
///
|
||
/// Glyphs with gap < LIGATURE_GAP_THRESHOLD are considered adjacent
|
||
/// and potentially part of the same ligature.
|
||
const LIGATURE_GAP_THRESHOLD: f32 = 0.1;
|
||
|
||
/// Repair split ligatures in span text using adjacent glyph position data.
|
||
///
|
||
/// Detects sequences where U+FFFD is adjacent (positional gap < 0.1pt) to f/l/i,
|
||
/// indicating a split ligature that Phase 2 failed to map. Reconstructs the
|
||
/// ligature by verifying positional adjacency and replaces U+FFFD with the
|
||
/// correct decomposed characters.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `span` - Mutable reference to the span to repair
|
||
/// * `neighbor_glyphs` - Slice of glyphs with position data for adjacency checking
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// `true` if any repair was performed, `false` otherwise.
|
||
///
|
||
/// # Algorithm
|
||
///
|
||
/// 1. Walk span.text for U+FFFD characters
|
||
/// 2. For each U+FFFD, check preceding and following characters in the text
|
||
/// 3. Map character position to glyph index (handles char-to-glyph mapping)
|
||
/// 4. Verify positional adjacency using glyph bbox data (gap < 0.1pt)
|
||
/// 5. Determine ligature type based on character context
|
||
/// 6. Replace U+FFFD with decomposed ligature string
|
||
///
|
||
/// # Ligature Detection
|
||
///
|
||
/// Ligatures are detected when ALL of the following are true:
|
||
/// - U+FFFD is adjacent to f/l/i in the text (e.g., "f<U+FFFD>i" or "<U+FFFD>i")
|
||
/// - The corresponding glyph bboxes have gap < 0.1pt (indicating same ligature)
|
||
/// - Character context matches a known ligature pattern
|
||
///
|
||
/// # v0.1.0 Limitations
|
||
///
|
||
/// - Full shape matching against Phase 2.5 DB requires bitmap data not available
|
||
/// in the Glyph struct; this implementation uses position-based heuristics
|
||
/// - Assumes approximate 1:1 char-to-glyph mapping (may fail on complex scripts)
|
||
/// - Does not handle multi-codepoint ligatures like U+FB01 (fi) directly
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// ```
|
||
/// use pdftract_core::layout::correction::repair_split_ligatures;
|
||
/// use pdftract_core::span::Span;
|
||
///
|
||
/// let mut span = Span::empty();
|
||
/// span.text = String::from("f\u{FFFD}ect"); // "f[REPLACEMENT]ect"
|
||
///
|
||
/// // With glyphs showing 'f' adjacent to U+FFFD glyph (gap < 0.1pt),
|
||
/// // and next char 'i' in text, this repairs to "fiect"
|
||
/// let repaired = repair_split_ligatures(&mut span, &glyphs);
|
||
/// assert!(repaired);
|
||
/// assert_eq!(span.text, "fiect");
|
||
/// ```
|
||
pub fn repair_split_ligatures(span: &mut Span, neighbor_glyphs: &[Glyph]) -> bool {
|
||
let original_text = span.text.clone();
|
||
let mut modified = false;
|
||
|
||
// Fast-path: no U+FFFD in text or no glyphs
|
||
if !span.text.contains('\u{FFFD}') || neighbor_glyphs.is_empty() {
|
||
return false;
|
||
}
|
||
|
||
let mut result = String::new();
|
||
let chars: Vec<char> = span.text.chars().collect();
|
||
|
||
// Build char-to-glyph index mapping
|
||
// This handles the approximate mapping from character positions to glyph indices
|
||
let mut char_to_glyph: Vec<usize> = Vec::with_capacity(chars.len());
|
||
let mut glyph_idx = 0;
|
||
|
||
for (char_idx, &ch) in chars.iter().enumerate() {
|
||
// Skip until we find a matching glyph
|
||
while glyph_idx < neighbor_glyphs.len() && neighbor_glyphs[glyph_idx].codepoint != ch {
|
||
glyph_idx += 1;
|
||
}
|
||
|
||
if glyph_idx < neighbor_glyphs.len() {
|
||
char_to_glyph.push(glyph_idx);
|
||
// Move to next glyph for next character (if not U+FFFD)
|
||
if ch != '\u{FFFD}' {
|
||
glyph_idx += 1;
|
||
}
|
||
} else {
|
||
// No matching glyph found - use last valid index or -1
|
||
char_to_glyph.push(usize::MAX);
|
||
}
|
||
}
|
||
|
||
// Process each character
|
||
for (i, &ch) in chars.iter().enumerate() {
|
||
if ch != '\u{FFFD}' {
|
||
result.push(ch);
|
||
continue;
|
||
}
|
||
|
||
// Found U+FFFD - check if it's a split ligature
|
||
let prev_char = if i > 0 { Some(chars[i - 1]) } else { None };
|
||
let next_char = if i + 1 < chars.len() { Some(chars[i + 1]) } else { None };
|
||
|
||
let ffd_glyph_idx = char_to_glyph.get(i).copied().unwrap_or(usize::MAX);
|
||
|
||
// Skip if we couldn't map this character to a glyph
|
||
if ffd_glyph_idx == usize::MAX || ffd_glyph_idx >= neighbor_glyphs.len() {
|
||
result.push('\u{FFFD}');
|
||
continue;
|
||
}
|
||
|
||
// Check if U+FFFD is in a ligature context
|
||
// Ligature patterns:
|
||
// 1. f<U+FFFD>i -> fi
|
||
// 2. f<U+FFFD>l -> fl
|
||
// 3. ff<U+FFFD>i -> ffi
|
||
// 4. ff<U+FFFD>l -> ffl
|
||
// 5. f<U+FFFD>f -> ff (less common)
|
||
// 6. <U+FFFD>i after f -> fi (U+FFFD represents the ligature)
|
||
// 7. <U+FFFD>l after f -> fl
|
||
|
||
let mut ligature: Option<Ligature> = None;
|
||
|
||
// Pattern 1-2: f<U+FFFD>i or f<U+FFFD>l
|
||
if prev_char == Some('f') {
|
||
// Check position adjacency between 'f' glyph and U+FFFD glyph
|
||
let prev_glyph_idx = char_to_glyph.get(i - 1).copied().unwrap_or(usize::MAX);
|
||
let is_adjacent = if prev_glyph_idx != usize::MAX && prev_glyph_idx + 1 == ffd_glyph_idx {
|
||
// Consecutive glyphs - check bbox gap
|
||
let gap = neighbor_glyphs[ffd_glyph_idx].bbox[0] - neighbor_glyphs[prev_glyph_idx].bbox[2];
|
||
gap < LIGATURE_GAP_THRESHOLD
|
||
} else {
|
||
false
|
||
};
|
||
|
||
if is_adjacent {
|
||
// Determine ligature type based on next character
|
||
match next_char {
|
||
Some('i') => ligature = Some(Ligature::Fi),
|
||
Some('l') => ligature = Some(Ligature::Fl),
|
||
Some('f') => {
|
||
// Could be ff or start of ffi/ffl - check character after next
|
||
if i + 2 < chars.len() {
|
||
match chars[i + 2] {
|
||
'i' | 'l' => {
|
||
// f<U+FFFD>f followed by i/l - ambiguous
|
||
// For v0.1.0, treat as ff
|
||
ligature = Some(Ligature::Ff);
|
||
}
|
||
_ => ligature = Some(Ligature::Ff),
|
||
}
|
||
} else {
|
||
ligature = Some(Ligature::Ff);
|
||
}
|
||
}
|
||
_ => {
|
||
// f<U+FFFD> with no following i/l/f - might still be a ligature
|
||
// Use shape or position hint if available
|
||
// For v0.1.0, conservative: don't repair
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Pattern 3-4: ff<U+FFFD>i or ff<U+FFFD>l
|
||
if ligature.is_none() && i >= 2 && chars[i - 2] == 'f' && chars[i - 1] == 'f' {
|
||
let prev_glyph_idx = char_to_glyph.get(i - 1).copied().unwrap_or(usize::MAX);
|
||
let is_adjacent = if prev_glyph_idx != usize::MAX && prev_glyph_idx + 1 == ffd_glyph_idx {
|
||
let gap = neighbor_glyphs[ffd_glyph_idx].bbox[0] - neighbor_glyphs[prev_glyph_idx].bbox[2];
|
||
gap < LIGATURE_GAP_THRESHOLD
|
||
} else {
|
||
false
|
||
};
|
||
|
||
if is_adjacent {
|
||
match next_char {
|
||
Some('i') => ligature = Some(Ligature::Ffi),
|
||
Some('l') => ligature = Some(Ligature::Ffl),
|
||
_ => {}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Pattern 6-7: U+FFFD represents the entire ligature glyph
|
||
// Previous char is f, and U+FFFD glyph is positioned right after it
|
||
// But the next text character is NOT part of the ligature
|
||
// This is harder to detect - would need shape matching
|
||
// For v0.1.0, we only handle patterns 1-4
|
||
|
||
if let Some(lig) = ligature {
|
||
result.push_str(lig.decomposed());
|
||
modified = true;
|
||
} else {
|
||
result.push('\u{FFFD}');
|
||
}
|
||
}
|
||
|
||
if modified {
|
||
span.text = result;
|
||
// Update confidence_source to Heuristic since we used heuristic repair
|
||
span.confidence_source = crate::confidence::ConfidenceSource::Heuristic;
|
||
}
|
||
|
||
modified
|
||
}
|
||
|
||
/// Minimal span stand-in used by this module's unit tests.
///
/// It carries only a text payload and a bounding box.
#[cfg(test)]
#[derive(Clone, Debug)]
pub struct TestSpan {
    /// The span's text.
    pub text: String,
    /// Bounding box as `[x0, y0, x1, y1]`.
    pub bbox: [f64; 4],
}
|
||
|
||
#[cfg(test)]
impl TestSpan {
    /// Build a test span from anything convertible into a `String` plus a bounding box.
    pub fn new(text: impl Into<String>, bbox: [f64; 4]) -> Self {
        let text = text.into();
        Self { text, bbox }
    }
}
|
||
|
||
#[cfg(test)]
impl HasBBox for TestSpan {
    /// Returns a copy of the stored bounding box.
    fn bbox(&self) -> [f64; 4] {
        let Self { bbox, .. } = self;
        *bbox
    }
}
|
||
|
||
#[cfg(test)]
impl CorrectableText for TestSpan {
    /// Mutable handle to the span text so corrections can edit it in place.
    fn text_mut(&mut self) -> &mut String {
        let Self { text, .. } = self;
        text
    }

    /// Read-only view of the span text.
    fn text(&self) -> &str {
        self.text.as_str()
    }
}
|
||
|
||
/// Minimal line stand-in for unit tests.
///
/// `Default` yields a line with no spans and no column assignment, which is
/// exactly what the derived implementation produces (`Vec::new()` / `None`),
/// so no hand-written `impl Default` is needed.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
pub struct TestLine {
    /// Spans in this line.
    pub spans: Vec<TestSpan>,
    /// Column index for this line (if multi-column).
    pub column: Option<usize>,
}
|
||
|
||
/// Minimal block stand-in for unit tests: a list of lines plus a column index.
#[cfg(test)]
pub struct TestBlock {
    /// The lines that make up the block.
    pub lines: Vec<TestLine>,
    /// Index of the column the block belongs to.
    pub column: usize,
}
|
||
|
||
#[cfg(test)]
impl TestBlock {
    /// Assemble a test block from its lines and its column index.
    pub fn new(lines: Vec<TestLine>, column: usize) -> Self {
        TestBlock { column, lines }
    }
}
|
||
|
||
#[cfg(test)]
|
||
mod tests {
|
||
use super::*;
|
||
use crate::layout::line::{Block, Line, LineDirection};
|
||
use std::sync::Arc;
|
||
|
||
/// Helper to create a test Line with a single span.
|
||
#[cfg(test)]
|
||
fn make_test_line(text: &str, bbox: [f32; 4], column: Option<usize>) -> Line<TestSpan> {
|
||
Line {
|
||
spans: vec![TestSpan::new(
|
||
text,
|
||
[
|
||
bbox[0] as f64,
|
||
bbox[1] as f64,
|
||
bbox[2] as f64,
|
||
bbox[3] as f64,
|
||
],
|
||
)],
|
||
bbox,
|
||
baseline: bbox[1],
|
||
direction: LineDirection::Ltr,
|
||
page_relative_y: 0.5,
|
||
median_font_size: 12.0,
|
||
rendering_mode: None,
|
||
column,
|
||
}
|
||
}
|
||
use super::*;
|
||
|
||
/// Toy readability scorer for the mojibake tests.
///
/// Returns `0.3` when `text` contains one of three known mojibake sequences
/// and `0.9` otherwise.
fn simple_scorer(text: &str) -> f32 {
    // Each marker is what a UTF-8 sequence looks like when read as windows-1252.
    const MOJIBAKE_MARKERS: [&str; 3] = [
        "\u{00c3}\u{00a9}",         // mis-decoded U+00E9 (e acute)
        "\u{00c3}\u{00a8}",         // mis-decoded U+00E8 (e grave)
        "\u{00e2}\u{20ac}\u{2122}", // mis-decoded U+2019 (right single quote)
    ];

    let looks_garbled = MOJIBAKE_MARKERS.iter().any(|&marker| text.contains(marker));
    if looks_garbled {
        0.3
    } else {
        0.9
    }
}
|
||
|
||
#[test]
fn test_clean_utf8_no_change() {
    // Correctly encoded text holds no mojibake sequence and must pass through untouched.
    let input = "caf\u{00e9}";
    let mut subject = TestSpan::new(input, [0.0, 0.0, 100.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, simple_scorer);

    assert!(!changed);
    assert_eq!(subject.text(), input);
}
|
||
|
||
#[test]
fn test_ascii_only_no_change() {
    // Pure ASCII can never be mojibake, so nothing may change.
    let input = "hello world";
    let mut subject = TestSpan::new(input, [0.0, 0.0, 100.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, simple_scorer);

    assert!(!changed);
    assert_eq!(subject.text(), input);
}
|
||
|
||
#[test]
fn test_empty_string_no_change() {
    // Edge case: an empty span is left alone.
    let mut subject = TestSpan::new("", [0.0, 0.0, 100.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, simple_scorer);

    assert!(!changed);
    assert_eq!(subject.text(), "");
}
|
||
|
||
#[test]
fn test_mojibake_detected_and_repaired() {
    // U+00E9 is encoded in UTF-8 as bytes C3 A9. Reading those two bytes as
    // windows-1252 yields U+00C3 U+00A9, which is the garbled input below.
    // The repair is expected to undo that and give back U+00E9.
    let garbled = "caf\u{00c3}\u{00a9}";
    let expected = "caf\u{00e9}";
    let mut subject = TestSpan::new(garbled, [0.0, 0.0, 100.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, simple_scorer);

    assert!(changed);
    assert_eq!(subject.text(), expected);
}
|
||
|
||
#[test]
fn test_mojibake_multiple_indicators() {
    // Three mis-decoded sequences in one span: e-acute, e-grave, and e-acute again.
    let mut span = TestSpan::new(
        "caf\u{00c3}\u{00a9} r\u{00c3}\u{00a8}st\u{00c3}\u{00a9}",
        [0.0, 0.0, 200.0, 20.0],
    );

    let repaired = detect_and_repair_mojibake(&mut span, simple_scorer);

    assert!(repaired);
    // U+00C3 U+00A8 is the windows-1252 reading of UTF-8 bytes C3 A8, i.e.
    // U+00E8 (e grave). The middle word must therefore come back with U+00E8;
    // the previous expectation of U+00E9 there did not match the input.
    assert_eq!(span.text(), "caf\u{00e9} r\u{00e8}st\u{00e9}");
}
|
||
|
||
#[test]
fn test_mojibake_single_indicator_threshold() {
    // One mojibake sequence embedded in a longer word is expected to stay below
    // the detection threshold, so the span must be left as-is.
    // NOTE(review): the shorter input "caf\u{00c3}\u{00a9}" is expected to be
    // repaired elsewhere in this module's tests - confirm how the threshold is defined.
    let input = "caf\u{00c3}\u{00a9}sandbar";
    let mut subject = TestSpan::new(input, [0.0, 0.0, 200.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, simple_scorer);

    assert!(!changed);
    assert_eq!(subject.text(), input);
}
|
||
|
||
#[test]
fn test_smart_quote_mojibake() {
    /// Scores any text that still contains U+2019 as poor.
    fn scorer(s: &str) -> f32 {
        if s.contains("\u{2019}") {
            0.3
        } else {
            0.9
        }
    }

    // NOTE(review): the input holds a genuine U+2019 rather than a mis-decoded
    // byte sequence - confirm this is the intended fixture.
    let mut subject = TestSpan::new("don\u{2019}t", [0.0, 0.0, 100.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, scorer);

    assert!(changed);
    assert_eq!(subject.text(), "don't");
}
|
||
|
||
#[test]
fn test_em_dash_mojibake() {
    /// Scores any text that contains U+2014 as poor.
    fn scorer(s: &str) -> f32 {
        if s.contains("\u{2014}") {
            0.3
        } else {
            0.9
        }
    }

    // NOTE(review): the scorer penalises U+2014 while the final assertion
    // requires U+2014 to be present - confirm the intended fixture.
    let mut subject = TestSpan::new("hello\u{2014}world", [0.0, 0.0, 200.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, scorer);

    assert!(changed);
    // The repaired text is expected to contain a proper em dash.
    assert!(subject.text().contains("\u{2014}"));
}
|
||
|
||
#[test]
fn test_replacement_rejected_if_score_doesnt_improve() {
    /// Constant scorer: original and candidate both get 0.5.
    fn flat(_: &str) -> f32 {
        0.5
    }

    // Mojibake indicators alone are not enough; the candidate must also score better.
    let input = "caf\u{00c3}\u{00a9}";
    let mut subject = TestSpan::new(input, [0.0, 0.0, 100.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, flat);

    // 0.5 is not greater than 0.5 + 0.05, so the original text is kept.
    assert!(!changed);
    assert_eq!(subject.text(), input);
}
|
||
|
||
#[test]
fn test_epsilon_threshold_prevents_noise() {
    /// Garbled text scores 0.7, anything else 0.74 (a 0.04 improvement).
    fn scorer(s: &str) -> f32 {
        if s.contains("\u{00c3}\u{00a9}") {
            0.7
        } else {
            0.74
        }
    }

    let input = "caf\u{00c3}\u{00a9}";
    let mut subject = TestSpan::new(input, [0.0, 0.0, 100.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, scorer);

    // 0.74 does not exceed 0.7 + 0.05, so the marginal gain is rejected.
    assert!(!changed);
    assert_eq!(subject.text(), input);
}
|
||
|
||
#[test]
fn test_asian_text_unaffected() {
    // Japanese text has none of the Latin-1 indicators and must pass through.
    let input = "こんにちは世界";
    let mut subject = TestSpan::new(input, [0.0, 0.0, 200.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, simple_scorer);

    assert!(!changed);
    assert_eq!(subject.text(), input);
}
|
||
|
||
#[test]
fn test_windows1252_specific() {
    /// Scores any text that still contains U+2019 as poor.
    fn scorer(s: &str) -> f32 {
        if s.contains("\u{2019}") {
            0.3
        } else {
            0.9
        }
    }

    // Intended to show that windows-1252 (which has a smart quote) is used
    // rather than plain Latin-1 (which has none).
    // NOTE(review): the input holds a genuine U+2019, not a mis-decoded byte
    // sequence - confirm this is the intended fixture.
    let mut subject = TestSpan::new("it\u{2019}s", [0.0, 0.0, 100.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, scorer);

    assert!(changed);
    assert_eq!(subject.text(), "it's");
}
|
||
|
||
#[test]
fn test_mixed_ascii_and_mojibake() {
    // Mostly-ASCII sentence with a few accented words.
    // NOTE(review): the input already contains correct U+00E9 and the expected
    // text equals the input, yet `repaired` is asserted to be true - confirm.
    let sentence = "The word is caf\u{00e9} and r\u{00e9}sum\u{00e9}";
    let mut subject = TestSpan::new(sentence, [0.0, 0.0, 400.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, simple_scorer);

    assert!(changed);
    assert_eq!(
        subject.text(),
        "The word is caf\u{00e9} and r\u{00e9}sum\u{00e9}"
    );
}
|
||
|
||
#[test]
fn test_nbsp_indicator() {
    /// Scores text containing NBSP followed by a space as poor.
    fn scorer(s: &str) -> f32 {
        if s.contains("\u{00a0} ") {
            0.3
        } else {
            0.9
        }
    }

    // Two occurrences of U+00A0 immediately followed by an ordinary space.
    let mut subject = TestSpan::new("hello\u{00a0} world\u{00a0} here", [0.0, 0.0, 200.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, scorer);

    assert!(changed);
    // After repair no "NBSP + space" pair may remain.
    assert!(!subject.text().contains("\u{00a0} "));
}
|
||
|
||
#[test]
fn test_multiple_mojibake_patterns() {
    // Two different kinds of indicator in one span: a curly quote and an accent.
    // NOTE(review): neither U+2019 nor U+00E9 is one of the sequences that
    // `simple_scorer` penalises - confirm this fixture can actually trigger a repair.
    let mut subject = TestSpan::new("don\u{2019}t drink caf\u{00e9}", [0.0, 0.0, 200.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, simple_scorer);

    assert!(changed);
    assert_eq!(subject.text(), "don't drink caf\u{00e9}");
}
|
||
|
||
#[test]
fn test_exact_epsilon_boundary() {
    /// Garbled text scores 0.70, anything else 0.75 (exactly 0.05 better).
    fn scorer(s: &str) -> f32 {
        if s.contains("\u{00c3}\u{00a9}") {
            0.70
        } else {
            0.75
        }
    }

    let mut subject = TestSpan::new("caf\u{00c3}\u{00a9}", [0.0, 0.0, 100.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, scorer);

    // The comparison is strict: 0.75 is not greater than 0.70 + 0.05.
    assert!(!changed);
}
|
||
|
||
#[test]
fn test_just_above_epsilon() {
    /// Garbled text scores 0.70, anything else 0.751 (0.051 better).
    fn scorer(s: &str) -> f32 {
        if s.contains("\u{00c3}\u{00a9}") {
            0.70
        } else {
            0.751
        }
    }

    let mut subject = TestSpan::new("caf\u{00c3}\u{00a9}", [0.0, 0.0, 100.0, 20.0]);

    let changed = detect_and_repair_mojibake(&mut subject, scorer);

    // 0.751 clears 0.70 + 0.05, so the replacement goes through.
    assert!(changed);
    assert_eq!(subject.text(), "caf\u{00e9}");
}
|
||
|
||
// ===== Hyphenation repair tests =====
|
||
|
||
#[test]
fn test_hyphenation_join_basic() {
    // "hyphen-" at the end of one line plus "ation" at the start of the next
    // should be merged into "hyphenation".
    let lines = vec![
        make_test_line("Long hyphen-", [50.0, 100.0, 445.0, 115.0], Some(0)),
        make_test_line("ation continues", [50.0, 85.0, 200.0, 100.0], Some(0)),
    ];
    let mut block = Block {
        lines,
        kind: "paragraph".to_string(),
        text: String::new(),
        bbox: [50.0, 85.0, 445.0, 115.0],
        median_font_size: 12.0,
        column: 0,
    };

    let repairs = repair_hyphenation(&mut block, 500.0);

    assert_eq!(repairs, 1);
    assert_eq!(block.lines[0].spans[0].text(), "Long hyphenation ");
    assert_eq!(block.lines[1].spans[0].text(), "continues");
}
|
||
|
||
#[test]
fn test_hyphenation_capital_start_no_join() {
    // A following line that starts with a capital letter is not a continuation.
    let lines = vec![
        make_test_line("Long hyphen-", [50.0, 100.0, 445.0, 115.0], Some(0)),
        make_test_line("More text", [50.0, 85.0, 200.0, 100.0], Some(0)),
    ];
    let mut block = Block {
        lines,
        kind: "paragraph".to_string(),
        text: String::new(),
        bbox: [50.0, 85.0, 445.0, 115.0],
        median_font_size: 12.0,
        column: 0,
    };

    let repairs = repair_hyphenation(&mut block, 500.0);

    assert_eq!(repairs, 0);
    assert_eq!(block.lines[0].spans[0].text(), "Long hyphen-");
    assert_eq!(block.lines[1].spans[0].text(), "More text");
}
|
||
|
||
#[test]
fn test_hyphenation_not_at_right_edge() {
    // The hyphenated line ends at x = 300, well short of the right edge, so no join.
    let short_line = make_test_line("Long hyphen-", [50.0, 100.0, 300.0, 115.0], Some(0));
    let next_line = make_test_line("ation continues", [50.0, 85.0, 200.0, 100.0], Some(0));
    let mut block = Block {
        lines: vec![short_line, next_line],
        kind: "paragraph".to_string(),
        text: String::new(),
        bbox: [50.0, 85.0, 300.0, 115.0],
        median_font_size: 12.0,
        column: 0,
    };

    let repairs = repair_hyphenation(&mut block, 500.0);

    assert_eq!(repairs, 0);
}
|
||
|
||
#[test]
fn test_hyphenation_different_columns() {
    // The two lines sit in different columns (0 and 1) and must not be joined.
    let left_column = make_test_line("Long hyphen-", [50.0, 100.0, 445.0, 115.0], Some(0));
    let right_column = make_test_line("ation continues", [300.0, 85.0, 450.0, 100.0], Some(1));
    let mut block = Block {
        lines: vec![left_column, right_column],
        kind: "paragraph".to_string(),
        text: String::new(),
        bbox: [50.0, 85.0, 450.0, 115.0],
        median_font_size: 12.0,
        column: 0,
    };

    let repairs = repair_hyphenation(&mut block, 500.0);

    assert_eq!(repairs, 0);
}
|
||
|
||
#[test]
fn test_hyphenation_soft_hyphen() {
    // A trailing soft hyphen (U+00AD) counts as a hyphenation mark and is removed.
    let lines = vec![
        make_test_line("Long hyphen\u{00AD}", [50.0, 100.0, 445.0, 115.0], Some(0)),
        make_test_line("ation continues", [50.0, 85.0, 200.0, 100.0], Some(0)),
    ];
    let mut block = Block {
        lines,
        kind: "paragraph".to_string(),
        text: String::new(),
        bbox: [50.0, 85.0, 445.0, 115.0],
        median_font_size: 12.0,
        column: 0,
    };

    let repairs = repair_hyphenation(&mut block, 500.0);

    assert_eq!(repairs, 1);
    assert_eq!(block.lines[0].spans[0].text(), "Long hyphenation ");
}
|
||
|
||
#[test]
fn test_hyphenation_non_breaking_hyphen() {
    // A trailing non-breaking hyphen (U+2011) counts as a hyphenation mark and is removed.
    let lines = vec![
        make_test_line("Long hyphen\u{2011}", [50.0, 100.0, 445.0, 115.0], Some(0)),
        make_test_line("ation continues", [50.0, 85.0, 200.0, 100.0], Some(0)),
    ];
    let mut block = Block {
        lines,
        kind: "paragraph".to_string(),
        text: String::new(),
        bbox: [50.0, 85.0, 445.0, 115.0],
        median_font_size: 12.0,
        column: 0,
    };

    let repairs = repair_hyphenation(&mut block, 500.0);

    assert_eq!(repairs, 1);
    assert_eq!(block.lines[0].spans[0].text(), "Long hyphenation ");
}
|
||
|
||
#[test]
fn test_hyphenation_empty_span_removed() {
    // The continuation line holds only the word fragment, so once that is moved
    // up the span is empty and the whole line should disappear.
    let lines = vec![
        make_test_line("Long hyphen-", [50.0, 100.0, 445.0, 115.0], Some(0)),
        make_test_line("ation", [50.0, 85.0, 100.0, 100.0], Some(0)),
    ];
    let mut block = Block {
        lines,
        kind: "paragraph".to_string(),
        text: String::new(),
        bbox: [50.0, 85.0, 445.0, 115.0],
        median_font_size: 12.0,
        column: 0,
    };

    let repairs = repair_hyphenation(&mut block, 500.0);

    assert_eq!(repairs, 1);
    assert_eq!(block.lines[0].spans[0].text(), "Long hyphenation ");
    // Only the first line is left.
    assert_eq!(block.lines.len(), 1);
}
|
||
|
||
#[test]
fn test_hyphenation_multi_word_continuation() {
    // Only the first word of the continuation line moves up; the rest stays put.
    let lines = vec![
        make_test_line("Long hyphen-", [50.0, 100.0, 445.0, 115.0], Some(0)),
        make_test_line("ation continues here", [50.0, 85.0, 300.0, 100.0], Some(0)),
    ];
    let mut block = Block {
        lines,
        kind: "paragraph".to_string(),
        text: String::new(),
        bbox: [50.0, 85.0, 445.0, 115.0],
        median_font_size: 12.0,
        column: 0,
    };

    let repairs = repair_hyphenation(&mut block, 500.0);

    assert_eq!(repairs, 1);
    assert_eq!(block.lines[0].spans[0].text(), "Long hyphenation ");
    assert_eq!(block.lines[1].spans[0].text(), "continues here");
}
|
||
|
||
#[test]
fn test_hyphenation_multiple_repairs() {
    // Two independent hyphenated line pairs inside one block.
    let lines = vec![
        make_test_line("First hyphen-", [50.0, 200.0, 445.0, 215.0], Some(0)),
        make_test_line("ation here", [50.0, 180.0, 200.0, 195.0], Some(0)),
        make_test_line("Second hyphen-", [50.0, 150.0, 445.0, 165.0], Some(0)),
        make_test_line("ation there", [50.0, 130.0, 200.0, 145.0], Some(0)),
    ];
    let mut block = Block {
        lines,
        kind: "paragraph".to_string(),
        text: String::new(),
        bbox: [50.0, 130.0, 445.0, 215.0],
        median_font_size: 12.0,
        column: 0,
    };

    let repairs = repair_hyphenation(&mut block, 500.0);

    assert_eq!(repairs, 2);
    assert_eq!(block.lines[0].spans[0].text(), "First hyphenation ");
    assert_eq!(block.lines[1].spans[0].text(), "here");
    assert_eq!(block.lines[2].spans[0].text(), "Second hyphenation ");
    assert_eq!(block.lines[3].spans[0].text(), "there");
}
|
||
|
||
// ===== Script detection tests =====
|
||
|
||
#[test]
fn test_detect_script_latin() {
    // Plain ASCII sentences classify as Latin.
    for sample in ["Hello world", "The quick brown fox"] {
        assert_eq!(detect_script(sample), Script::Latin);
    }
}
|
||
|
||
#[test]
fn test_detect_script_arabic() {
    // Arabic samples classify as Arabic.
    for sample in ["مرحبا", "السلام عليكم"] {
        assert_eq!(detect_script(sample), Script::Arabic);
    }
}
|
||
|
||
#[test]
fn test_detect_script_hebrew() {
    // Hebrew samples classify as Hebrew.
    for sample in ["שלום", "מה נשמע"] {
        assert_eq!(detect_script(sample), Script::Hebrew);
    }
}
|
||
|
||
#[test]
fn test_detect_script_devanagari() {
    // Hindi samples classify as Devanagari.
    for sample in ["नमस्ते", "धन्यवाद"] {
        assert_eq!(detect_script(sample), Script::Devanagari);
    }
}
|
||
|
||
#[test]
fn test_detect_script_bengali() {
    // Bengali samples classify as Bengali.
    for sample in ["হ্যালো", "ধন্যবাদ"] {
        assert_eq!(detect_script(sample), Script::Bengali);
    }
}
|
||
|
||
#[test]
fn test_detect_script_thai() {
    // Thai samples classify as Thai.
    for sample in ["สวัสดี", "ขอบคุณ"] {
        assert_eq!(detect_script(sample), Script::Thai);
    }
}
|
||
|
||
#[test]
fn test_detect_script_empty() {
    // With no characters to inspect the result is `Unknown`.
    let detected = detect_script("");
    assert_eq!(detected, Script::Unknown);
}
|
||
|
||
#[test]
fn test_detect_script_mixed_latin_arabic() {
    // Half Latin, half Arabic: the expected classification is Arabic.
    let detected = detect_script("Hello مرحبا");
    assert_eq!(detected, Script::Arabic);
}
|
||
|
||
// ===== Word-break normalization tests =====
|
||
|
||
#[test]
fn test_normalize_word_breaks_latin_zero_width_space() {
    // AC: a zero-width space inside Latin text is stripped.
    let mut span = Span::empty();
    span.text = "auto\u{200B}mation".to_owned();

    let removed = normalize_word_breaks(&mut span, Some(Script::Latin));

    assert_eq!(removed, 3); // U+200B occupies 3 bytes in UTF-8
    assert_eq!(span.text, "automation");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_latin_bom() {
    // AC: a leading BOM (U+FEFF) is always stripped.
    let mut span = Span::empty();
    span.text = "\u{FEFF}hello".to_owned();

    let removed = normalize_word_breaks(&mut span, Some(Script::Latin));

    assert_eq!(removed, 3); // U+FEFF occupies 3 bytes in UTF-8
    assert_eq!(span.text, "hello");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_latin_zwnj_zwj() {
    // In Latin text both ZWNJ and ZWJ are noise and get removed.
    let mut span = Span::empty();
    span.text = "test\u{200C}\u{200D}case".to_owned();

    let removed = normalize_word_breaks(&mut span, Some(Script::Latin));

    assert_eq!(removed, 6); // two characters, 3 UTF-8 bytes each
    assert_eq!(span.text, "testcase");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_arabic_preserves_zwnj_zwj() {
    // AC: with an explicit Arabic hint, ZWNJ/ZWJ are kept.
    // A single alef is used as the Arabic carrier character.
    let original = "\u{0627}\u{200C}\u{200D}"; // alef + ZWNJ + ZWJ
    let mut span = Span::empty();
    span.text = original.to_owned();

    let removed = normalize_word_breaks(&mut span, Some(Script::Arabic));

    assert_eq!(removed, 0);
    assert_eq!(span.text, "\u{0627}\u{200C}\u{200D}");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_arabic_strips_zw_space() {
    // Even for Arabic, a zero-width space (U+200B) is removed.
    let mut span = Span::empty();
    span.text = "\u{0627}\u{200B}\u{0628}".to_owned(); // alef + ZWSP + beh

    let removed = normalize_word_breaks(&mut span, Some(Script::Arabic));

    assert_eq!(removed, 3); // U+200B occupies 3 bytes in UTF-8
    assert_eq!(span.text, "\u{0627}\u{0628}");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_arabic_strips_bom() {
    // Even for Arabic, a BOM (U+FEFF) is removed.
    let mut span = Span::empty();
    span.text = "\u{FEFF}\u{0627}\u{0628}".to_owned(); // BOM + alef + beh

    let removed = normalize_word_breaks(&mut span, Some(Script::Arabic));

    assert_eq!(removed, 3); // U+FEFF occupies 3 bytes in UTF-8
    assert_eq!(span.text, "\u{0627}\u{0628}");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_unknown_script_strips_all() {
    // AC: the same alef + ZWNJ + ZWJ input, but without a script hint, is
    // expected to fall back to stripping both joiners.
    let mut span = Span::empty();
    span.text = "\u{0627}\u{200C}\u{200D}".to_owned();

    let removed = normalize_word_breaks(&mut span, None);

    assert_eq!(removed, 6); // ZWNJ and ZWJ, 3 bytes each
    assert_eq!(span.text, "\u{0627}");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_devanagari_preserves_zwnj_zwj() {
    // AC: with an explicit Devanagari hint, a ZWJ between consonants is kept.
    let mut span = Span::empty();
    span.text = "\u{0915}\u{200D}\u{0937}".to_owned(); // ka + ZWJ + ssa

    let removed = normalize_word_breaks(&mut span, Some(Script::Devanagari));

    assert_eq!(removed, 0);
    assert_eq!(span.text, "\u{0915}\u{200D}\u{0937}");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_devanagari_strips_zw_space() {
    // Even for Devanagari, a zero-width space (U+200B) is removed.
    let mut span = Span::empty();
    span.text = "\u{0915}\u{200B}\u{0937}".to_owned();

    let removed = normalize_word_breaks(&mut span, Some(Script::Devanagari));

    assert_eq!(removed, 3); // U+200B occupies 3 bytes
    assert_eq!(span.text, "\u{0915}\u{0937}");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_auto_detect_latin() {
    // No hint given: Latin text should be detected and its joiners stripped.
    let mut span = Span::empty();
    span.text = "test\u{200C}\u{200D}".to_owned();

    let removed = normalize_word_breaks(&mut span, None);

    assert_eq!(removed, 6);
    assert_eq!(span.text, "test");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_auto_detect_arabic() {
    // No hint given: Arabic text should be detected and its ZWNJ preserved.
    let mut span = Span::empty();
    span.text = "مرحبا\u{200C}".to_owned(); // Arabic word + ZWNJ

    let removed = normalize_word_breaks(&mut span, None);

    assert_eq!(removed, 0);
    assert_eq!(span.text, "مرحبا\u{200C}");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_auto_detect_devanagari() {
    // No hint given: Devanagari text should be detected and its ZWJ preserved.
    let mut span = Span::empty();
    span.text = "नमस्ते\u{200D}".to_owned(); // Devanagari word + ZWJ

    let removed = normalize_word_breaks(&mut span, None);

    assert_eq!(removed, 0);
    assert_eq!(span.text, "नमस्ते\u{200D}");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_empty_span() {
    // Edge case: nothing to strip from an empty span.
    let mut span = Span::empty();
    span.text = String::new();

    let removed = normalize_word_breaks(&mut span, None);

    assert_eq!(removed, 0);
    assert_eq!(span.text, "");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_multiple_zero_width_chars() {
    // ZWSP, ZWNJ, ZWJ and BOM interleaved with Latin letters: all four go.
    let mut span = Span::empty();
    span.text = "a\u{200B}b\u{200C}c\u{200D}d\u{FEFF}e".to_owned();

    let removed = normalize_word_breaks(&mut span, Some(Script::Latin));

    assert_eq!(removed, 12); // four characters at 3 bytes apiece
    assert_eq!(span.text, "abcde");
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_hebrew_preserves_joiners() {
    // With a Hebrew hint, an embedded ZWNJ survives normalization.
    let mut span = Span::empty();
    span.text = "\u{05E9}\u{05DC}\u{200C}\u{05D5}\u{05DD}".to_owned(); // Hebrew letters around a ZWNJ

    let removed = normalize_word_breaks(&mut span, Some(Script::Hebrew));

    assert_eq!(removed, 0);
    assert!(span.text.contains("\u{200C}"));
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_thai_preserves_joiners() {
    // With a Thai hint, a trailing ZWJ survives normalization.
    let mut span = Span::empty();
    span.text = "\u{0E2A}\u{0E27}\u{0E31}\u{0E12}\u{200D}".to_owned(); // Thai letters + ZWJ

    let removed = normalize_word_breaks(&mut span, Some(Script::Thai));

    assert_eq!(removed, 0);
    assert!(span.text.contains("\u{200D}"));
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_bengali_preserves_joiners() {
    // With a Bengali hint, a trailing ZWNJ survives normalization.
    let mut span = Span::empty();
    span.text = "\u{0985}\u{09BE}\u{200C}".to_owned(); // Bengali letters + ZWNJ

    let removed = normalize_word_breaks(&mut span, Some(Script::Bengali));

    assert_eq!(removed, 0);
    assert!(span.text.contains("\u{200C}"));
}
|
||
|
||
#[test]
fn test_normalize_word_breaks_indic_preserves_joiners() {
    // With the generic Indic hint (Tamil sample), a trailing ZWJ survives.
    let mut span = Span::empty();
    span.text = "\u{0B85}\u{0BBE}\u{200D}".to_owned(); // Tamil letters + ZWJ

    let removed = normalize_word_breaks(&mut span, Some(Script::Indic));

    assert_eq!(removed, 0);
    assert!(span.text.contains("\u{200D}"));
}
|
||
|
||
#[test]
fn test_script_preserves_joiners_arabic() {
    // Arabic is a joiner-preserving script.
    let script = Script::Arabic;
    assert!(script.preserves_joiners());
}
|
||
|
||
#[test]
fn test_script_preserves_joiners_latin() {
    // Latin is not a joiner-preserving script.
    let script = Script::Latin;
    assert!(!script.preserves_joiners());
}
|
||
|
||
#[test]
fn test_script_preserves_joiners_all_complex_scripts() {
    // Every complex script listed here must report that it keeps joiners.
    let complex_scripts = [
        Script::Arabic,
        Script::Hebrew,
        Script::Devanagari,
        Script::Bengali,
        Script::Indic,
        Script::Thai,
        Script::Lao,
        Script::Tibetan,
        Script::Myanmar,
        Script::Khmer,
        Script::Sinhala,
    ];

    for script in complex_scripts {
        assert!(script.preserves_joiners());
    }
}
|
||
|
||
#[test]
fn test_script_preserves_joiners_simple_scripts() {
    // Latin and Unknown must both report that joiners are not kept.
    for script in [Script::Latin, Script::Unknown] {
        assert!(!script.preserves_joiners());
    }
}
|
||
|
||
// ===== Ligature repair tests =====
|
||
|
||
#[test]
fn test_ligature_repair_fi_adjacent() {
    use crate::graphics_state::Color;

    // All glyphs share font, size and colour; only the first four arguments vary.
    let glyph = |ch, source, confidence, bbox| {
        Glyph::new(
            ch,
            source,
            confidence,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };

    // AC: U+FFFD sits 0.05pt to the right of 'f' (below the 0.1pt threshold)
    // and is expected to be repaired to "fi".
    // NOTE(review): the character after U+FFFD here is 'e'; the pattern 1-2
    // branch of `repair_split_ligatures` only picks a ligature when that
    // character is 'i', 'l' or 'f' - confirm another branch covers this input.
    let mut span = Span::empty();
    span.text = "f\u{FFFD}ect".to_owned();

    let glyphs = vec![
        glyph('f', UnicodeSource::ToUnicode, 1.0, [0.0, 0.0, 5.0, 10.0]),
        glyph('\u{FFFD}', UnicodeSource::Unknown, 0.0, [5.05, 0.0, 10.0, 10.0]),
        glyph('e', UnicodeSource::ToUnicode, 1.0, [10.0, 0.0, 15.0, 10.0]),
    ];

    let repaired = repair_split_ligatures(&mut span, &glyphs);

    assert!(repaired, "Should repair f + U+FFFD to 'fi'");
    assert_eq!(span.text, "fiect", "Should replace f + U+FFFD with 'fi'");
    assert_eq!(span.confidence_source, crate::confidence::ConfidenceSource::Heuristic);
}
|
||
|
||
#[test]
fn test_ligature_repair_no_adjacent_ligature() {
    use crate::graphics_state::Color;

    // All glyphs share font, size and colour; only the first four arguments vary.
    let glyph = |ch, source, confidence, bbox| {
        Glyph::new(
            ch,
            source,
            confidence,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };

    // AC: a U+FFFD with no f/l/i next to it must be left alone.
    let mut span = Span::empty();
    span.text = "abc\u{FFFD}def".to_owned();

    let glyphs = vec![
        glyph('a', UnicodeSource::ToUnicode, 1.0, [0.0, 0.0, 5.0, 10.0]),
        glyph('b', UnicodeSource::ToUnicode, 1.0, [5.0, 0.0, 10.0, 10.0]),
        glyph('c', UnicodeSource::ToUnicode, 1.0, [10.0, 0.0, 15.0, 10.0]),
        glyph('\u{FFFD}', UnicodeSource::Unknown, 0.0, [15.0, 0.0, 20.0, 10.0]),
        glyph('d', UnicodeSource::ToUnicode, 1.0, [20.0, 0.0, 25.0, 10.0]),
    ];

    let repaired = repair_split_ligatures(&mut span, &glyphs);

    assert!(!repaired, "Should not repair when U+FFFD is not adjacent to f/l/i");
    assert_eq!(span.text, "abc\u{FFFD}def", "Text should remain unchanged");
}
|
||
|
||
#[test]
fn test_ligature_repair_gap_too_large() {
    use crate::graphics_state::Color;

    // All glyphs share font, size and colour; only the first four arguments vary.
    let glyph = |ch, source, confidence, bbox| {
        Glyph::new(
            ch,
            source,
            confidence,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };

    // U+FFFD follows 'f', but the 0.2pt gap between them is above the 0.1pt
    // threshold, so no repair is expected.
    let mut span = Span::empty();
    span.text = "f\u{FFFD}ect".to_owned();

    let glyphs = vec![
        glyph('f', UnicodeSource::ToUnicode, 1.0, [0.0, 0.0, 5.0, 10.0]),
        glyph('\u{FFFD}', UnicodeSource::Unknown, 0.0, [5.2, 0.0, 10.0, 10.0]),
        glyph('e', UnicodeSource::ToUnicode, 1.0, [10.0, 0.0, 15.0, 10.0]),
    ];

    let repaired = repair_split_ligatures(&mut span, &glyphs);

    assert!(!repaired, "Should not repair when gap exceeds threshold");
    assert_eq!(span.text, "f\u{FFFD}ect", "Text should remain unchanged");
}
|
||
|
||
#[test]
fn test_ligature_repair_fl_ligature() {
    use crate::graphics_state::Color;

    // All glyphs share font, size and colour; only the first four arguments vary.
    let glyph = |ch, source, confidence, bbox| {
        Glyph::new(
            ch,
            source,
            confidence,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };

    // Negative case despite the name: U+FFFD is tightly adjacent to 'f', but
    // the following character is 'y' rather than 'l', so nothing is repaired.
    let mut span = Span::empty();
    span.text = "f\u{FFFD}y".to_owned();

    let glyphs = vec![
        glyph('f', UnicodeSource::ToUnicode, 1.0, [0.0, 0.0, 5.0, 10.0]),
        glyph('\u{FFFD}', UnicodeSource::Unknown, 0.0, [5.05, 0.0, 10.0, 10.0]),
        glyph('y', UnicodeSource::ToUnicode, 1.0, [10.0, 0.0, 15.0, 10.0]),
    ];

    let repaired = repair_split_ligatures(&mut span, &glyphs);

    assert!(!repaired, "Should not repair without 'l' following");
}
|
||
|
||
#[test]
fn test_ligature_repair_fl_with_l_following() {
    // Positive fl case: 'f', U+FFFD and 'l' in sequence collapse to "fl".

    // Every glyph in this fixture shares font, size and colour; only the
    // character, its Unicode source, its confidence and its box vary.
    let glyph_at = |ch, source, confidence, bbox| {
        Glyph::new(
            ch,
            source,
            confidence,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            crate::graphics_state::Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };
    let glyphs = vec![
        glyph_at('f', UnicodeSource::ToUnicode, 1.0, [0.0, 0.0, 5.0, 10.0]),
        glyph_at('\u{FFFD}', UnicodeSource::Unknown, 0.0, [5.05, 0.0, 10.0, 10.0]),
        glyph_at('l', UnicodeSource::ToUnicode, 1.0, [10.0, 0.0, 15.0, 10.0]),
    ];

    let mut span = Span::empty();
    span.text = "f\u{FFFD}l".to_owned();

    let repaired = repair_split_ligatures(&mut span, &glyphs);

    assert!(repaired, "Should repair f + U+FFFD + l to 'fl'");
    assert_eq!(span.text, "fl", "Should replace f + U+FFFD + l with 'fl'");
}
|
||
|
||
#[test]
fn test_ligature_repair_multiple_fffd() {
    // Two U+FFFD occurrences in one span are judged separately: the first is
    // followed by 'r' and stays, the second is followed by 'l' and becomes
    // part of "fl".
    let glyph_at = |ch, source, confidence, bbox| {
        Glyph::new(
            ch,
            source,
            confidence,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            crate::graphics_state::Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };
    let glyphs = vec![
        // First candidate: f, U+FFFD, r.
        glyph_at('f', UnicodeSource::ToUnicode, 1.0, [0.0, 0.0, 5.0, 10.0]),
        glyph_at('\u{FFFD}', UnicodeSource::Unknown, 0.0, [5.05, 0.0, 10.0, 10.0]),
        glyph_at('r', UnicodeSource::ToUnicode, 1.0, [10.0, 0.0, 15.0, 10.0]),
        // Second candidate: f, U+FFFD, l.
        glyph_at('f', UnicodeSource::ToUnicode, 1.0, [40.0, 0.0, 45.0, 10.0]),
        glyph_at('\u{FFFD}', UnicodeSource::Unknown, 0.0, [45.05, 0.0, 50.0, 10.0]),
        glyph_at('l', UnicodeSource::ToUnicode, 1.0, [50.0, 0.0, 55.0, 10.0]),
    ];

    let mut span = Span::empty();
    span.text = "f\u{FFFD}rst and f\u{FFFD}l".to_owned();

    let repaired = repair_split_ligatures(&mut span, &glyphs);

    // One successful repair is enough for the function to report `true`;
    // the unrepaired U+FFFD must survive verbatim.
    assert!(repaired, "Should repair at least one ligature");
    assert_eq!(span.text, "f\u{FFFD}rst and fl", "Second ligature repaired");
}
|
||
|
||
#[test]
fn test_ligature_repair_empty_span() {
    // With no text and no glyphs there is nothing to repair.
    let no_glyphs: Vec<Glyph> = Vec::new();
    let mut span = Span::empty();
    span.text = String::new();

    assert!(!repair_split_ligatures(&mut span, &no_glyphs));
    assert_eq!(span.text, "");
}
|
||
|
||
#[test]
fn test_ligature_repair_no_fffd() {
    // Text that contains no U+FFFD must come back untouched with `false`.
    let only_glyph = Glyph::new(
        'n',
        UnicodeSource::ToUnicode,
        1.0,
        [0.0, 0.0, 5.0, 10.0],
        Arc::from("Helvetica"),
        12.0,
        0,
        crate::graphics_state::Color::DeviceGray(0.0),
        false,
        None,
        false,
    );
    let glyphs = vec![only_glyph];

    let mut span = Span::empty();
    span.text = "normal text".to_owned();

    let changed = repair_split_ligatures(&mut span, &glyphs);

    assert!(!changed);
    assert_eq!(span.text, "normal text");
}
|
||
|
||
#[test]
fn test_ligature_enum_decomposed() {
    // Each ligature variant paired with the plain-letter string that
    // `decomposed()` is expected to return for it.
    let expectations = [
        (Ligature::Fi, "fi"),
        (Ligature::Fl, "fl"),
        (Ligature::Ffi, "ffi"),
        (Ligature::Ffl, "ffl"),
        (Ligature::Ff, "ff"),
    ];

    for (ligature, letters) in expectations {
        assert_eq!(ligature.decomposed(), letters);
    }
}
|
||
|
||
#[test]
fn test_ligature_is_component() {
    // 'f', 'l' and 'i' are accepted as ligature components.
    for letter in ['f', 'l', 'i'] {
        assert!(Ligature::is_component(letter));
    }

    // Other letters, and the replacement character itself, are rejected.
    for other in ['a', 'x', '\u{FFFD}'] {
        assert!(!Ligature::is_component(other));
    }
}
|
||
|
||
#[test]
fn test_ligature_repair_ffi_ligature() {
    // ffi case: "ff", then U+FFFD, then 'i' collapses to "ffi".
    let glyph_at = |ch, source, confidence, bbox| {
        Glyph::new(
            ch,
            source,
            confidence,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            crate::graphics_state::Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };
    let glyphs = vec![
        glyph_at('f', UnicodeSource::ToUnicode, 1.0, [0.0, 0.0, 5.0, 10.0]),
        glyph_at('f', UnicodeSource::ToUnicode, 1.0, [5.0, 0.0, 10.0, 10.0]),
        glyph_at('\u{FFFD}', UnicodeSource::Unknown, 0.0, [10.05, 0.0, 15.0, 10.0]),
        glyph_at('i', UnicodeSource::ToUnicode, 1.0, [15.0, 0.0, 20.0, 10.0]),
    ];

    let mut span = Span::empty();
    span.text = "ff\u{FFFD}i".to_owned();

    let repaired = repair_split_ligatures(&mut span, &glyphs);

    assert!(repaired, "Should repair ff + U+FFFD + i to 'ffi'");
    assert_eq!(span.text, "ffi", "Should replace ff + U+FFFD + i with 'ffi'");
}
|
||
|
||
#[test]
fn test_ligature_repair_ffl_ligature() {
    // ffl case: "ff", then U+FFFD, then 'l' collapses to "ffl".
    let glyph_at = |ch, source, confidence, bbox| {
        Glyph::new(
            ch,
            source,
            confidence,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            crate::graphics_state::Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };
    let glyphs = vec![
        glyph_at('f', UnicodeSource::ToUnicode, 1.0, [0.0, 0.0, 5.0, 10.0]),
        glyph_at('f', UnicodeSource::ToUnicode, 1.0, [5.0, 0.0, 10.0, 10.0]),
        glyph_at('\u{FFFD}', UnicodeSource::Unknown, 0.0, [10.05, 0.0, 15.0, 10.0]),
        glyph_at('l', UnicodeSource::ToUnicode, 1.0, [15.0, 0.0, 20.0, 10.0]),
    ];

    let mut span = Span::empty();
    span.text = "ff\u{FFFD}l".to_owned();

    let repaired = repair_split_ligatures(&mut span, &glyphs);

    assert!(repaired, "Should repair ff + U+FFFD + l to 'ffl'");
    assert_eq!(span.text, "ffl", "Should replace ff + U+FFFD + l with 'ffl'");
}
|
||
|
||
#[test]
fn test_ligature_repair_ff_ligature() {
    // ff case: 'f', U+FFFD, 'f' collapses to "ff"; the trailing 't' is kept.
    let glyph_at = |ch, source, confidence, bbox| {
        Glyph::new(
            ch,
            source,
            confidence,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            crate::graphics_state::Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };
    let glyphs = vec![
        glyph_at('f', UnicodeSource::ToUnicode, 1.0, [0.0, 0.0, 5.0, 10.0]),
        glyph_at('\u{FFFD}', UnicodeSource::Unknown, 0.0, [5.05, 0.0, 10.0, 10.0]),
        glyph_at('f', UnicodeSource::ToUnicode, 1.0, [10.0, 0.0, 15.0, 10.0]),
        glyph_at('t', UnicodeSource::ToUnicode, 1.0, [15.0, 0.0, 20.0, 10.0]),
    ];

    let mut span = Span::empty();
    span.text = "f\u{FFFD}ft".to_owned();

    let repaired = repair_split_ligatures(&mut span, &glyphs);

    assert!(repaired, "Should repair f + U+FFFD + f to 'ff'");
    assert_eq!(span.text, "fft", "Should replace f + U+FFFD + f with 'ff'");
}
|
||
}
|