/// Unicode script category for word-break normalization.
///
/// Simplified script detection based on Unicode codepoint ranges.
/// Used to determine whether zero-width joiner/non-joiner characters
/// should be preserved (they're orthographic in complex scripts) or
/// stripped (they're noise in Latin text).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Script {
    /// Arabic script - requires ZWNJ/ZWJ for correct rendering
    Arabic,
    /// Hebrew script - may use ZWNJ/ZWJ
    Hebrew,
    /// Devanagari (Hindi, Marathi, Nepali, Sanskrit) - requires ZWNJ/ZWJ for conjuncts
    Devanagari,
    /// Bengali script - requires ZWNJ/ZWJ for conjuncts
    Bengali,
    /// Other Indic scripts (Gurmukhi, Gujarati, Tamil, Telugu, Kannada, Malayalam, Odia)
    Indic,
    /// Thai script - may use ZWNJ/ZWJ
    Thai,
    /// Lao script
    Lao,
    /// Tibetan script
    Tibetan,
    /// Myanmar (Burmese) script
    Myanmar,
    /// Khmer script
    Khmer,
    /// Sinhala script
    Sinhala,
    /// Latin and other simple scripts - ZWNJ/ZWJ are noise
    Latin,
    /// Unknown script - default to strip (safe default)
    Unknown,
}

impl Script {
    /// Complex scripts in the order [`detect_script`] tests them against its threshold.
    ///
    /// `Latin` and `Unknown` are absent on purpose: they are fallbacks, never counted.
    const DETECTION_PRIORITY: [Self; 11] = [
        Self::Arabic,
        Self::Hebrew,
        Self::Devanagari,
        Self::Bengali,
        Self::Indic,
        Self::Thai,
        Self::Lao,
        Self::Tibetan,
        Self::Myanmar,
        Self::Khmer,
        Self::Sinhala,
    ];

    /// Returns true if this script uses ZWNJ/ZWJ for legitimate orthographic purposes.
    ///
    /// Complex scripts like Arabic, Indic, and Southeast Asian scripts use
    /// zero-width joiner/non-joiner characters to control ligature formation
    /// and conjunct rendering. Stripping these breaks the text.
    pub fn preserves_joiners(self) -> bool {
        // Exhaustive on purpose: adding a variant must force an explicit decision here
        // instead of silently falling into a "strip" default.
        match self {
            Self::Arabic
            | Self::Hebrew
            | Self::Devanagari
            | Self::Bengali
            | Self::Indic
            | Self::Thai
            | Self::Lao
            | Self::Tibetan
            | Self::Myanmar
            | Self::Khmer
            | Self::Sinhala => true,
            Self::Latin | Self::Unknown => false,
        }
    }

    /// Classifies a single character into one of the complex scripts.
    ///
    /// Returns `None` for anything outside the ranges below (ASCII, Latin,
    /// punctuation, zero-width formatting characters, unsupported scripts).
    fn of_char(c: char) -> Option<Self> {
        let script = match u32::from(c) {
            // Arabic, Arabic Supplement, Arabic Extended-A, plus Presentation Forms-A/B.
            // The presentation forms stop at U+FEFC so that U+FEFF (BOM) is never counted.
            0x0600..=0x06FF
            | 0x0750..=0x077F
            | 0x08A0..=0x08FF
            | 0xFB50..=0xFDFF
            | 0xFE70..=0xFEFC => Self::Arabic,
            // Hebrew block plus the Hebrew part of Alphabetic Presentation Forms.
            0x0590..=0x05FF | 0xFB1D..=0xFB4F => Self::Hebrew,
            0x0900..=0x097F => Self::Devanagari,
            0x0980..=0x09FF => Self::Bengali,
            // Contiguous run of blocks: Gurmukhi, Gujarati, Odia, Tamil, Telugu,
            // Kannada, Malayalam (U+0A00..U+0D7F).
            0x0A00..=0x0D7F => Self::Indic,
            0x0D80..=0x0DFF => Self::Sinhala,
            0x0E00..=0x0E7F => Self::Thai,
            0x0E80..=0x0EFF => Self::Lao,
            0x0F00..=0x0FFF => Self::Tibetan,
            0x1000..=0x109F => Self::Myanmar,
            0x1780..=0x17FF => Self::Khmer,
            _ => return None,
        };
        Some(script)
    }
}

/// Detect the script of `text` using priority-ordered threshold matching.
///
/// Every character is classified by Unicode codepoint range and counted per
/// script. Returns `Script::Unknown` for empty text and `Script::Latin` when
/// no complex script reaches the threshold (ASCII/Latin or undetected text).
///
/// Arabic and Hebrew presentation forms (U+FB50..U+FDFF, U+FE70..U+FEFC,
/// U+FB1D..U+FB4F) are counted as Arabic/Hebrew; U+FEFF (BOM) is not.
///
/// # Detection Priority
///
/// Scripts are checked in priority order: Arabic, Hebrew, Devanagari, Bengali,
/// other Indic, Thai, Lao, Tibetan, Myanmar, Khmer, Sinhala. The first script
/// with >=3 matching characters is returned, even if a lower-priority script
/// has more matches; this is a priority rule, not a majority vote.
///
/// # Examples
///
/// ```
/// use pdftract_core::layout::correction::{detect_script, Script};
///
/// assert_eq!(detect_script("Hello world"), Script::Latin);
/// assert_eq!(detect_script("مرحبا"), Script::Arabic);
/// assert_eq!(detect_script("नमस्ते"), Script::Devanagari);
/// assert_eq!(detect_script(""), Script::Unknown);
/// ```
pub fn detect_script(text: &str) -> Script {
    /// Minimum number of characters a script needs before it is reported.
    const THRESHOLD: usize = 3;

    if text.is_empty() {
        return Script::Unknown;
    }

    // tallies[i] counts the characters belonging to DETECTION_PRIORITY[i].
    let mut tallies = [0usize; Script::DETECTION_PRIORITY.len()];
    for script in text.chars().filter_map(Script::of_char) {
        if let Some(slot) = Script::DETECTION_PRIORITY
            .iter()
            .position(|&candidate| candidate == script)
        {
            tallies[slot] += 1;
        }
    }

    // First script (in priority order) that reaches the threshold wins;
    // otherwise default to Latin for ASCII or undetected scripts.
    Script::DETECTION_PRIORITY
        .iter()
        .zip(tallies.iter())
        .find(|&(_, &count)| count >= THRESHOLD)
        .map_or(Script::Latin, |(&script, _)| script)
}
+/// - **INV**: When script_hint is None, script is detected from the span's own text. +/// - **INV**: For unknown-script text, default to strip (safer for Latin output). +/// +/// # Performance +/// +/// O(n) where n is the length of the span text. Uses `String::retain` with +/// a closure that checks the script hint once. +/// +/// # Examples +/// +/// ``` +/// use pdftract_core::layout::correction::{normalize_word_breaks, Script}; +/// use pdftract_core::span::Span; +/// use std::sync::Arc; +/// +/// // Latin text: all zero-width chars stripped +/// let mut span = Span::empty(); +/// span.text = String::from("auto\u{200B}mation"); +/// let count = normalize_word_breaks(&mut span, Some(Script::Latin)); +/// assert_eq!(count, 1); +/// assert_eq!(span.text, "automation"); +/// +/// // Arabic text: ZWNJ/ZWJ preserved +/// let mut span = Span::empty(); +/// span.text = String::from("\u{0627}\u{200C}\u{200D}"); // alef + ZWNJ + ZWJ +/// let count = normalize_word_breaks(&mut span, Some(Script::Arabic)); +/// assert_eq!(count, 0); +/// assert_eq!(span.text, "\u{0627}\u{200C}\u{200D}"); +/// +/// // Unknown script: all stripped (safe default) +/// let mut span = Span::empty(); +/// span.text = String::from("test\u{200C}\u{200D}"); +/// let count = normalize_word_breaks(&mut span, None); +/// assert_eq!(count, 2); +/// assert_eq!(span.text, "test"); +/// ``` +pub fn normalize_word_breaks(span: &mut Span, script_hint: Option