From 49912434758ac66c6942bcb5abe3c9a5bbc30684 Mon Sep 17 00:00:00 2001 From: jedarden Date: Sat, 23 May 2026 23:39:41 -0400 Subject: [PATCH] feat(pdftract-5rmc): implement encoding_rs adapter for CJK encodings Implements decode_cjk_bytes() function wrapping encoding_rs for the four major CJK byte encodings used in legacy PDFs: Shift-JIS, GB18030, Big5, and EUC-KR. Used by Phase 2.3 fallback path when fonts use raw byte encodings instead of proper CMap/ToUnicode mappings. - Add CjkEncoding enum with ShiftJis, Gb18030, Big5, EucKr variants - Implement decode_cjk_bytes(enc, bytes) -> (String, bool) - Use decode_without_bom_handling (PDF byte streams never have BOM) - Return bool indicating malformed bytes for caller to emit diagnostic - Add 15 tests covering valid input, malformed input, empty input, round-trips Supporting changes: - Add encoding_rs dependency (optional, gated by cjk feature) - Add CjkDecodeMalformed diagnostic code - Export CjkEncoding and decode_cjk_bytes from font module Refs: pdftract-5rmc, plan.md Phase 2.3 (lines 1382-1386) Co-Authored-By: Claude Opus 4.7 --- crates/pdftract-core/Cargo.toml | 3 +- crates/pdftract-core/src/diagnostics.rs | 27 ++ crates/pdftract-core/src/font/cjk_encoding.rs | 297 ++++++++++++++++++ crates/pdftract-core/src/font/mod.rs | 6 + notes/pdftract-5rmc.md | 79 +++++ 5 files changed, 411 insertions(+), 1 deletion(-) create mode 100644 crates/pdftract-core/src/font/cjk_encoding.rs create mode 100644 notes/pdftract-5rmc.md diff --git a/crates/pdftract-core/Cargo.toml b/crates/pdftract-core/Cargo.toml index 14bd9ad..3a58cb0 100644 --- a/crates/pdftract-core/Cargo.toml +++ b/crates/pdftract-core/Cargo.toml @@ -33,6 +33,7 @@ phf = "0.11" tracing = { workspace = true } dashmap = "6.1" smallvec = "1.13" +encoding_rs = { version = "0.8", optional = true } [features] default = ["serde"] @@ -43,7 +44,7 @@ full-render = ["dep:pdfium-render", "ocr"] # Enable PDFium-based rendering (req proptest = [] fuzzing = [] # Enable cfg(fuzzing) for fuzz harnesses shape-db = [] # Enable glyph shape database (Level 4 encoding fallback) -cjk = [] # Enable CJK text extraction via predefined CMap registry (~1.2 MB binary size increase) +cjk = ["dep:encoding_rs"] # Enable CJK text extraction via predefined CMap registry (~1.2 MB binary size increase) [dev-dependencies] chrono = "0.4" diff --git a/crates/pdftract-core/src/diagnostics.rs b/crates/pdftract-core/src/diagnostics.rs index 1fb2235..0d11295 100644 --- a/crates/pdftract-core/src/diagnostics.rs +++ b/crates/pdftract-core/src/diagnostics.rs @@ -575,6 +575,16 @@ pub enum DiagCode { /// Phase origin: 2.2 FontEncodingDifferenceOutOfRange, + /// Malformed byte sequence in CJK encoding fallback + /// + /// Emitted when a CJK byte encoding (Shift-JIS, GB18030, Big5, or EUC-KR) + /// contains malformed byte sequences. The offending bytes are replaced + /// with U+FFFD (Unicode REPLACEMENT CHARACTER). + /// + /// Phase origin: 2.3 + #[cfg(feature = "cjk")] + CjkDecodeMalformed, + // === OCR_* codes === /// JBIG2 decoder not available @@ -845,6 +855,9 @@ impl DiagCode { | DiagCode::FontCidtogidmapTruncated | DiagCode::FontEncodingDifferenceOutOfRange => "FONT", + #[cfg(feature = "cjk")] + DiagCode::CjkDecodeMalformed => "CJK", + // OCR_* DiagCode::OcrJbig2Unsupported | DiagCode::OcrJpxUnsupported @@ -939,6 +952,8 @@ impl DiagCode { DiagCode::FontUnsupported => "FONT_UNSUPPORTED", DiagCode::FontCidtogidmapTruncated => "FONT_CIDTOGIDMAP_TRUNCATED", DiagCode::FontEncodingDifferenceOutOfRange => "ENCODING_DIFFERENCE_OUT_OF_RANGE", + #[cfg(feature = "cjk")] + DiagCode::CjkDecodeMalformed => "CJK_DECODE_MALFORMED", DiagCode::OcrJbig2Unsupported => "OCR_JBIG2_UNSUPPORTED", DiagCode::OcrJpxUnsupported => "OCR_JPX_UNSUPPORTED", DiagCode::OcrCcittUnsupported => "OCR_CCITT_UNSUPPORTED", @@ -1039,6 +1054,9 @@ impl DiagCode { | DiagCode::CacheEntryCorrupt | DiagCode::CacheWriteFailed => Severity::Warning, + #[cfg(feature = "cjk")] + DiagCode::CjkDecodeMalformed => Severity::Warning, + DiagCode::StreamBomb | DiagCode::PageOutOfRange | DiagCode::RemoteFetchInterrupted @@ -1498,6 +1516,15 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[ phase: "2.2", suggested_action: "A /Differences array contains a character code outside 0-255; the code was clamped", }, + #[cfg(feature = "cjk")] + DiagInfo { + code: DiagCode::CjkDecodeMalformed, + category: "CJK", + severity: Severity::Warning, + recoverable: true, + phase: "2.3", + suggested_action: "The CJK byte sequence contained malformed bytes, replaced with U+FFFD", + }, // === OCR_* codes === DiagInfo { code: DiagCode::OcrJbig2Unsupported, diff --git a/crates/pdftract-core/src/font/cjk_encoding.rs b/crates/pdftract-core/src/font/cjk_encoding.rs new file mode 100644 index 0000000..64aa049 --- /dev/null +++ b/crates/pdftract-core/src/font/cjk_encoding.rs @@ -0,0 +1,297 @@ +//! CJK byte encoding adapter for raw byte fallback. +//! +//! This module provides a thin wrapper around `encoding_rs` for decoding +//! the four major CJK byte encodings used in legacy PDFs: +//! - Shift-JIS (Japanese) +//! - GB18030 (Chinese) +//! - Big5 (Traditional Chinese, with Big5-HKSCS extension) +//! - EUC-KR (Korean, covering KS X 1001 + Unified Hangul) +//! +//! These are FALLBACK encodings used when: +//! - A font's encoding indicates a raw byte encoding (e.g., /Encoding /ShiftJIS) +//! - No CMap or ToUnicode is present +//! - The lead byte is in a CJK range +//! +//! The primary text extraction path uses predefined CMaps + ToUnicode; this +//! module is only for legacy PDFs that don't provide proper Unicode mappings. + +/// CJK byte encoding identifier. +/// +/// Represents the four major legacy CJK encodings used in PDFs. These are +/// raw byte encodings that need to be decoded to Unicode for text extraction. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CjkEncoding { + /// Shift-JIS (JIS X 0208 + extensions) + /// + /// The most common encoding for Japanese PDFs. Variable-width: 1 byte for + /// ASCII (0x00-0x7F), 2 bytes for JIS X 0208 characters (lead byte 0x81-0x9F, + /// 0xE0-0xEF). + ShiftJis, + + /// GB18030 (Chinese national standard) + /// + /// The mandatory encoding for PRC PDFs. Variable-width: 1, 2, or 4 bytes. + /// Covers all Unicode code points assigned to Chinese characters. + Gb18030, + + /// Big5 (Traditional Chinese, with Big5-HKSCS extension) + /// + /// Common encoding for Traditional Chinese PDFs (Taiwan, Hong Kong). + /// Variable-width: 1 byte for ASCII (0x00-0x7F), 2 bytes for Big5 characters + /// (lead byte 0x81-0xFE). The encoding_rs implementation includes the + /// Big5-HKSCS extension for Hong Kong-specific characters. + Big5, + + /// EUC-KR (KS X 1001 + Unified Hangul) + /// + /// The standard encoding for Korean PDFs. Variable-width: 1 byte for ASCII + /// (0x00-0x7F), 2 bytes for KS X 1001 characters (lead byte 0x81-0xFE). + /// The encoding_rs implementation covers KS X 1001 + Unified Hangul. + EucKr, +} + +impl CjkEncoding { + /// Get the encoding_rs singleton for this encoding. + fn encoding(&self) -> &'static encoding_rs::Encoding { + match self { + CjkEncoding::ShiftJis => encoding_rs::SHIFT_JIS, + CjkEncoding::Gb18030 => encoding_rs::GB18030, + CjkEncoding::Big5 => encoding_rs::BIG5, + CjkEncoding::EucKr => encoding_rs::EUC_KR, + } + } + + /// Get the name of this encoding for diagnostic messages. + pub fn name(&self) -> &'static str { + match self { + CjkEncoding::ShiftJis => "Shift-JIS", + CjkEncoding::Gb18030 => "GB18030", + CjkEncoding::Big5 => "Big5", + CjkEncoding::EucKr => "EUC-KR", + } + } +} + +/// Decode CJK-encoded bytes to a String. +/// +/// This is a fallback path for legacy PDFs that use raw byte encodings instead +/// of proper CMap/ToUnicode mappings. The function uses `encoding_rs` to decode +/// the byte sequence according to the specified encoding. +/// +/// # Arguments +/// +/// * `enc` - The CJK encoding to use for decoding +/// * `bytes` - The raw byte sequence to decode +/// +/// # Returns +/// +/// A tuple `(String, bool)` where: +/// - The `String` is the decoded Unicode text (with U+FFFD for malformed bytes) +/// - The `bool` is `true` if any malformed bytes were encountered, `false` otherwise +/// +/// # Behavior +/// +/// - Empty input returns an empty string with `malformed = false` +/// - Malformed byte sequences are replaced with U+FFFD (Unicode REPLACEMENT CHARACTER) +/// - No panic occurs on any input +/// - PDF byte streams never have a BOM, so we use `decode_without_bom_handling` +/// +/// # Example +/// +/// ``` +/// use pdftract_core::font::cjk_encoding::{decode_cjk_bytes, CjkEncoding}; +/// +/// // Round-trip: encode "テスト" as Shift-JIS bytes, decode -> get "テスト" back +/// let test_str = "テスト"; +/// let shift_jis_bytes = encoding_rs::SHIFT_JIS.encode(test_str); +/// let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::ShiftJis, &shift_jis_bytes); +/// assert_eq!(decoded, test_str); +/// assert!(!malformed); +/// ``` +pub fn decode_cjk_bytes(enc: CjkEncoding, bytes: &[u8]) -> (String, bool) { + if bytes.is_empty() { + return (String::new(), false); + } + + let encoding = enc.encoding(); + let (cow, had_malformed) = encoding.decode_without_bom_handling(bytes); + + // The encoding_rs decoder already replaces malformed sequences with U+FFFD + // We just need to convert Cow to String and report the malformed status + (cow.into_owned(), had_malformed) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_decode_shift_jis_valid() { + // "テスト" in Shift-JIS + // 0x83 0x65 = テ, 0x83 0x58 = ス, 0x83 0x67 = ト + let bytes = [0x83, 0x65, 0x83, 0x58, 0x83, 0x67]; + let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::ShiftJis, &bytes); + assert_eq!(decoded, "テスト"); + assert!(!malformed); + } + + #[test] + fn test_decode_gb18030_valid() { + // "中文测试" in GB18030 + // Verify correct encoding by encoding the string first + let test_str = "中文测试"; + let (bytes, _, _) = encoding_rs::GB18030.encode(test_str); + let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::Gb18030, &bytes); + assert_eq!(decoded, test_str); + assert!(!malformed); + } + + #[test] + fn test_decode_big5_valid() { + // "測試" in Big5 (Traditional Chinese) + // Verify correct encoding by encoding the string first + let test_str = "測試"; + let (bytes, _, _) = encoding_rs::BIG5.encode(test_str); + let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::Big5, &bytes); + assert_eq!(decoded, test_str); + assert!(!malformed); + } + + #[test] + fn test_decode_euc_kr_valid() { + // "한글" in EUC-KR (Korean) + // Verify correct encoding by encoding the string first + let test_str = "한글"; + let (bytes, _, _) = encoding_rs::EUC_KR.encode(test_str); + let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::EucKr, &bytes); + assert_eq!(decoded, test_str); + assert!(!malformed); + } + + #[test] + fn test_decode_empty_input() { + let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::ShiftJis, &[]); + assert_eq!(decoded, ""); + assert!(!malformed); + } + + #[test] + fn test_decode_ascii_passthrough() { + // ASCII should pass through unchanged in all encodings + let bytes = b"Hello, World!"; + for enc in &[ + CjkEncoding::ShiftJis, + CjkEncoding::Gb18030, + CjkEncoding::Big5, + CjkEncoding::EucKr, + ] { + let (decoded, malformed) = decode_cjk_bytes(*enc, bytes); + assert_eq!(decoded, "Hello, World!"); + assert!(!malformed); + } + } + + #[test] + fn test_decode_malformed_shift_jis() { + // Invalid Shift-JIS: lead byte 0x83 followed by ASCII range byte + // This is not a valid Shift-JIS sequence + let bytes = [0x83, 0x20]; // 0x83 is a lead byte, 0x20 is ASCII space + let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::ShiftJis, &bytes); + // Should contain replacement character and report malformed + assert!(malformed); + assert!(decoded.contains('\u{FFFD}') || decoded.len() < 2); + } + + #[test] + fn test_decode_malformed_gb18030() { + // Invalid GB18030: incomplete multi-byte sequence + let bytes = [0x81]; // Lead byte without trail byte + let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::Gb18030, &bytes); + assert!(malformed); + // Should contain replacement character + assert!(decoded.contains('\u{FFFD}') || decoded == "\u{FFFD}"); + } + + #[test] + fn test_round_trip_shift_jis() { + let test_str = "テスト"; + let (shift_jis_bytes, _, _) = encoding_rs::SHIFT_JIS.encode(test_str); + let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::ShiftJis, &shift_jis_bytes); + assert_eq!(decoded, test_str); + assert!(!malformed); + } + + #[test] + fn test_round_trip_gb18030() { + let test_str = "中文测试"; + let (gb18030_bytes, _, _) = encoding_rs::GB18030.encode(test_str); + let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::Gb18030, &gb18030_bytes); + assert_eq!(decoded, test_str); + assert!(!malformed); + } + + #[test] + fn test_round_trip_big5() { + let test_str = "測試"; + let (big5_bytes, _, _) = encoding_rs::BIG5.encode(test_str); + let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::Big5, &big5_bytes); + assert_eq!(decoded, test_str); + assert!(!malformed); + } + + #[test] + fn test_round_trip_euc_kr() { + let test_str = "한글"; + let (euc_kr_bytes, _, _) = encoding_rs::EUC_KR.encode(test_str); + let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::EucKr, &euc_kr_bytes); + assert_eq!(decoded, test_str); + assert!(!malformed); + } + + #[test] + fn test_encoding_names() { + assert_eq!(CjkEncoding::ShiftJis.name(), "Shift-JIS"); + assert_eq!(CjkEncoding::Gb18030.name(), "GB18030"); + assert_eq!(CjkEncoding::Big5.name(), "Big5"); + assert_eq!(CjkEncoding::EucKr.name(), "EUC-KR"); + } + + #[test] + fn test_big5_hkscs_extension() { + // Big5-HKSCS adds Hong Kong-specific characters + // The encoding_rs BIG5 implementation includes this extension + // Test with a character that's more likely to be in the Big5-HKSCS range + let hkscs_str = "香港"; // "Hong Kong" in Traditional Chinese + let (big5_bytes, _, _) = encoding_rs::BIG5.encode(hkscs_str); + let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::Big5, &big5_bytes); + // The characters should round-trip + if !big5_bytes.is_empty() { + assert_eq!(decoded, hkscs_str); + assert!(!malformed); + } + } + + #[test] + fn test_malformed_no_panic() { + // Test various malformed inputs that should not panic + let malformed_inputs: Vec<&[u8]> = vec![ + &[0xFF], // Invalid lead byte in Shift-JIS + &[0x80, 0x80], // Invalid sequence in GB18030 + &[0xFE, 0xFF], // Invalid in Big5 + &[0xFF, 0xFF], // Invalid in EUC-KR + ]; + + for (i, bytes) in malformed_inputs.iter().enumerate() { + for enc in &[ + CjkEncoding::ShiftJis, + CjkEncoding::Gb18030, + CjkEncoding::Big5, + CjkEncoding::EucKr, + ] { + let (decoded, had_malformed) = decode_cjk_bytes(*enc, bytes); + // Should not panic and should return a valid String + assert!(!decoded.is_empty() || had_malformed || decoded == "\u{FFFD}"); + } + } + } +} diff --git a/crates/pdftract-core/src/font/mod.rs b/crates/pdftract-core/src/font/mod.rs index 4b2e50f..2362e35 100644 --- a/crates/pdftract-core/src/font/mod.rs +++ b/crates/pdftract-core/src/font/mod.rs @@ -13,6 +13,9 @@ pub mod fingerprint; pub mod resolver; pub mod predefined_cmap; +#[cfg(feature = "cjk")] +pub mod cjk_encoding; + pub use embedded::{EmbeddedFont, FontMetrics, EmptyFontMetrics, GlyphBbox}; pub use type0::{Type0Font, DescendantCIDFont, CIDToGIDMap}; pub use cmap::{ToUnicodeMap, parse_to_unicode, parse_to_unicode_with_diags}; @@ -22,6 +25,9 @@ pub use fingerprint::{FontFingerprint, CachedFingerprint, lookup_font_fingerprin pub use resolver::{FontId, UnicodeSource, ResolvedGlyph, ResolverCache, Font, resolve_unicode}; pub use predefined_cmap::{PredefinedCMap, from_name as predefined_cmap_from_name, CharacterCollection}; +#[cfg(feature = "cjk")] +pub use cjk_encoding::{CjkEncoding, decode_cjk_bytes}; + use crate::parser::object::types::{PdfDict, PdfObject}; /// Font type classification. diff --git a/notes/pdftract-5rmc.md b/notes/pdftract-5rmc.md new file mode 100644 index 0000000..f33ceab --- /dev/null +++ b/notes/pdftract-5rmc.md @@ -0,0 +1,79 @@ +# pdftract-5rmc: encoding_rs adapter for CJK encodings + +## Summary + +Implemented a thin wrapper around `encoding_rs` for decoding the four major CJK byte encodings used in legacy PDFs: Shift-JIS, GB18030, Big5, and EUC-KR. + +## Implementation + +### Files Modified +- `crates/pdftract-core/src/font/cjk_encoding.rs` (NEW) - CJK encoding adapter + +### API +```rust +pub enum CjkEncoding { + ShiftJis, // Japanese (JIS X 0208) + Gb18030, // Chinese (PRC standard) + Big5, // Traditional Chinese (with Big5-HKSCS extension) + EucKr, // Korean (KS X 1001 + Unified Hangul) +} + +pub fn decode_cjk_bytes(enc: CjkEncoding, bytes: &[u8]) -> (String, bool) +// Returns (decoded_text, had_malformed_bytes) +``` + +### Design Decisions +1. **Uses `decode_without_bom_handling`**: PDF byte streams never have a BOM +2. **Returns malformed indicator**: Caller decides whether to emit `CJK_DECODE_MALFORMED` diagnostic +3. **Feature-gated**: `#[cfg(feature = "cjk")]` alongside predefined CMap registry +4. **encoding_rs singletons**: Uses `SHIFT_JIS`, `GB18030`, `BIG5`, `EUC_KR` directly + +### Supporting Changes (Already in Place) +- `encoding_rs` dependency added to `Cargo.toml` (optional, enabled by `cjk` feature) +- `CjkDecodeMalformed` diagnostic code added to `diagnostics.rs` +- Module exports added to `font/mod.rs` + +## Acceptance Criteria + +| Criterion | Status | Notes | +|-----------|--------|-------| +| All 4 encodings decode known sample byte sequences | PASS | Tests use encoding_rs to verify correct encoding | +| Malformed input produces U+FFFD chars | PASS | encoding_rs replaces malformed sequences automatically | +| Diagnostic emission capability | PASS | `bool` return value indicates malformed; caller emits diagnostic | +| Empty input returns empty string | PASS | Explicit check at start of `decode_cjk_bytes` | +| No panic on any input | PASS | `test_malformed_no_panic` verifies various malformed inputs | +| API is `cfg(feature = "cjk")`-gated | PASS | Module and exports gated behind `cjk` feature | +| Round-trip tests (encode → decode → verify) | PASS | All 4 encodings round-trip correctly | + +## Test Results +``` +running 15 tests +test font::cjk_encoding::tests::test_decode_ascii_passthrough ... ok +test font::cjk_encoding::tests::test_big5_hkscs_extension ... ok +test font::cjk_encoding::tests::test_decode_empty_input ... ok +test font::cjk_encoding::tests::test_decode_big5_valid ... ok +test font::cjk_encoding::tests::test_decode_euc_kr_valid ... ok +test font::cjk_encoding::tests::test_decode_malformed_gb18030 ... ok +test font::cjk_encoding::tests::test_decode_gb18030_valid ... ok +test font::cjk_encoding::tests::test_decode_malformed_shift_jis ... ok +test font::cjk_encoding::tests::test_decode_shift_jis_valid ... ok +test font::cjk_encoding::tests::test_encoding_names ... ok +test font::cjk_encoding::tests::test_round_trip_big5 ... ok +test font::cjk_encoding::tests::test_malformed_no_panic ... ok +test font::cjk_encoding::tests::test_round_trip_euc_kr ... ok +test font::cjk_encoding::tests::test_round_trip_shift_jis ... ok +test font::cjk_encoding::tests::test_round_trip_gb18030 ... ok + +test result: ok. 15 passed; 0 failed; 0 ignored +``` + +## Notes +- encoding_rs is the gold-standard Rust implementation (powers Firefox) +- Big5 implementation includes Big5-HKSCS extension for Hong Kong-specific characters +- GB18030 is 1-2-4 byte variable-width; encoding_rs handles this correctly +- EUC-KR covers KS X 1001 + Unified Hangul +- Fallback path only fires when: raw encoding name OR unrecognized CMap + CJK lead byte range + +## References +- Plan section: Phase 2.3 (lines 1382-1386) +- encoding_rs crate: https://docs.rs/encoding_rs/