feat(pdftract-5rmc): implement encoding_rs adapter for CJK encodings
Implements the decode_cjk_bytes() function wrapping encoding_rs for the four major CJK byte encodings used in legacy PDFs: Shift-JIS, GB18030, Big5, and EUC-KR. Used by the Phase 2.3 fallback path when fonts use raw byte encodings instead of proper CMap/ToUnicode mappings.

- Add CjkEncoding enum with ShiftJis, Gb18030, Big5, EucKr variants
- Implement decode_cjk_bytes(enc, bytes) -> (String, bool)
- Use decode_without_bom_handling (PDF byte streams never have BOM)
- Return bool indicating malformed bytes for caller to emit diagnostic
- Add 15 tests covering valid input, malformed input, empty input, round-trips

Supporting changes:
- Add encoding_rs dependency (optional, gated by cjk feature)
- Add CjkDecodeMalformed diagnostic code
- Export CjkEncoding and decode_cjk_bytes from font module

Refs: pdftract-5rmc, plan.md Phase 2.3 (lines 1382-1386)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
5ef3fa6d28
commit
4991243475
5 changed files with 411 additions and 1 deletions
|
|
@ -33,6 +33,7 @@ phf = "0.11"
|
|||
tracing = { workspace = true }
|
||||
dashmap = "6.1"
|
||||
smallvec = "1.13"
|
||||
encoding_rs = { version = "0.8", optional = true }
|
||||
|
||||
[features]
|
||||
default = ["serde"]
|
||||
|
|
@ -43,7 +44,7 @@ full-render = ["dep:pdfium-render", "ocr"] # Enable PDFium-based rendering (req
|
|||
proptest = []
|
||||
fuzzing = [] # Enable cfg(fuzzing) for fuzz harnesses
|
||||
shape-db = [] # Enable glyph shape database (Level 4 encoding fallback)
|
||||
cjk = [] # Enable CJK text extraction via predefined CMap registry (~1.2 MB binary size increase)
|
||||
cjk = ["dep:encoding_rs"] # Enable CJK text extraction via predefined CMap registry (~1.2 MB binary size increase)
|
||||
|
||||
[dev-dependencies]
|
||||
chrono = "0.4"
|
||||
|
|
|
|||
|
|
@ -575,6 +575,16 @@ pub enum DiagCode {
|
|||
/// Phase origin: 2.2
|
||||
FontEncodingDifferenceOutOfRange,
|
||||
|
||||
/// Malformed byte sequence in CJK encoding fallback
|
||||
///
|
||||
/// Emitted when a CJK byte encoding (Shift-JIS, GB18030, Big5, or EUC-KR)
|
||||
/// contains malformed byte sequences. The offending bytes are replaced
|
||||
/// with U+FFFD (Unicode REPLACEMENT CHARACTER).
|
||||
///
|
||||
/// Phase origin: 2.3
|
||||
#[cfg(feature = "cjk")]
|
||||
CjkDecodeMalformed,
|
||||
|
||||
// === OCR_* codes ===
|
||||
|
||||
/// JBIG2 decoder not available
|
||||
|
|
@ -845,6 +855,9 @@ impl DiagCode {
|
|||
| DiagCode::FontCidtogidmapTruncated
|
||||
| DiagCode::FontEncodingDifferenceOutOfRange => "FONT",
|
||||
|
||||
#[cfg(feature = "cjk")]
|
||||
DiagCode::CjkDecodeMalformed => "CJK",
|
||||
|
||||
// OCR_*
|
||||
DiagCode::OcrJbig2Unsupported
|
||||
| DiagCode::OcrJpxUnsupported
|
||||
|
|
@ -939,6 +952,8 @@ impl DiagCode {
|
|||
DiagCode::FontUnsupported => "FONT_UNSUPPORTED",
|
||||
DiagCode::FontCidtogidmapTruncated => "FONT_CIDTOGIDMAP_TRUNCATED",
|
||||
DiagCode::FontEncodingDifferenceOutOfRange => "ENCODING_DIFFERENCE_OUT_OF_RANGE",
|
||||
#[cfg(feature = "cjk")]
|
||||
DiagCode::CjkDecodeMalformed => "CJK_DECODE_MALFORMED",
|
||||
DiagCode::OcrJbig2Unsupported => "OCR_JBIG2_UNSUPPORTED",
|
||||
DiagCode::OcrJpxUnsupported => "OCR_JPX_UNSUPPORTED",
|
||||
DiagCode::OcrCcittUnsupported => "OCR_CCITT_UNSUPPORTED",
|
||||
|
|
@ -1039,6 +1054,9 @@ impl DiagCode {
|
|||
| DiagCode::CacheEntryCorrupt
|
||||
| DiagCode::CacheWriteFailed => Severity::Warning,
|
||||
|
||||
#[cfg(feature = "cjk")]
|
||||
DiagCode::CjkDecodeMalformed => Severity::Warning,
|
||||
|
||||
DiagCode::StreamBomb
|
||||
| DiagCode::PageOutOfRange
|
||||
| DiagCode::RemoteFetchInterrupted
|
||||
|
|
@ -1498,6 +1516,15 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[
|
|||
phase: "2.2",
|
||||
suggested_action: "A /Differences array contains a character code outside 0-255; the code was clamped",
|
||||
},
|
||||
#[cfg(feature = "cjk")]
|
||||
DiagInfo {
|
||||
code: DiagCode::CjkDecodeMalformed,
|
||||
category: "CJK",
|
||||
severity: Severity::Warning,
|
||||
recoverable: true,
|
||||
phase: "2.3",
|
||||
suggested_action: "The CJK byte sequence contained malformed bytes, replaced with U+FFFD",
|
||||
},
|
||||
// === OCR_* codes ===
|
||||
DiagInfo {
|
||||
code: DiagCode::OcrJbig2Unsupported,
|
||||
|
|
|
|||
297
crates/pdftract-core/src/font/cjk_encoding.rs
Normal file
297
crates/pdftract-core/src/font/cjk_encoding.rs
Normal file
|
|
@ -0,0 +1,297 @@
|
|||
//! CJK byte encoding adapter for raw byte fallback.
|
||||
//!
|
||||
//! This module provides a thin wrapper around `encoding_rs` for decoding
|
||||
//! the four major CJK byte encodings used in legacy PDFs:
|
||||
//! - Shift-JIS (Japanese)
|
||||
//! - GB18030 (Chinese)
|
||||
//! - Big5 (Traditional Chinese, with Big5-HKSCS extension)
|
||||
//! - EUC-KR (Korean, covering KS X 1001 + Unified Hangul)
|
||||
//!
|
||||
//! These are FALLBACK encodings used when:
|
||||
//! - A font's encoding indicates a raw byte encoding (e.g., /Encoding /ShiftJIS)
|
||||
//! - No CMap or ToUnicode is present
|
||||
//! - The lead byte is in a CJK range
|
||||
//!
|
||||
//! The primary text extraction path uses predefined CMaps + ToUnicode; this
|
||||
//! module is only for legacy PDFs that don't provide proper Unicode mappings.
|
||||
|
||||
/// CJK byte encoding identifier.
|
||||
///
|
||||
/// Represents the four major legacy CJK encodings used in PDFs. These are
|
||||
/// raw byte encodings that need to be decoded to Unicode for text extraction.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum CjkEncoding {
|
||||
/// Shift-JIS (JIS X 0208 + extensions)
|
||||
///
|
||||
/// The most common encoding for Japanese PDFs. Variable-width: 1 byte for
|
||||
/// ASCII (0x00-0x7F), 2 bytes for JIS X 0208 characters (lead byte 0x81-0x9F,
|
||||
/// 0xE0-0xEF; vendor extension rows use lead bytes up to 0xFC) and 1 byte for half-width katakana (0xA1-0xDF).
|
||||
ShiftJis,
|
||||
|
||||
/// GB18030 (Chinese national standard)
|
||||
///
|
||||
/// The PRC national standard (mandatory for software sold in mainland China) and a common legacy encoding in Simplified Chinese PDFs. Variable-width: 1, 2, or 4 bytes.
|
||||
/// Covers all Unicode code points assigned to Chinese characters.
|
||||
Gb18030,
|
||||
|
||||
/// Big5 (Traditional Chinese, with Big5-HKSCS extension)
|
||||
///
|
||||
/// Common encoding for Traditional Chinese PDFs (Taiwan, Hong Kong).
|
||||
/// Variable-width: 1 byte for ASCII (0x00-0x7F), 2 bytes for Big5 characters
|
||||
/// (lead byte 0x81-0xFE). The encoding_rs implementation includes the
|
||||
/// Big5-HKSCS extension for Hong Kong-specific characters.
|
||||
Big5,
|
||||
|
||||
/// EUC-KR (KS X 1001 + Unified Hangul)
|
||||
///
|
||||
/// The standard encoding for Korean PDFs. Variable-width: 1 byte for ASCII
|
||||
/// (0x00-0x7F), 2 bytes for KS X 1001 characters (lead byte 0x81-0xFE).
|
||||
/// The encoding_rs implementation follows the WHATWG definition: KS X 1001 plus the Unified Hangul Code (Windows-949) extension.
|
||||
EucKr,
|
||||
}
|
||||
|
||||
impl CjkEncoding {
|
||||
/// Get the encoding_rs singleton for this encoding.
|
||||
fn encoding(&self) -> &'static encoding_rs::Encoding {
|
||||
match self {
|
||||
CjkEncoding::ShiftJis => encoding_rs::SHIFT_JIS,
|
||||
CjkEncoding::Gb18030 => encoding_rs::GB18030,
|
||||
CjkEncoding::Big5 => encoding_rs::BIG5,
|
||||
CjkEncoding::EucKr => encoding_rs::EUC_KR,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the name of this encoding for diagnostic messages.
|
||||
pub fn name(&self) -> &'static str {
|
||||
match self {
|
||||
CjkEncoding::ShiftJis => "Shift-JIS",
|
||||
CjkEncoding::Gb18030 => "GB18030",
|
||||
CjkEncoding::Big5 => "Big5",
|
||||
CjkEncoding::EucKr => "EUC-KR",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Decode CJK-encoded bytes to a String.
|
||||
///
|
||||
/// This is a fallback path for legacy PDFs that use raw byte encodings instead
|
||||
/// of proper CMap/ToUnicode mappings. The function uses `encoding_rs` to decode
|
||||
/// the byte sequence according to the specified encoding.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `enc` - The CJK encoding to use for decoding
|
||||
/// * `bytes` - The raw byte sequence to decode
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A tuple `(String, bool)` where:
|
||||
/// - The `String` is the decoded Unicode text (with U+FFFD for malformed bytes)
|
||||
/// - The `bool` is `true` if any malformed bytes were encountered, `false` otherwise
|
||||
///
|
||||
/// # Behavior
|
||||
///
|
||||
/// - Empty input returns an empty string with `malformed = false`
|
||||
/// - Malformed byte sequences are replaced with U+FFFD (Unicode REPLACEMENT CHARACTER)
|
||||
/// - No panic occurs on any input
|
||||
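/// - No BOM is expected in font-encoded PDF string bytes, so we use `decode_without_bom_handling`; this also keeps leading bytes that merely resemble a BOM (e.g. 0xEF 0xBB 0xBF) from switching the decoder to UTF-8/UTF-16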
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use pdftract_core::font::cjk_encoding::{decode_cjk_bytes, CjkEncoding};
|
||||
///
|
||||
/// // Round-trip: encode "テスト" as Shift-JIS bytes, decode -> get "テスト" back
|
||||
/// let test_str = "テスト";
|
||||
/// let (shift_jis_bytes, _, _) = encoding_rs::SHIFT_JIS.encode(test_str);
|
||||
/// let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::ShiftJis, &shift_jis_bytes);
|
||||
/// assert_eq!(decoded, test_str);
|
||||
/// assert!(!malformed);
|
||||
/// ```
|
||||
pub fn decode_cjk_bytes(enc: CjkEncoding, bytes: &[u8]) -> (String, bool) {
|
||||
if bytes.is_empty() {
|
||||
return (String::new(), false);
|
||||
}
|
||||
|
||||
let encoding = enc.encoding();
|
||||
let (cow, had_malformed) = encoding.decode_without_bom_handling(bytes);
|
||||
|
||||
// The encoding_rs decoder already replaces malformed sequences with U+FFFD
|
||||
// We just need to convert Cow<str> to String and report the malformed status
|
||||
(cow.into_owned(), had_malformed)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_decode_shift_jis_valid() {
|
||||
// "テスト" in Shift-JIS
|
||||
// 0x83 0x65 = テ, 0x83 0x58 = ス, 0x83 0x67 = ト
|
||||
let bytes = [0x83, 0x65, 0x83, 0x58, 0x83, 0x67];
|
||||
let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::ShiftJis, &bytes);
|
||||
assert_eq!(decoded, "テスト");
|
||||
assert!(!malformed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_gb18030_valid() {
|
||||
// "中文测试" in GB18030
|
||||
// Verify correct encoding by encoding the string first
|
||||
let test_str = "中文测试";
|
||||
let (bytes, _, _) = encoding_rs::GB18030.encode(test_str);
|
||||
let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::Gb18030, &bytes);
|
||||
assert_eq!(decoded, test_str);
|
||||
assert!(!malformed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_big5_valid() {
|
||||
// "測試" in Big5 (Traditional Chinese)
|
||||
// Verify correct encoding by encoding the string first
|
||||
let test_str = "測試";
|
||||
let (bytes, _, _) = encoding_rs::BIG5.encode(test_str);
|
||||
let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::Big5, &bytes);
|
||||
assert_eq!(decoded, test_str);
|
||||
assert!(!malformed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_euc_kr_valid() {
|
||||
// "한글" in EUC-KR (Korean)
|
||||
// Verify correct encoding by encoding the string first
|
||||
let test_str = "한글";
|
||||
let (bytes, _, _) = encoding_rs::EUC_KR.encode(test_str);
|
||||
let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::EucKr, &bytes);
|
||||
assert_eq!(decoded, test_str);
|
||||
assert!(!malformed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_empty_input() {
|
||||
let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::ShiftJis, &[]);
|
||||
assert_eq!(decoded, "");
|
||||
assert!(!malformed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_ascii_passthrough() {
|
||||
// ASCII should pass through unchanged in all encodings
|
||||
let bytes = b"Hello, World!";
|
||||
for enc in &[
|
||||
CjkEncoding::ShiftJis,
|
||||
CjkEncoding::Gb18030,
|
||||
CjkEncoding::Big5,
|
||||
CjkEncoding::EucKr,
|
||||
] {
|
||||
let (decoded, malformed) = decode_cjk_bytes(*enc, bytes);
|
||||
assert_eq!(decoded, "Hello, World!");
|
||||
assert!(!malformed);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_malformed_shift_jis() {
|
||||
// Invalid Shift-JIS: lead byte 0x83 followed by ASCII range byte
|
||||
// This is not a valid Shift-JIS sequence
|
||||
let bytes = [0x83, 0x20]; // 0x83 is a lead byte, 0x20 is ASCII space
|
||||
let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::ShiftJis, &bytes);
|
||||
// Should contain replacement character and report malformed
|
||||
assert!(malformed);
|
||||
assert!(decoded.contains('\u{FFFD}') || decoded.len() < 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_malformed_gb18030() {
|
||||
// Invalid GB18030: incomplete multi-byte sequence
|
||||
let bytes = [0x81]; // Lead byte without trail byte
|
||||
let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::Gb18030, &bytes);
|
||||
assert!(malformed);
|
||||
// Should contain replacement character
|
||||
assert!(decoded.contains('\u{FFFD}') || decoded == "\u{FFFD}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_round_trip_shift_jis() {
|
||||
let test_str = "テスト";
|
||||
let (shift_jis_bytes, _, _) = encoding_rs::SHIFT_JIS.encode(test_str);
|
||||
let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::ShiftJis, &shift_jis_bytes);
|
||||
assert_eq!(decoded, test_str);
|
||||
assert!(!malformed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_round_trip_gb18030() {
|
||||
let test_str = "中文测试";
|
||||
let (gb18030_bytes, _, _) = encoding_rs::GB18030.encode(test_str);
|
||||
let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::Gb18030, &gb18030_bytes);
|
||||
assert_eq!(decoded, test_str);
|
||||
assert!(!malformed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_round_trip_big5() {
|
||||
let test_str = "測試";
|
||||
let (big5_bytes, _, _) = encoding_rs::BIG5.encode(test_str);
|
||||
let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::Big5, &big5_bytes);
|
||||
assert_eq!(decoded, test_str);
|
||||
assert!(!malformed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_round_trip_euc_kr() {
|
||||
let test_str = "한글";
|
||||
let (euc_kr_bytes, _, _) = encoding_rs::EUC_KR.encode(test_str);
|
||||
let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::EucKr, &euc_kr_bytes);
|
||||
assert_eq!(decoded, test_str);
|
||||
assert!(!malformed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encoding_names() {
|
||||
assert_eq!(CjkEncoding::ShiftJis.name(), "Shift-JIS");
|
||||
assert_eq!(CjkEncoding::Gb18030.name(), "GB18030");
|
||||
assert_eq!(CjkEncoding::Big5.name(), "Big5");
|
||||
assert_eq!(CjkEncoding::EucKr.name(), "EUC-KR");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_big5_hkscs_extension() {
|
||||
// Big5-HKSCS adds Hong Kong-specific characters
|
||||
// The encoding_rs BIG5 implementation includes this extension
|
||||
// NOTE(review): "香港" is covered by plain Big5, so this is a basic round-trip rather than an HKSCS-specific check
|
||||
let hkscs_str = "香港"; // "Hong Kong" in Traditional Chinese
|
||||
let (big5_bytes, _, _) = encoding_rs::BIG5.encode(hkscs_str);
|
||||
let (decoded, malformed) = decode_cjk_bytes(CjkEncoding::Big5, &big5_bytes);
|
||||
// The characters should round-trip
|
||||
if !big5_bytes.is_empty() {
|
||||
assert_eq!(decoded, hkscs_str);
|
||||
assert!(!malformed);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_malformed_no_panic() {
|
||||
// Test various malformed inputs that should not panic
|
||||
let malformed_inputs: Vec<&[u8]> = vec![
|
||||
&[0xFF], // Invalid lead byte in Shift-JIS
|
||||
&[0x80, 0x80], // Invalid sequence in GB18030
|
||||
&[0xFE, 0xFF], // Invalid in Big5
|
||||
&[0xFF, 0xFF], // Invalid in EUC-KR
|
||||
];
|
||||
|
||||
for (i, bytes) in malformed_inputs.iter().enumerate() {
|
||||
for enc in &[
|
||||
CjkEncoding::ShiftJis,
|
||||
CjkEncoding::Gb18030,
|
||||
CjkEncoding::Big5,
|
||||
CjkEncoding::EucKr,
|
||||
] {
|
||||
let (decoded, had_malformed) = decode_cjk_bytes(*enc, bytes);
|
||||
// Should not panic and should return a valid String
|
||||
assert!(!decoded.is_empty() || had_malformed || decoded == "\u{FFFD}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -13,6 +13,9 @@ pub mod fingerprint;
|
|||
pub mod resolver;
|
||||
pub mod predefined_cmap;
|
||||
|
||||
#[cfg(feature = "cjk")]
|
||||
pub mod cjk_encoding;
|
||||
|
||||
pub use embedded::{EmbeddedFont, FontMetrics, EmptyFontMetrics, GlyphBbox};
|
||||
pub use type0::{Type0Font, DescendantCIDFont, CIDToGIDMap};
|
||||
pub use cmap::{ToUnicodeMap, parse_to_unicode, parse_to_unicode_with_diags};
|
||||
|
|
@ -22,6 +25,9 @@ pub use fingerprint::{FontFingerprint, CachedFingerprint, lookup_font_fingerprin
|
|||
pub use resolver::{FontId, UnicodeSource, ResolvedGlyph, ResolverCache, Font, resolve_unicode};
|
||||
pub use predefined_cmap::{PredefinedCMap, from_name as predefined_cmap_from_name, CharacterCollection};
|
||||
|
||||
#[cfg(feature = "cjk")]
|
||||
pub use cjk_encoding::{CjkEncoding, decode_cjk_bytes};
|
||||
|
||||
use crate::parser::object::types::{PdfDict, PdfObject};
|
||||
|
||||
/// Font type classification.
|
||||
|
|
|
|||
79
notes/pdftract-5rmc.md
Normal file
79
notes/pdftract-5rmc.md
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
# pdftract-5rmc: encoding_rs adapter for CJK encodings
|
||||
|
||||
## Summary
|
||||
|
||||
Implemented a thin wrapper around `encoding_rs` for decoding the four major CJK byte encodings used in legacy PDFs: Shift-JIS, GB18030, Big5, and EUC-KR.
|
||||
|
||||
## Implementation
|
||||
|
||||
### Files Modified
|
||||
- `crates/pdftract-core/src/font/cjk_encoding.rs` (NEW) - CJK encoding adapter
|
||||
|
||||
### API
|
||||
```rust
|
||||
pub enum CjkEncoding {
|
||||
ShiftJis, // Japanese (JIS X 0208)
|
||||
Gb18030, // Chinese (PRC standard)
|
||||
Big5, // Traditional Chinese (with Big5-HKSCS extension)
|
||||
EucKr, // Korean (KS X 1001 + Unified Hangul)
|
||||
}
|
||||
|
||||
pub fn decode_cjk_bytes(enc: CjkEncoding, bytes: &[u8]) -> (String, bool)
|
||||
// Returns (decoded_text, had_malformed_bytes)
|
||||
```
|
||||
|
||||
### Design Decisions
|
||||
1. **Uses `decode_without_bom_handling`**: PDF byte streams never have a BOM
|
||||
2. **Returns malformed indicator**: Caller decides whether to emit `CJK_DECODE_MALFORMED` diagnostic
|
||||
3. **Feature-gated**: `#[cfg(feature = "cjk")]` alongside predefined CMap registry
|
||||
4. **encoding_rs singletons**: Uses `SHIFT_JIS`, `GB18030`, `BIG5`, `EUC_KR` directly
|
||||
|
||||
### Supporting Changes (Already in Place)
|
||||
- `encoding_rs` dependency added to `Cargo.toml` (optional, enabled by `cjk` feature)
|
||||
- `CjkDecodeMalformed` diagnostic code added to `diagnostics.rs`
|
||||
- Module exports added to `font/mod.rs`
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
| Criterion | Status | Notes |
|
||||
|-----------|--------|-------|
|
||||
| All 4 encodings decode known sample byte sequences | PASS | Tests use encoding_rs to verify correct encoding |
|
||||
| Malformed input produces U+FFFD chars | PASS | encoding_rs replaces malformed sequences automatically |
|
||||
| Diagnostic emission capability | PASS | `bool` return value indicates malformed; caller emits diagnostic |
|
||||
| Empty input returns empty string | PASS | Explicit check at start of `decode_cjk_bytes` |
|
||||
| No panic on any input | PASS | `test_malformed_no_panic` verifies various malformed inputs |
|
||||
| API is `cfg(feature = "cjk")`-gated | PASS | Module and exports gated behind `cjk` feature |
|
||||
| Round-trip tests (encode → decode → verify) | PASS | All 4 encodings round-trip correctly |
|
||||
|
||||
## Test Results
|
||||
```
|
||||
running 15 tests
|
||||
test font::cjk_encoding::tests::test_decode_ascii_passthrough ... ok
|
||||
test font::cjk_encoding::tests::test_big5_hkscs_extension ... ok
|
||||
test font::cjk_encoding::tests::test_decode_empty_input ... ok
|
||||
test font::cjk_encoding::tests::test_decode_big5_valid ... ok
|
||||
test font::cjk_encoding::tests::test_decode_euc_kr_valid ... ok
|
||||
test font::cjk_encoding::tests::test_decode_malformed_gb18030 ... ok
|
||||
test font::cjk_encoding::tests::test_decode_gb18030_valid ... ok
|
||||
test font::cjk_encoding::tests::test_decode_malformed_shift_jis ... ok
|
||||
test font::cjk_encoding::tests::test_decode_shift_jis_valid ... ok
|
||||
test font::cjk_encoding::tests::test_encoding_names ... ok
|
||||
test font::cjk_encoding::tests::test_round_trip_big5 ... ok
|
||||
test font::cjk_encoding::tests::test_malformed_no_panic ... ok
|
||||
test font::cjk_encoding::tests::test_round_trip_euc_kr ... ok
|
||||
test font::cjk_encoding::tests::test_round_trip_shift_jis ... ok
|
||||
test font::cjk_encoding::tests::test_round_trip_gb18030 ... ok
|
||||
|
||||
test result: ok. 15 passed; 0 failed; 0 ignored
|
||||
```
|
||||
|
||||
## Notes
|
||||
- encoding_rs is the gold-standard Rust implementation (powers Firefox)
|
||||
- Big5 implementation includes Big5-HKSCS extension for Hong Kong-specific characters
|
||||
- GB18030 is 1-2-4 byte variable-width; encoding_rs handles this correctly
|
||||
- EUC-KR covers KS X 1001 + Unified Hangul
|
||||
- Fallback path only fires when: raw encoding name OR unrecognized CMap + CJK lead byte range
|
||||
|
||||
## References
|
||||
- Plan section: Phase 2.3 (lines 1382-1386)
|
||||
- encoding_rs crate: https://docs.rs/encoding_rs/
|
||||
Loading…
Add table
Reference in a new issue