Implements Phase 4.7 Correction Pipeline step 3: mojibake detection and repair for Latin-1 bytes misinterpreted as UTF-8.

Changes:
- Add layout::correction module with detect_and_repair_mojibake function
- Implement CorrectableText trait for mutable text access
- Add trait implementations for hybrid::Span and schema::SpanJson
- Make encoding_rs a non-optional dependency (was cjk-gated)
- Detection heuristic: 2+ occurrences of telltale sequences (é, è, ’, etc.)
- Re-decode via encoding_rs::WINDOWS_1252 when detected
- Accept repair only if readability score improves by >0.05 epsilon
- Fast-path pass-through for ASCII-only and clean UTF-8 text

Closes: pdftract-5qj50
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
82 lines
2.4 KiB
TOML
82 lines
2.4 KiB
TOML
# Package metadata for the pdftract-core crate.
# Keys written as `<key>.workspace = true` inherit their value from the
# workspace root manifest instead of being set here.
[package]
name = "pdftract-core"
version.workspace = true
edition.workspace = true
license.workspace = true
publish = true
repository.workspace = true
rust-version.workspace = true

# Runtime dependencies, sorted alphabetically.
# Entries marked `optional = true` are only compiled when a feature in
# [features] enables them; `{ workspace = true }` entries take their version
# from the workspace root manifest.
[dependencies]
anyhow = { workspace = true }
dashmap = "6.1"
# Unconditional (not feature-gated): used for mojibake repair via
# encoding_rs::WINDOWS_1252.
encoding_rs = "0.8"
flate2 = { workspace = true }
hex = "0.4"
image = { version = "0.25", optional = true }
indexmap = "2.2"
leptonica-plumbing = { version = "1.4", optional = true }
lzw = { workspace = true }
memchr = { workspace = true }
memmap2 = "0.9"
# NOTE(review): owned_ttf_parser wraps a specific ttf-parser release; confirm
# that 0.21 here and ttf-parser 0.24 below do not pull in two ttf-parser copies.
owned_ttf_parser = "0.21"
pdfium-render = { version = "0.9", optional = true }
phf = "0.11"
quick-xml = { version = "0.36", optional = true }
rand = "0.8"
rayon = "1.10"
regex = "1.10"
schemars = { version = "1.2", features = ["derive"], optional = true }
secrecy = { workspace = true }
serde = { version = "1.0", features = ["derive"], optional = true }
serde_json = { version = "1.0", optional = true }
serde_yaml = { version = "0.9", optional = true }
sha2 = "0.10"
smallvec = "1.13"
tempfile = "3.10"
# No entry in [features] references `dep:tesseract`, so Cargo exposes this
# optional dependency through an implicit feature named `tesseract`.
tesseract = { version = "0.15", optional = true }
thiserror = { workspace = true }
tracing = { workspace = true }
ttf-parser = "0.24"
unicode-normalization = { workspace = true }
url = { version = "2.5", optional = true }
zstd = "0.13"

# Cargo features. `dep:<name>` entries switch on the optional dependency of
# that name; an empty list defines a pure cfg flag that pulls in no crates.
[features]
default = ["serde"]
# Enable CJK text extraction via predefined CMap registry
# (~1.2 MB binary size increase).
cjk = []
# Enable PDFium-based rendering (requires ocr).
full-render = ["dep:pdfium-render", "ocr"]
# Enable cfg(fuzzing) for fuzz harnesses.
fuzzing = []
# Enable OCR path (image compositing + preprocessing + HOCR parsing).
ocr = ["dep:image", "dep:leptonica-plumbing", "dep:quick-xml"]
# Enable extraction profiles (Phase 7.10).
profiles = ["dep:serde_yaml"]
proptest = []
# Enable visual citation receipts (SVG clip generation).
receipts = []
# Enable remote HTTP source (Phase 1.8).
remote = ["dep:url"]
# `serde` already enables dep:schemars, so this feature currently adds nothing
# beyond `serde` itself.
schemars = ["dep:schemars", "serde"]
serde = ["dep:serde", "dep:serde_json", "dep:schemars"]
# Enable glyph shape database (Level 4 encoding fallback).
shape-db = []

# Dependencies used only by tests, examples and benchmarks, sorted
# alphabetically.
# `regex` and `tempfile` repeat the non-optional [dependencies] entries with
# identical version requirements. `serde`, `serde_json` and `quick-xml` are
# optional in [dependencies] but are listed here without `optional`.
[dev-dependencies]
chrono = "0.4"
criterion = "0.5"
filetime = "0.2"
libc = "0.2"
proptest = "1.4"
quick-xml = "0.36"
regex = "1.10"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
tempfile = "3.10"

# Benchmark target `table_detection`.
# `harness = false` turns off the built-in libtest harness so the benchmark
# provides its own entry point (presumably Criterion, a dev-dependency here).
[[bench]]
name = "table_detection"
harness = false

# Benchmark target `wordlist`.
# `harness = false` turns off the built-in libtest harness so the benchmark
# provides its own entry point (presumably Criterion, a dev-dependency here).
[[bench]]
name = "wordlist"
harness = false

# Dependencies available to the crate's build script only.
# `phf_codegen` uses the same "0.11" requirement as the runtime `phf`
# dependency.
[build-dependencies]
phf_codegen = "0.11"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"