pdftract/crates/pdftract-core/Cargo.toml
jedarden 06079a16b2
Some checks are pending
Schema Generation Validation / Validate JSON Schema (push) Waiting to run
Schema Generation Validation / Validate JSON Syntax (push) Waiting to run
feat(pdftract-4bylb): implement Docstrum fallback for reading order
Implement O'Gorman 1993 Docstrum algorithm for reading order detection
on irregular layouts (magazines with sidebars) where XY-cut produces
fragmented regions.

Implementation:
- k=5 nearest neighbors per block (Docstrum standard)
- Euclidean center-to-center distance in PDF user space
- Angle constraints: ±30° from horizontal (within-line) and vertical (between-line)
- Root detection: nodes with no incoming edges from blocks above
- Root sorting by (column ASC, y DESC)
- DFS traversal per component in y-then-x order

Acceptance criteria PASS:
- Magazine main+sidebar: 2 components; main first, sidebar second
- Pathological scattered blocks: each block is its own root, visited in (column ASC, y DESC) order
- All-one-line horizontal: 1 component, left-to-right
- All-one-column vertical: 1 component, top-to-bottom

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 04:16:24 -04:00

103 lines
3.4 KiB
TOML

# Package metadata for pdftract-core. Every key written as `<key>.workspace = true`
# is inherited from the workspace root manifest instead of being set here.
[package]
name = "pdftract-core"
version.workspace = true
edition.workspace = true
license.workspace = true
publish = true
repository.workspace = true
rust-version.workspace = true
# Runtime dependencies. Entries written as `<name>.workspace = true` take their
# version requirement from the workspace root; all others declare their own.
[dependencies]
# Always compiled.
anyhow.workspace = true
base64.workspace = true
bytes = "1"
chrono = "0.4"
dashmap = "6.1"
encoding_rs = "0.8"
flate2.workspace = true
hex = "0.4"
hmac = "0.12"
indexmap = "2.2"
lzw.workspace = true
memchr.workspace = true
memmap2 = "0.9"
# NOTE(review): ttf-parser is also listed directly at 0.24 further down; confirm
# that owned_ttf_parser 0.21 is meant to sit alongside it at a different release.
owned_ttf_parser = "0.21"
parking_lot = "0.12"
phf = "0.11"
rand = "0.8"
rayon = "1.10"
regex = "1.10"
secrecy.workspace = true
sha2 = "0.10"
smallvec = "1.13"
strsim = "0.11"
tempfile = "3.10"
thiserror.workspace = true
tracing.workspace = true
ttf-parser = "0.24"
unicode-bidi.workspace = true
unicode-normalization.workspace = true
unicode-segmentation = "1.11"
zstd = "0.13"
# Optional: pulled in by the `serde` feature (schemars also by `schemars`).
schemars = { version = "1.2", features = ["derive"], optional = true }
serde = { version = "1.0", features = ["derive", "rc"], optional = true }
serde_json = { version = "1.0", optional = true }
# Optional: pulled in by the `quick-xml` feature.
quick-xml = { version = "0.36", optional = true }
# Optional: pulled in by the `profiles` feature.
serde_yaml = { version = "0.9", optional = true }
# Optional: pulled in by the `decrypt` feature.
aes = { version = "0.8", optional = true }
cbc = { version = "0.1", features = ["std"], optional = true }
cipher = { version = "0.4", features = ["block-padding"], optional = true }
digest = { version = "0.10", optional = true }
md-5 = { version = "0.10", optional = true }
rc4 = { version = "0.1", optional = true }
# Optional: pulled in by the `ocr` feature.
image = { version = "0.25", optional = true }
imageproc = { version = "0.26", optional = true }
leptonica-plumbing = { version = "1.4", optional = true }
# Optional: pulled in by the `full-render` feature.
pdfium-render = { version = "0.9", optional = true }
# Optional: pulled in by the `remote` feature.
lru = { version = "0.12", optional = true }
ureq = { version = "2.10", default-features = false, features = ["tls"], optional = true }
url = { version = "2.5", optional = true }
# Optional: each of these is switched on by the feature that shares its name.
rustls = { version = "0.23", optional = true }
tesseract = { version = "0.15", optional = true }
# Feature flags. Every optional dependency is wired in through an explicit
# `dep:` entry so the whole feature surface is visible in this table.
# `default` comes first; the remaining features are sorted alphabetically.
[features]
default = ["serde", "decrypt", "quick-xml"]
# Enable CJK text extraction via predefined CMap registry (~1.2 MB binary size increase).
cjk = []
# Enable PDF decryption (RC4/AES-128/AES-256).
decrypt = ["dep:aes", "dep:rc4", "dep:md-5", "dep:cbc", "dep:cipher", "dep:digest"]
# Enable PDFium-based rendering (requires ocr).
full-render = ["dep:pdfium-render", "ocr"]
# Enable cfg(fuzzing) for fuzz harnesses.
fuzzing = []
# Enable OCR path (image compositing + preprocessing + HOCR parsing).
# NOTE(review): this does not turn on `tesseract`; confirm whether the OCR path
# needs that crate or whether it is meant to stay a separate opt-in.
ocr = ["dep:image", "dep:imageproc", "dep:leptonica-plumbing"]
# Enable extraction profiles (Phase 7.10).
profiles = ["dep:serde_yaml"]
# Marker feature; enables no dependencies by itself.
proptest = []
# Enable quick-xml for conformance detection (Phase 1.4).
quick-xml = ["dep:quick-xml"]
# Enable visual citation receipts (SVG clip generation).
receipts = []
# Enable remote HTTP source (Phase 1.8).
# NOTE(review): this does not turn on `rustls`; confirm whether that is intended.
remote = ["dep:url", "dep:ureq", "dep:lru"]
# Explicit spelling of the feature Cargo previously derived implicitly from the
# optional `rustls` dependency; same name, same effect.
rustls = ["dep:rustls"]
# Alias that enables schemars together with the serde stack.
schemars = ["dep:schemars", "serde"]
# Serialization support: serde, serde_json and schemars.
serde = ["dep:serde", "dep:serde_json", "dep:schemars"]
# Enable glyph shape database (Level 4 encoding fallback).
shape-db = []
# Explicit spelling of the feature Cargo previously derived implicitly from the
# optional `tesseract` dependency; same name, same effect.
tesseract = ["dep:tesseract"]
# Crates used only by tests, examples and benchmarks.
# quick-xml, serde and serde_json are optional in [dependencies]; listing them
# here makes them available to test code whichever features are selected.
[dev-dependencies]
chrono = "0.4"
criterion = "0.5"
filetime = "0.2"
libc = "0.2"
proptest = "1.4"
quick-xml = "0.36"
regex = "1.10"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
tempfile = "3.10"
# Benchmark target `table_detection`. `harness = false` turns off the built-in
# libtest bench harness, so the bench source must supply its own `main`
# (presumably through Criterion, which is in [dev-dependencies]; confirm).
[[bench]]
name = "table_detection"
harness = false
# Benchmark target `wordlist`; same custom-harness setup as the target above.
[[bench]]
name = "wordlist"
harness = false
# Crates available to the build script only; they are separate from the
# library's own [dependencies].
# NOTE(review): phf_codegen suggests the build script generates `phf` tables;
# confirm against build.rs.
[build-dependencies]
phf_codegen = "0.11"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
sha2 = "0.10"