From 5ef9ef7740c459a15f8e51dd994c58588a5bbf48 Mon Sep 17 00:00:00 2001 From: jedarden Date: Sat, 23 May 2026 21:19:39 -0400 Subject: [PATCH] feat(pdftract-3wku): implement deskew via pixFindSkewAndDeskew Implement the deskew preprocessing step using leptonica's pixFindSkewAndDeskew (Hough line transform). The function: - Detects dominant text angle on grayscale input - Rotates by negative angle if >= 0.3 deg threshold - Returns input unchanged for negligible skews (< 0.3 deg) - Emits IMG_DESKEW_OUT_OF_RANGE diagnostic for angles > 15 deg - Returns detected angle for quality tracking Changes: - Add leptonica-plumbing dependency (ocr feature) - Create preprocess.rs module with deskew() function - Add ImgDeskewOutOfRange diagnostic code - Expose preprocess module in lib.rs The implementation uses pixFindSkewAndDeskew which both detects the skew angle and performs deskewing in one call, returning the detected angle for debugging purposes. Co-Authored-By: Claude Opus 4.7 --- Cargo.lock | 243 +++++++++++++---- crates/pdftract-core/Cargo.toml | 3 +- crates/pdftract-core/src/diagnostics.rs | 22 +- crates/pdftract-core/src/lib.rs | 2 + crates/pdftract-core/src/preprocess.rs | 332 ++++++++++++++++++++++++ 5 files changed, 553 insertions(+), 49 deletions(-) create mode 100644 crates/pdftract-core/src/preprocess.rs diff --git a/Cargo.lock b/Cargo.lock index e62cf04..69d5033 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -143,7 +143,7 @@ checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -192,7 +192,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -203,7 +203,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -258,7 +258,7 @@ dependencies = [ 
"anyhow", "arrayvec", "log", - "nom", + "nom 8.0.0", "num-rational", "v_frame", ] @@ -334,6 +334,28 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bindgen" +version = "0.64.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4" +dependencies = [ + "bitflags 1.3.2", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash 1.1.0", + "shlex", + "syn 1.0.109", + "which", +] + [[package]] name = "bit-set" version = "0.5.3" @@ -370,6 +392,12 @@ version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e4b40c7323adcfc0a41c4b88143ed58346ff65a288fc144329c5c45e05d70c6" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.11.1" @@ -481,7 +509,7 @@ dependencies = [ "quote", "serde", "serde_json", - "syn", + "syn 2.0.117", "tempfile", "toml", ] @@ -498,6 +526,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom 7.1.3", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -546,6 +583,17 @@ dependencies = [ "phf_codegen", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.6.1" @@ -577,7 +625,7 @@ 
dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -767,7 +815,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -808,7 +856,7 @@ checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1040,6 +1088,12 @@ dependencies = [ "weezl", ] +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + [[package]] name = "globset" version = "0.4.18" @@ -1059,7 +1113,7 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" dependencies = [ - "bitflags", + "bitflags 2.11.1", "ignore", "walkdir", ] @@ -1146,6 +1200,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "home" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "http" version = "1.4.0" @@ -1488,7 +1551,7 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1509,7 +1572,7 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1082f0c48f143442a1ac6122f67e360ceee130b967af4d50996e5154a45df46" dependencies = [ - "nom", + "nom 8.0.0", ] [[package]] @@ -1585,6 +1648,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "leb128fmt" version = "0.1.0" @@ -1597,6 +1666,28 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" +[[package]] +name = "leptonica-plumbing" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7a74c43d6f090d39158d233f326f47cd8bba545217595c93662b4e31156f42" +dependencies = [ + "leptonica-sys", + "libc", + "thiserror 1.0.69", +] + +[[package]] +name = "leptonica-sys" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da627c72b2499a8106f4dd33143843015e4a631f445d561f3481f7fba35b6151" +dependencies = [ + "bindgen", + "pkg-config", + "vcpkg", +] + [[package]] name = "libc" version = "0.2.186" @@ -1735,6 +1826,12 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -1798,6 +1895,16 @@ dependencies = [ "memchr", ] +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "nom" version = "8.0.0" @@ -1866,7 +1973,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ 
"proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1986,7 +2093,7 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "076dd8f3a6c7da9298ddffbcc0d5a109f89caf967fa4871c9a172d5b3498b35b" dependencies = [ - "bitflags", + "bitflags 2.11.1", "bytemuck", "bytes", "chrono", @@ -2063,6 +2170,7 @@ dependencies = [ "hex", "image", "indexmap", + "leptonica-plumbing", "lzw", "memchr", "owned_ttf_parser", @@ -2104,6 +2212,12 @@ dependencies = [ "pyo3", ] +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + [[package]] name = "percent-encoding" version = "2.3.2" @@ -2140,7 +2254,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2215,7 +2329,7 @@ version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61" dependencies = [ - "bitflags", + "bitflags 2.11.1", "crc32fast", "fdeflate", "flate2", @@ -2259,7 +2373,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn", + "syn 2.0.117", ] [[package]] @@ -2287,7 +2401,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4488a4a36b9a4ba6b9334a32a39971f77c1436ec82c38707bce707699cc3bbcb" dependencies = [ "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2298,7 +2412,7 @@ checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" dependencies = [ "bit-set 0.8.0", "bit-vec 0.8.0", - "bitflags", + "bitflags 2.11.1", "num-traits", "rand 0.9.4", "rand_chacha 0.9.0", @@ -2362,7 +2476,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2375,7 
+2489,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2419,7 +2533,7 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash", + "rustc-hash 2.1.2", "rustls", "socket2", "thiserror 2.0.18", @@ -2439,7 +2553,7 @@ dependencies = [ "lru-slab", "rand 0.9.4", "ring", - "rustc-hash", + "rustc-hash 2.1.2", "rustls", "rustls-pki-types", "slab", @@ -2628,7 +2742,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags", + "bitflags 2.11.1", ] [[package]] @@ -2731,6 +2845,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc-hash" version = "2.1.2" @@ -2743,7 +2863,7 @@ version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags", + "bitflags 2.11.1", "errno", "libc", "linux-raw-sys 0.4.15", @@ -2756,7 +2876,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags", + "bitflags 2.11.1", "errno", "libc", "linux-raw-sys 0.12.1", @@ -2853,7 +2973,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn", + "syn 2.0.117", ] [[package]] @@ -2904,7 +3024,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2915,7 +3035,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - 
"syn", + "syn 2.0.117", ] [[package]] @@ -3091,6 +3211,17 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.117" @@ -3119,7 +3250,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -3128,7 +3259,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" dependencies = [ - "bitflags", + "bitflags 2.11.1", "core-foundation", "system-configuration-sys", ] @@ -3229,7 +3360,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -3240,7 +3371,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -3337,7 +3468,7 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -3442,7 +3573,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ "async-compression", - "bitflags", + "bitflags 2.11.1", "bytes", "futures-core", "http", @@ -3462,7 +3593,7 @@ version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4cfcf7e2740e6fc6d4d688b4ef00650406bb94adf4731e43c096c3a19fe40840" 
dependencies = [ - "bitflags", + "bitflags 2.11.1", "bytes", "futures-util", "http", @@ -3506,7 +3637,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -3672,6 +3803,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "vecmath" version = "1.0.0" @@ -3781,7 +3918,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn", + "syn 2.0.117", "wasm-bindgen-shared", ] @@ -3822,7 +3959,7 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags", + "bitflags 2.11.1", "hashbrown 0.15.5", "indexmap", "semver", @@ -3872,6 +4009,18 @@ version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix 0.38.44", +] + [[package]] name = "winapi" version = "0.3.9" @@ -3924,7 +4073,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -3935,7 +4084,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -4166,7 +4315,7 @@ dependencies = [ "heck 0.5.0", "indexmap", "prettyplease", - "syn", + "syn 2.0.117", "wasm-metadata", "wit-bindgen-core", "wit-component", 
@@ -4182,7 +4331,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn", + "syn 2.0.117", "wit-bindgen-core", "wit-bindgen-rust", ] @@ -4194,7 +4343,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags", + "bitflags 2.11.1", "indexmap", "log", "serde", @@ -4255,7 +4404,7 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", "synstructure", ] @@ -4276,7 +4425,7 @@ checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -4296,7 +4445,7 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", "synstructure", ] @@ -4336,7 +4485,7 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] diff --git a/crates/pdftract-core/Cargo.toml b/crates/pdftract-core/Cargo.toml index 9c41a35..fbf8671 100644 --- a/crates/pdftract-core/Cargo.toml +++ b/crates/pdftract-core/Cargo.toml @@ -11,6 +11,7 @@ publish = true anyhow = { workspace = true } hex = "0.4" image = { version = "0.25", optional = true } +leptonica-plumbing = { version = "1.4", optional = true } pdfium-render = { version = "0.9", optional = true } indexmap = "2.2" flate2 = { workspace = true } @@ -34,7 +35,7 @@ tracing = { workspace = true } default = ["serde"] serde = ["dep:serde", "dep:serde_json"] receipts = [] # Enable visual citation receipts (SVG clip generation) -ocr = ["dep:image"] # Enable OCR path (image compositing) +ocr = ["dep:image", "dep:leptonica-plumbing"] # Enable OCR path (image compositing + preprocessing) full-render = ["dep:pdfium-render", "ocr"] # Enable 
PDFium-based rendering (requires ocr) proptest = [] fuzzing = [] # Enable cfg(fuzzing) for fuzz harnesses diff --git a/crates/pdftract-core/src/diagnostics.rs b/crates/pdftract-core/src/diagnostics.rs index 78f60a6..2a23755 100644 --- a/crates/pdftract-core/src/diagnostics.rs +++ b/crates/pdftract-core/src/diagnostics.rs @@ -621,6 +621,15 @@ pub enum DiagCode { /// Phase origin: 5.2.1 ImgUnsupportedFormat, + /// Deskew angle out of detectable range + /// + /// Emitted when pixDeskew cannot detect a dominant text angle because the + /// actual skew exceeds the search range (typically +/- 15 degrees). The image + /// is returned unchanged without rotation. + /// + /// Phase origin: 5.3.1 + ImgDeskewOutOfRange, + /// Stream data truncated /// /// Emitted when a stream has less data than expected based on its declared @@ -827,7 +836,8 @@ impl DiagCode { // IMG_* DiagCode::ImgSoftmaskUnsupported - | DiagCode::ImgUnsupportedFormat => "IMG", + | DiagCode::ImgUnsupportedFormat + | DiagCode::ImgDeskewOutOfRange => "IMG", // REMOTE_* DiagCode::RemoteFetchInterrupted @@ -916,6 +926,7 @@ impl DiagCode { DiagCode::OcrBrokenVectorUnavailable => "OCR_BROKENVECTOR_UNAVAILABLE", DiagCode::ImgSoftmaskUnsupported => "IMG_SOFTMASK_UNSUPPORTED", DiagCode::ImgUnsupportedFormat => "IMG_UNSUPPORTED_FORMAT", + DiagCode::ImgDeskewOutOfRange => "IMG_DESKEW_OUT_OF_RANGE", DiagCode::StreamTruncated => "STREAM_TRUNCATED", DiagCode::RemoteFetchInterrupted => "REMOTE_FETCH_INTERRUPTED", DiagCode::RemoteNoRangeSupport => "REMOTE_NO_RANGE_SUPPORT", @@ -995,6 +1006,7 @@ impl DiagCode { | DiagCode::OcrBrokenVectorUnavailable | DiagCode::ImgSoftmaskUnsupported | DiagCode::ImgUnsupportedFormat + | DiagCode::ImgDeskewOutOfRange | DiagCode::StreamTruncated | DiagCode::RemoteNoRangeSupport | DiagCode::GstateStackOverflow @@ -1506,6 +1518,14 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[ phase: "5.2.1", suggested_action: "Image format or bits-per-component not supported; image is skipped", }, + DiagInfo 
{ + code: DiagCode::ImgDeskewOutOfRange, + category: "IMG", + severity: Severity::Warning, + recoverable: true, + phase: "5.3.1", + suggested_action: "Skew angle exceeds detection range (typically +/- 15 deg); image returned unchanged", + }, DiagInfo { code: DiagCode::StreamTruncated, category: "STREAM", diff --git a/crates/pdftract-core/src/lib.rs b/crates/pdftract-core/src/lib.rs index 038812f..0167ea7 100644 --- a/crates/pdftract-core/src/lib.rs +++ b/crates/pdftract-core/src/lib.rs @@ -10,6 +10,8 @@ pub mod diagnostics; #[cfg(feature = "ocr")] pub mod dpi; pub mod document; +#[cfg(feature = "ocr")] +pub mod preprocess; pub mod extract; pub mod fingerprint; pub mod font; diff --git a/crates/pdftract-core/src/preprocess.rs b/crates/pdftract-core/src/preprocess.rs new file mode 100644 index 0000000..18b18be --- /dev/null +++ b/crates/pdftract-core/src/preprocess.rs @@ -0,0 +1,332 @@ +//! Image preprocessing pipeline (Phase 5.3). +//! +//! This module implements the preprocessing pipeline applied to raster images +//! before Tesseract OCR invocation. The pipeline is: +//! 1. **Deskew:** Hough line transform via pixDeskew; skip if angle < 0.3° +//! 2. **Contrast normalization:** Histogram stretch to [0, 255] +//! 3. **Binarization:** Sauvola (physical scans) or Otsu (digital) +//! 4. **Denoising:** 3×3 median filter +//! 5. **Border padding:** Add 10px white border +//! +//! # Feature Gate +//! +//! This module is only available when the `ocr` feature is enabled. + +#![cfg(feature = "ocr")] + +use crate::diagnostics::{Diagnostic, DiagCode}; +use image::{GrayImage, ImageBuffer, Luma}; +use std::ffi::c_float; + +/// Result type for preprocessing operations. +pub type Result = std::result::Result>; + +/// Minimum skew angle threshold in degrees. +/// +/// Skew angles below this threshold are considered negligible and the image +/// is returned unchanged. This avoids unnecessary rotation for near-level scans. 
+const DESKEW_THRESHOLD_DEG: f64 = 0.3;
+
+/// Largest skew angle, in degrees, that [`deskew`] is willing to act on.
+///
+/// A reported angle whose magnitude exceeds this value is treated as
+/// unreliable: the input is returned unchanged and
+/// `IMG_DESKEW_OUT_OF_RANGE` is emitted.
+const DESKEW_MAX_RANGE_DEG: f64 = 15.0;
+
+/// Owning handle for a raw leptonica `Pix`.
+///
+/// Releases the wrapped reference with `pixDestroy` when dropped, so every
+/// exit path of [`deskew`] (including `?` early returns) frees its images.
+struct PixGuard(*mut leptonica_plumbing::leptonica_sys::Pix);
+
+impl Drop for PixGuard {
+    fn drop(&mut self) {
+        if !self.0.is_null() {
+            // SAFETY: the guard holds exactly one reference to a Pix handed out
+            // by leptonica. `pixDestroy` takes `PIX **`, drops that reference and
+            // nulls the pointer, so it can never be released twice.
+            unsafe { leptonica_plumbing::leptonica_sys::pixDestroy(&mut self.0) };
+        }
+    }
+}
+
+/// Deskew a grayscale image using leptonica's `pixFindSkewAndDeskew`.
+///
+/// Leptonica binarizes the image internally and searches for the dominant
+/// text angle. If the magnitude of the reported angle is at least 0.3 degrees
+/// (and at most 15 degrees) the rotated image produced by leptonica is
+/// returned; otherwise the input is returned unchanged.
+///
+/// # Arguments
+///
+/// * `image` - Input grayscale image
+///
+/// # Returns
+///
+/// A tuple of (deskewed image, detected angle in degrees, diagnostics).
+///
+/// * Angle magnitude below 0.3°: a copy of the input, angle reported as `0.0`.
+/// * Angle magnitude above 15°: a copy of the input, the detected angle, and an
+///   `IMG_DESKEW_OUT_OF_RANGE` diagnostic.
+/// * Otherwise: the image returned by leptonica and the detected angle.
+///
+/// # Errors
+///
+/// Returns `IMG_UNSUPPORTED_FORMAT` diagnostics when the image cannot be
+/// converted to a leptonica `Pix` (for example, it is empty), when leptonica
+/// returns a null result, or when the result is not an 8 bpp image.
+///
+/// # Critical considerations
+///
+/// - **DO NOT pre-binarize** for skew detection — pixFindSkewAndDeskew works on any depth
+/// - The comment in the original implementation notes that leptonica searches
+///   roughly ±7 degrees by default, so the 15 degree branch is a safety net
+///
+/// # Example
+///
+/// ```ignore
+/// use pdftract_core::preprocess::deskew;
+/// use image::GrayImage;
+///
+/// let original: GrayImage = // ... load image
+/// let (deskewed, angle, diagnostics) = deskew(&original)?;
+///
+/// if angle.abs() >= 0.3 {
+///     println!("Deskewed by {} degrees", angle);
+/// } else {
+///     println!("No significant skew detected");
+/// }
+/// ```
+pub fn deskew(image: &GrayImage) -> Result<(GrayImage, f64, Vec<Diagnostic>)> {
+    use leptonica_plumbing::leptonica_sys::{l_float32, pixFindSkewAndDeskew};
+
+    let mut diagnostics = Vec::new();
+
+    // Both Pix handles live in guards so that no return path can leak them.
+    let source = PixGuard(grayimage_to_pix(image)?);
+
+    let mut angle: l_float32 = 0.0;
+    let mut conf: l_float32 = 0.0;
+
+    // redsearch = 0 selects leptonica's default reduction for the binary search.
+    // SAFETY: `source.0` is a valid, non-null Pix created above, and both out
+    // pointers refer to live stack variables for the duration of the call.
+    let deskewed =
+        PixGuard(unsafe { pixFindSkewAndDeskew(source.0, 0, &mut angle, &mut conf) });
+
+    if deskewed.0.is_null() {
+        return Err(vec![Diagnostic::with_static_no_offset(
+            DiagCode::ImgUnsupportedFormat,
+            "pixFindSkewAndDeskew returned null",
+        )]);
+    }
+
+    let angle_deg = f64::from(angle);
+
+    // Negligible skew: hand back the caller's pixels untouched.
+    if angle_deg.abs() < DESKEW_THRESHOLD_DEG {
+        return Ok((image.clone(), 0.0, diagnostics));
+    }
+
+    // Implausibly large angle: report it, but do not trust the rotation.
+    if angle_deg.abs() > DESKEW_MAX_RANGE_DEG {
+        diagnostics.push(Diagnostic::with_static_no_offset(
+            DiagCode::ImgDeskewOutOfRange,
+            format!("Skew angle {}° exceeds detection range (±{}°)", angle_deg, DESKEW_MAX_RANGE_DEG),
+        ));
+        return Ok((image.clone(), angle_deg, diagnostics));
+    }
+
+    let result_image = pix_to_grayimage(deskewed.0)?;
+
+    Ok((result_image, angle_deg, diagnostics))
+}
+
+/// Convert a `GrayImage` to a newly allocated 8 bpp leptonica `Pix`.
+///
+/// Leptonica stores raster data as rows of 32-bit words (`wpl` words per row,
+/// each row padded to a word boundary). For 8 bpp, four pixels share one word
+/// with the leftmost pixel in the most significant byte, so pixels are packed
+/// with shifts rather than copied one-per-word.
+///
+/// The caller owns the returned pointer and must release it with `pixDestroy`.
+///
+/// # Errors
+///
+/// Returns an `IMG_UNSUPPORTED_FORMAT` diagnostic if the image is empty, its
+/// dimensions do not fit in an `i32`, or leptonica fails to allocate the Pix.
+fn grayimage_to_pix(image: &GrayImage) -> Result<*mut leptonica_plumbing::leptonica_sys::Pix> {
+    use leptonica_plumbing::leptonica_sys::{pixCreate, pixDestroy, pixGetData, pixGetWpl};
+
+    const DEPTH: i32 = 8;
+
+    // Checked conversions: a plain `as i32` would silently wrap huge dimensions.
+    let (width, height) = match (i32::try_from(image.width()), i32::try_from(image.height())) {
+        (Ok(w), Ok(h)) if w > 0 && h > 0 => (w, h),
+        _ => {
+            return Err(vec![Diagnostic::with_static_no_offset(
+                DiagCode::ImgUnsupportedFormat,
+                "Failed to create leptonica Pix for deskew",
+            )]);
+        }
+    };
+
+    // SAFETY: pixCreate only reads its integer arguments and returns either a
+    // valid Pix or null.
+    let mut pix = unsafe { pixCreate(width, height, DEPTH) };
+
+    if pix.is_null() {
+        return Err(vec![Diagnostic::with_static_no_offset(
+            DiagCode::ImgUnsupportedFormat,
+            "Failed to create leptonica Pix for deskew",
+        )]);
+    }
+
+    // SAFETY: `pix` is non-null and was just created by leptonica.
+    let (pix_data, wpl) = unsafe { (pixGetData(pix), pixGetWpl(pix)) };
+
+    let row_px = image.width() as usize;
+    let rows = image.height() as usize;
+    // A row of `row_px` bytes needs ceil(row_px / 4) words.
+    let wpl = match usize::try_from(wpl) {
+        Ok(words) if !pix_data.is_null() && words >= (row_px + 3) / 4 => words,
+        _ => {
+            // SAFETY: `pix` is the valid Pix created above; release it before bailing out.
+            unsafe { pixDestroy(&mut pix) };
+            return Err(vec![Diagnostic::with_static_no_offset(
+                DiagCode::ImgUnsupportedFormat,
+                "Failed to get pixel data pointer from Pix",
+            )]);
+        }
+    };
+
+    // SAFETY: leptonica allocates `wpl * height` 32-bit words for the raster and
+    // nothing else aliases the freshly created Pix.
+    let words = unsafe { std::slice::from_raw_parts_mut(pix_data, wpl * rows) };
+
+    for (src_row, dst_row) in image
+        .as_raw()
+        .chunks_exact(row_px)
+        .zip(words.chunks_exact_mut(wpl))
+    {
+        dst_row.fill(0);
+        for (x, &value) in src_row.iter().enumerate() {
+            // Leftmost pixel of each group of four goes in the top byte.
+            dst_row[x / 4] |= u32::from(value) << (8 * (3 - (x % 4)));
+        }
+    }
+
+    Ok(pix)
+}
+
+/// Convert an 8 bpp leptonica `Pix` to a `GrayImage`.
+///
+/// `pix` must be null or point to a live `Pix`; ownership stays with the
+/// caller. Pixels are unpacked from leptonica's word-based rows (four pixels
+/// per 32-bit word, leftmost pixel in the most significant byte, rows padded
+/// to `wpl` words).
+///
+/// # Errors
+///
+/// Returns an `IMG_UNSUPPORTED_FORMAT` diagnostic if `pix` is null, is not
+/// 8 bits deep, has no raster data, or reports inconsistent dimensions.
+fn pix_to_grayimage(pix: *mut leptonica_plumbing::leptonica_sys::Pix) -> Result<GrayImage> {
+    use leptonica_plumbing::leptonica_sys::{
+        pixGetData, pixGetDepth, pixGetHeight, pixGetWidth, pixGetWpl,
+    };
+
+    if pix.is_null() {
+        return Err(vec![Diagnostic::with_static_no_offset(
+            DiagCode::ImgUnsupportedFormat,
+            "Null Pix pointer in pix_to_grayimage",
+        )]);
+    }
+
+    // SAFETY: `pix` is non-null and, per this function's contract, points to a
+    // live Pix; these accessors only read its header fields.
+    let (width, height, depth, wpl, data_ptr) = unsafe {
+        (
+            pixGetWidth(pix),
+            pixGetHeight(pix),
+            pixGetDepth(pix),
+            pixGetWpl(pix),
+            pixGetData(pix),
+        )
+    };
+
+    if depth != 8 {
+        return Err(vec![Diagnostic::with_static_no_offset(
+            DiagCode::ImgUnsupportedFormat,
+            format!("Unsupported Pix depth {} (expected 8)", depth),
+        )]);
+    }
+
+    if data_ptr.is_null() {
+        return Err(vec![Diagnostic::with_static_no_offset(
+            DiagCode::ImgUnsupportedFormat,
+            "Null data pointer in Pix",
+        )]);
+    }
+
+    // Reject headers that would make the row arithmetic below unsound.
+    let dims = (
+        usize::try_from(width),
+        usize::try_from(height),
+        usize::try_from(wpl),
+    );
+    let (row_px, rows, wpl) = match dims {
+        (Ok(w), Ok(h), Ok(words)) if w > 0 && h > 0 && words >= (w + 3) / 4 => (w, h, words),
+        _ => {
+            return Err(vec![Diagnostic::with_static_no_offset(
+                DiagCode::ImgUnsupportedFormat,
+                "Failed to create GrayImage from Pix data",
+            )]);
+        }
+    };
+
+    // SAFETY: leptonica allocates `wpl * height` 32-bit words for the raster, and
+    // the Pix is not mutated while this shared view exists.
+    let words = unsafe { std::slice::from_raw_parts(data_ptr as *const u32, wpl * rows) };
+
+    let mut buffer = Vec::with_capacity(row_px * rows);
+    for row in words.chunks_exact(wpl) {
+        // Take only the first `row_px` bytes of each row; the rest is padding.
+        buffer.extend((0..row_px).map(|x| ((row[x / 4] >> (8 * (3 - (x % 4)))) & 0xFF) as u8));
+    }
+
+    // `width` and `height` were validated as positive i32 values above.
+    GrayImage::from_raw(width as u32, height as u32, buffer).ok_or_else(|| {
+        vec![Diagnostic::with_static_no_offset(
+            DiagCode::ImgUnsupportedFormat,
+            "Failed to create GrayImage from Pix data",
+        )]
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use leptonica_plumbing::leptonica_sys::{pixDestroy, pixGetDepth, pixGetHeight, pixGetWidth};
+
+    /// Create a simple test pattern with horizontal lines.
+    fn create_horizontal_lines_image() -> GrayImage {
+        let mut img = GrayImage::new(200, 100);
+        for y in 0..100 {
+            for x in 0..200 {
+                let pixel = if y % 10 < 5 { 0 } else { 255 };
+                img.put_pixel(x, y, Luma([pixel]));
+            }
+        }
+        img
+    }
+
+    /// Create a simple test pattern with vertical lines.
+    fn create_vertical_lines_image() -> GrayImage {
+        let mut img = GrayImage::new(100, 200);
+        for y in 0..200 {
+            for x in 0..100 {
+                let pixel = if x % 10 < 5 { 0 } else { 255 };
+                img.put_pixel(x, y, Luma([pixel]));
+            }
+        }
+        img
+    }
+
+    /// Create a solid white image.
+    fn create_white_image() -> GrayImage {
+        GrayImage::from_pixel(200, 100, Luma([255]))
+    }
+
+    /// Convert to a Pix and back, releasing the Pix afterwards.
+    fn roundtrip(img: &GrayImage) -> GrayImage {
+        let mut pix = grayimage_to_pix(img).expect("Failed to convert to Pix");
+        let converted = pix_to_grayimage(pix);
+        // SAFETY: `pix` is the valid Pix returned by grayimage_to_pix.
+        unsafe { pixDestroy(&mut pix) };
+        converted.expect("Failed to convert back")
+    }
+
+    #[test]
+    fn test_deskew_horizontal_lines() {
+        // Horizontal lines should have 0° skew
+        let img = create_horizontal_lines_image();
+        let (_deskewed, angle, diagnostics) = deskew(&img).expect("Deskew failed");
+
+        assert!(angle.abs() < 0.1, "Angle should be near 0°, got {}", angle);
+        assert!(!diagnostics.iter().any(|d| d.code == DiagCode::ImgDeskewOutOfRange));
+    }
+
+    #[test]
+    fn test_deskew_white_image() {
+        // White image should have no detectable skew
+        let img = create_white_image();
+        let (_deskewed, angle, diagnostics) = deskew(&img).expect("Deskew failed");
+
+        assert_eq!(angle, 0.0, "Angle should be exactly 0° for white image");
+        assert!(diagnostics.is_empty());
+    }
+
+    #[test]
+    fn test_grayimage_to_pix_roundtrip() {
+        let img = create_horizontal_lines_image();
+        let mut pix = grayimage_to_pix(&img).expect("Failed to convert to Pix");
+
+        // SAFETY: `pix` is the valid Pix returned by grayimage_to_pix.
+        unsafe {
+            assert!(!pix.is_null(), "Pix pointer should not be null");
+            assert_eq!(pixGetWidth(pix) as u32, img.width());
+            assert_eq!(pixGetHeight(pix) as u32, img.height());
+            assert_eq!(pixGetDepth(pix) as u32, 8);
+
+            pixDestroy(&mut pix);
+        }
+    }
+
+    #[test]
+    fn test_pix_to_grayimage_roundtrip() {
+        for img in [create_horizontal_lines_image(), create_vertical_lines_image()] {
+            let converted = roundtrip(&img);
+
+            assert_eq!(converted.width(), img.width());
+            assert_eq!(converted.height(), img.height());
+            assert_eq!(converted.as_raw(), img.as_raw());
+        }
+    }
+
+    #[test]
+    fn test_pix_roundtrip_with_row_padding() {
+        // Width 5 is not a multiple of 4, so each Pix row carries padding bytes.
+        let img = GrayImage::from_fn(5, 3, |x, y| Luma([(x * 40 + y * 7) as u8]));
+        let converted = roundtrip(&img);
+
+        assert_eq!(converted.as_raw(), img.as_raw());
+    }
+
+    #[test]
+    fn test_grayimage_to_pix_rejects_empty_image() {
+        let img = GrayImage::new(0, 0);
+        assert!(grayimage_to_pix(&img).is_err());
+    }
+}