From 367a0f129ec523508e4f8c947a239388cb216b34 Mon Sep 17 00:00:00 2001 From: jedarden Date: Sat, 23 May 2026 16:17:58 -0400 Subject: [PATCH] feat(pdftract-4my): implement pdfium-render path behind full-render feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements Phase 5.2.2: pdfium-render rendering path gated behind the full-render Cargo feature, providing accurate rendering for complex PDFs with overlapping images, image masks, soft masks, blend modes, and other geometry the direct-compositing path cannot handle. Changes: - Add pdfium-render dependency gated under full-render feature - Implement pdfium_path.rs module with thread-local PDFium instance - Add render_page_via_pdfium() function for high-fidelity page rendering - Add has_full_render() runtime detection helper - Add ExtractionOptions.full_render field for runtime selection - Re-export has_full_render from pdftract-core lib Acceptance Criteria: - ✅ cargo build --features ocr,serve,full-render produces binary - ✅ cargo build --features ocr,serve does NOT pull in pdfium - ✅ Runtime fallback: full_render=true without feature -> direct compositing - ⚠️ Soft-mask fixtures: no fixtures added (testing infrastructure) - ⚠️ Binary size CI gate: no CI infrastructure (infra task) Refs: - Plan section: Phase 5.2 full-render feature (line 1854) - Bead: pdftract-4my --- Cargo.lock | 618 +++++++++++++++++- crates/pdftract-core/Cargo.toml | 3 + crates/pdftract-core/src/lib.rs | 4 + crates/pdftract-core/src/options.rs | 19 + crates/pdftract-core/src/render.rs | 4 + .../pdftract-core/src/render/pdfium_path.rs | 327 +++++++++ 6 files changed, 974 insertions(+), 1 deletion(-) create mode 100644 crates/pdftract-core/src/render/pdfium_path.rs diff --git a/Cargo.lock b/Cargo.lock index 4db7a92..e62cf04 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -31,6 +31,24 @@ dependencies = [ "memchr", ] +[[package]] +name = "aligned" +version = "0.4.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4508988c62edf04abd8d92897fca0c2995d907ce1dfeaf369dac3716a40685" +dependencies = [ + "as-slice", +] + +[[package]] +name = "aligned-vec" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc890384c8602f339876ded803c97ad529f3842aba97f6392b3dba0dd171769b" +dependencies = [ + "equator", +] + [[package]] name = "alloc-no-stdlib" version = "2.0.4" @@ -111,6 +129,38 @@ version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +[[package]] +name = "arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" + +[[package]] +name = "arg_enum_proc_macro" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "as-slice" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "516b6b4f0e40d50dcda9365d53964ec74560ad4284da2e7fc97122cd83174516" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "async-compression" version = "0.4.42" @@ -179,6 +229,49 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "av-scenechange" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f321d77c20e19b92c39e7471cf986812cbb46659d2af674adc4331ef3f18394" 
+dependencies = [ + "aligned", + "anyhow", + "arg_enum_proc_macro", + "arrayvec", + "log", + "num-rational", + "num-traits", + "pastey", + "rayon", + "thiserror 2.0.18", + "v_frame", + "y4m", +] + +[[package]] +name = "av1-grain" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cfddb07216410377231960af4fcab838eaa12e013417781b78bd95ee22077f8" +dependencies = [ + "anyhow", + "arrayvec", + "log", + "nom", + "num-rational", + "v_frame", +] + +[[package]] +name = "avif-serialize" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7178fe5f7d460b13895ebb9dcb28a3a6216d2df2574a0806cb51b555d297f38" +dependencies = [ + "arrayvec", +] + [[package]] name = "axum" version = "0.7.9" @@ -271,12 +364,27 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" +[[package]] +name = "bit_field" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e4b40c7323adcfc0a41c4b88143ed58346ff65a288fc144329c5c45e05d70c6" + [[package]] name = "bitflags" version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +[[package]] +name = "bitstream-io" +version = "4.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eff00be299a18769011411c9def0d827e8f2d7bf0c3dbf53633147a8867fd1f" +dependencies = [ + "no_std_io2", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -317,6 +425,12 @@ dependencies = [ "serde", ] +[[package]] +name = "built" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c0e531d93d39c34eef561e929e8a7f86d77a5af08aac4f6d6e39976c51858e9" + [[package]] name = "bumpalo" version = "3.20.2" @@ -329,12 +443,24 @@ version = "0.6.9" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + [[package]] name = "byteorder" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "byteorder-lite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" + [[package]] name = "bytes" version = "1.11.1" @@ -460,6 +586,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + [[package]] name = "colorchoice" version = "1.0.5" @@ -486,6 +618,26 @@ version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc14f565cf027a105f7a44ccf9e5b424348421a1d8952a8fc9d499d313107789" +[[package]] +name = "console_error_panic_hook" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc" +dependencies = [ + "cfg-if", + "wasm-bindgen", +] + +[[package]] +name = "console_log" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be8aed40e4edbf4d3b4431ab260b63fdc40f5780a4766824329ea0f1eefe3c0f" +dependencies = [ + "log", + "web-sys", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -545,6 +697,12 @@ version = "0.8.21" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + [[package]] name = "crypto-common" version = "0.1.7" @@ -633,6 +791,26 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "equator" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -649,6 +827,21 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "exr" +version = "1.74.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4300e043a56aa2cb633c01af81ca8f699a321879a7854d3896a0ba89056363be" +dependencies = [ + "bit_field", + "half", + "lebe", + "miniz_oxide", + "rayon-core", + "smallvec", + "zune-inflate", +] + [[package]] name = "fancy-regex" version = "0.13.0" @@ -666,6 +859,21 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" +[[package]] +name = "fax" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caf1079563223d5d59d83c85886a56e586cfd5c1a26292e971a0fa266531ac5a" + +[[package]] +name = "fdeflate" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" 
+dependencies = [ + "simd-adler32", +] + [[package]] name = "filetime" version = "0.2.29" @@ -822,6 +1030,16 @@ dependencies = [ "wasip3", ] +[[package]] +name = "gif" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee8cfcc411d9adbbaba82fb72661cc1bcca13e8bba98b364e62b2dba8f960159" +dependencies = [ + "color_quant", + "weezl", +] + [[package]] name = "globset" version = "0.4.18" @@ -865,6 +1083,17 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + [[package]] name = "hashbrown" version = "0.15.5" @@ -1190,6 +1419,46 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "image" +version = "0.25.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85ab80394333c02fe689eaf900ab500fbd0c2213da414687ebf995a65d5a6104" +dependencies = [ + "bytemuck", + "byteorder-lite", + "color_quant", + "exr", + "gif", + "image-webp", + "moxcms", + "num-traits", + "png", + "qoi", + "ravif", + "rayon", + "rgb", + "tiff", + "zune-core", + "zune-jpeg", +] + +[[package]] +name = "image-webp" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3" +dependencies = [ + "byteorder-lite", + "quick-error 2.0.1", +] + +[[package]] +name = "imgref" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40fac9d56ed6437b198fddba683305e8e2d651aa42647f00f5ae542e7f5c94a2" + [[package]] name = "indexmap" version = "2.14.0" @@ -1211,6 +1480,17 @@ dependencies = [ "rustversion", ] +[[package]] +name = "interpolate_name" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "ipnet" version = "2.12.0" @@ -1232,6 +1512,15 @@ dependencies = [ "nom", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.18" @@ -1302,12 +1591,28 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" +[[package]] +name = "lebe" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" + [[package]] name = "libc" version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" +[[package]] +name = "libfuzzer-sys" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f12a681b7dd8ce12bff52488013ba614b869148d54dd79836ab85aafdd53f08d" +dependencies = [ + "arbitrary", + "cc", +] + [[package]] name = "libloading" version = "0.8.9" @@ -1366,6 +1671,15 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "loop9" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fae87c125b03c1d2c0150c90365d7d6bcc53fb73a9acaef207d2d065860f062" +dependencies = [ + "imgref", +] + [[package]] name = "lru-slab" version = "0.1.2" @@ -1384,6 +1698,22 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" +[[package]] +name = "maybe-owned" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4" + +[[package]] +name = "maybe-rayon" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" +dependencies = [ + "cfg-if", + "rayon", +] + [[package]] name = "memchr" version = "2.8.0" @@ -1426,6 +1756,16 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "moxcms" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb85c154ba489f01b25c0d36ae69a87e4a1c73a72631fc6c0eb6dde34a73e44b" +dependencies = [ + "num-traits", + "pxfm", +] + [[package]] name = "multer" version = "3.1.0" @@ -1443,6 +1783,21 @@ dependencies = [ "version_check", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "no_std_io2" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418abd1b6d34fbf6cae440dc874771b0525a604428704c76e48b29a5e67b8003" +dependencies = [ + "memchr", +] + [[package]] name = "nom" version = "8.0.0" @@ -1452,6 +1807,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "noop_proc_macro" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" + [[package]] name = "num" version = "0.4.3" @@ -1497,6 +1858,17 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "521739c6d2bac4aa25192232afe6841231376b2b26d4d9fae5ecf8ca5772e441" +[[package]] +name = "num-derive" +version = "0.4.2" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "num-integer" version = "0.1.46" @@ -1596,6 +1968,44 @@ dependencies = [ "regex", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pastey" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" + +[[package]] +name = "pdfium-render" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "076dd8f3a6c7da9298ddffbcc0d5a109f89caf967fa4871c9a172d5b3498b35b" +dependencies = [ + "bitflags", + "bytemuck", + "bytes", + "chrono", + "console_error_panic_hook", + "console_log", + "image", + "itertools", + "js-sys", + "libloading", + "log", + "maybe-owned", + "once_cell", + "utf16string", + "vecmath", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "pdftract-cli" version = "0.1.0" @@ -1651,10 +2061,12 @@ dependencies = [ "filetime", "flate2", "hex", + "image", "indexmap", "lzw", "memchr", "owned_ttf_parser", + "pdfium-render", "phf", "phf_codegen", "proptest", @@ -1667,6 +2079,7 @@ dependencies = [ "sha2", "tempfile", "thiserror 1.0.69", + "tracing", "ttf-parser 0.24.1", "unicode-normalization", "zstd", @@ -1784,12 +2197,31 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" +[[package]] +name = "piston-float" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad78bf43dcf80e8f950c92b84f938a0fc7590b7f6866fbcbeca781609c115590" + [[package]] name = 
"pkg-config" version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" +[[package]] +name = "png" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61" +dependencies = [ + "bitflags", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + [[package]] name = "portable-atomic" version = "1.13.1" @@ -1839,6 +2271,25 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "profiling" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d595e54a326bc53c1c197b32d295e14b169e3cfeaa8dc82b529f947fba6bcf5" +dependencies = [ + "profiling-procmacros", +] + +[[package]] +name = "profiling-procmacros" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4488a4a36b9a4ba6b9334a32a39971f77c1436ec82c38707bce707699cc3bbcb" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "proptest" version = "1.11.0" @@ -1858,6 +2309,12 @@ dependencies = [ "unarray", ] +[[package]] +name = "pxfm" +version = "0.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f" + [[package]] name = "pyo3" version = "0.20.3" @@ -1921,12 +2378,27 @@ dependencies = [ "syn", ] +[[package]] +name = "qoi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001" +dependencies = [ + "bytemuck", +] + [[package]] name = "quick-error" version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" +[[package]] +name = "quick-error" +version = "2.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + [[package]] name = "quick-xml" version = "0.36.2" @@ -2080,6 +2552,56 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rav1e" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43b6dd56e85d9483277cde964fd1bdb0428de4fec5ebba7540995639a21cb32b" +dependencies = [ + "aligned-vec", + "arbitrary", + "arg_enum_proc_macro", + "arrayvec", + "av-scenechange", + "av1-grain", + "bitstream-io", + "built", + "cfg-if", + "interpolate_name", + "itertools", + "libc", + "libfuzzer-sys", + "log", + "maybe-rayon", + "new_debug_unreachable", + "noop_proc_macro", + "num-derive", + "num-traits", + "paste", + "profiling", + "rand 0.9.4", + "rand_chacha 0.9.0", + "simd_helpers", + "thiserror 2.0.18", + "v_frame", + "wasm-bindgen", +] + +[[package]] +name = "ravif" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e52310197d971b0f5be7fe6b57530dcd27beb35c1b013f29d66c1ad73fbbcc45" +dependencies = [ + "avif-serialize", + "imgref", + "loop9", + "quick-error 2.0.1", + "rav1e", + "rayon", + "rgb", +] + [[package]] name = "rayon" version = "1.12.0" @@ -2189,6 +2711,12 @@ dependencies = [ "webpki-roots 1.0.7", ] +[[package]] +name = "rgb" +version = "0.8.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b34b781b31e5d73e9fbc8689c70551fd1ade9a19e3e28cfec8580a79290cc4" + [[package]] name = "ring" version = "0.17.14" @@ -2284,7 +2812,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" dependencies = [ "fnv", - "quick-error", + "quick-error 1.2.3", "tempfile", "wait-timeout", ] @@ -2481,6 +3009,15 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" +[[package]] +name = "simd_helpers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95890f873bec569a0362c235787f3aca6e1e887302ba4840839bcc6459c42da6" +dependencies = [ + "quote", +] + [[package]] name = "siphasher" version = "1.0.3" @@ -2706,6 +3243,20 @@ dependencies = [ "syn", ] +[[package]] +name = "tiff" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b63feaf3343d35b6ca4d50483f94843803b0f51634937cc2ec519fc32232bc52" +dependencies = [ + "fax", + "flate2", + "half", + "quick-error 2.0.1", + "weezl", + "zune-jpeg", +] + [[package]] name = "time" version = "0.3.47" @@ -3077,6 +3628,15 @@ dependencies = [ "serde", ] +[[package]] +name = "utf16string" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b62a1e85e12d5d712bf47a85f426b73d303e2d00a90de5f3004df3596e9d216" +dependencies = [ + "byteorder", +] + [[package]] name = "utf8_iter" version = "1.0.4" @@ -3101,6 +3661,26 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "v_frame" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "666b7727c8875d6ab5db9533418d7c764233ac9c0cff1d469aec8fa127597be2" +dependencies = [ + "aligned-vec", + "num-traits", + "wasm-bindgen", +] + +[[package]] +name = "vecmath" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "956ae1e0d85bca567dee1dcf87fb1ca2e792792f66f87dced8381f99cd91156a" +dependencies = [ + "piston-float", +] + [[package]] name = "version_check" version = "0.9.5" @@ -3286,6 +3866,12 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "weezl" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" + [[package]] name = "winapi" version = "0.3.9" @@ 
-3644,6 +4230,12 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" +[[package]] +name = "y4m" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5a4b21e1a62b67a2970e6831bc091d7b87e119e7f9791aef9702e3bef04448" + [[package]] name = "yoke" version = "0.8.2" @@ -3780,3 +4372,27 @@ dependencies = [ "cc", "pkg-config", ] + +[[package]] +name = "zune-core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" + +[[package]] +name = "zune-inflate" +version = "0.2.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "zune-jpeg" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27bc9d5b815bc103f142aa054f561d9187d191692ec7c2d1e2b4737f8dbd7296" +dependencies = [ + "zune-core", +] diff --git a/crates/pdftract-core/Cargo.toml b/crates/pdftract-core/Cargo.toml index 30ac581..9c41a35 100644 --- a/crates/pdftract-core/Cargo.toml +++ b/crates/pdftract-core/Cargo.toml @@ -11,6 +11,7 @@ publish = true anyhow = { workspace = true } hex = "0.4" image = { version = "0.25", optional = true } +pdfium-render = { version = "0.9", optional = true } indexmap = "2.2" flate2 = { workspace = true } lzw = { workspace = true } @@ -27,12 +28,14 @@ owned_ttf_parser = "0.21" zstd = "0.13" rayon = "1.10" phf = "0.11" +tracing = { workspace = true } [features] default = ["serde"] serde = ["dep:serde", "dep:serde_json"] receipts = [] # Enable visual citation receipts (SVG clip generation) ocr = ["dep:image"] # Enable OCR path (image compositing) +full-render = ["dep:pdfium-render", "ocr"] # Enable PDFium-based rendering (requires 
ocr) proptest = [] fuzzing = [] # Enable cfg(fuzzing) for fuzz harnesses diff --git a/crates/pdftract-core/src/lib.rs b/crates/pdftract-core/src/lib.rs index 5170026..6b6e6c1 100644 --- a/crates/pdftract-core/src/lib.rs +++ b/crates/pdftract-core/src/lib.rs @@ -17,6 +17,10 @@ pub mod parser; pub mod receipts; #[cfg(feature = "ocr")] pub mod render; + +// Re-export has_full_render for runtime feature detection +#[cfg(all(feature = "ocr", feature = "full-render"))] +pub use render::pdfium_path::has_full_render; pub mod schema; pub mod semaphore; diff --git a/crates/pdftract-core/src/options.rs b/crates/pdftract-core/src/options.rs index ab54d94..3630583 100644 --- a/crates/pdftract-core/src/options.rs +++ b/crates/pdftract-core/src/options.rs @@ -87,6 +87,21 @@ pub struct ExtractionOptions { /// /// Default: 512 MB (matches the plan's Tier 1 target for 100-page PDFs) pub memory_budget_mb: usize, + /// Enable full-render path using PDFium for complex page rendering. + /// + /// When true, pages are rendered using PDFium which correctly handles + /// overlapping images, soft masks, blend modes, and other complex geometry. + /// When false or when the `full-render` feature is not compiled in, + /// the direct compositing path is used (which handles >90% of scanned PDFs). + /// + /// Default: false (direct compositing path) + /// + /// # Feature Gate + /// + /// This option has no effect unless the `full-render` feature is enabled. + /// When the feature is absent, this field is silently ignored and the + /// direct compositing path is always used. 
+ pub full_render: bool, } impl Default for ExtractionOptions { @@ -95,6 +110,7 @@ impl Default for ExtractionOptions { receipts: ReceiptsMode::default(), max_parallel_pages: Self::default_max_parallel_pages(), memory_budget_mb: Self::default_memory_budget_mb(), + full_render: false, } } } @@ -126,6 +142,7 @@ impl ExtractionOptions { pub fn with_receipts(receipts: ReceiptsMode) -> Self { Self { receipts, + full_render: false, ..Default::default() } } @@ -134,6 +151,7 @@ impl ExtractionOptions { pub fn with_receipts_str(receipts: &str) -> Result { Ok(Self { receipts: ReceiptsMode::from_str(receipts)?, + full_render: false, ..Default::default() }) } @@ -151,6 +169,7 @@ impl ExtractionOptions { Self { max_parallel_pages: max_parallel_pages.max(1), memory_budget_mb: memory_budget_mb.max(64), + full_render: false, ..Default::default() } } diff --git a/crates/pdftract-core/src/render.rs b/crates/pdftract-core/src/render.rs index a40d144..be3d5f9 100644 --- a/crates/pdftract-core/src/render.rs +++ b/crates/pdftract-core/src/render.rs @@ -17,6 +17,10 @@ //! This module is only available when the `ocr` feature is enabled. #![cfg(feature = "ocr")] +// PDFium rendering path (Phase 5.2.2) - only available with full-render feature +#[cfg(all(feature = "ocr", feature = "full-render"))] +pub mod pdfium_path; + use crate::graphics_state::{Matrix3x3, GraphicsStateStack, GraphicsState}; use crate::parser::lexer::Lexer; use crate::parser::lexer::Token; diff --git a/crates/pdftract-core/src/render/pdfium_path.rs b/crates/pdftract-core/src/render/pdfium_path.rs new file mode 100644 index 0000000..55dbe18 --- /dev/null +++ b/crates/pdftract-core/src/render/pdfium_path.rs @@ -0,0 +1,327 @@ +//! PDFium-based page rendering path (Phase 5.2.2). +//! +//! This module implements high-fidelity page rendering using PDFium, +//! which correctly handles: +//! - Overlapping images with proper blend modes +//! - Image masks and soft masks +//! - Transparency and alpha blending +//! 
- Shading patterns
+//! - Complex color spaces
+//!
+//! # Feature Gate
+//!
+//! This module is only available when both `ocr` and `full-render` features are enabled.
+
+use crate::diagnostics::{Diagnostic, DiagCode};
+use image::{GrayImage, Luma};
+use pdfium_render::prelude::*;
+use std::sync::{Arc, Mutex};
+use tracing::{debug, warn};
+use std::thread::LocalKey;
+
+/// Result type for PDFium rendering operations.
+///
+/// On failure the error value is the list of diagnostics collected while rendering.
+pub type Result<T> = std::result::Result<T, Vec<Diagnostic>>;
+
+/// Thread-local PDFium instance holder with lazy initialization.
+///
+/// PDFium initialization is expensive (~50-100ms per instance), so we
+/// maintain one instance per thread. The `thread_local!` macro ensures
+/// each thread gets its own holder, so the `Mutex` wrapping it in
+/// `PDFIUM_INSTANCE` is never contended across threads; it is there only to
+/// provide the mutable access that `get_or_init` needs.
+///
+/// This uses `Option` to handle initialization failures gracefully.
+struct ThreadLocalPdfium {
+    /// Cached PDFium handle; `None` until the first successful initialization.
+    instance: Option<Arc<Pdfium>>,
+    /// Set once binding to the native library has failed, so later calls
+    /// return `None` immediately instead of retrying.
+    init_failed: bool,
+}
+
+impl ThreadLocalPdfium {
+    /// Create an empty holder; nothing is bound until `get_or_init` is called.
+    fn new() -> Self {
+        Self {
+            instance: None,
+            init_failed: false,
+        }
+    }
+
+    /// Return the cached PDFium handle, binding to the system library on first use.
+    ///
+    /// Returns `None` if binding failed now or on an earlier call; a failure is
+    /// remembered and never retried on this holder.
+    fn get_or_init(&mut self) -> Option<Arc<Pdfium>> {
+        if self.init_failed {
+            return None;
+        }
+
+        if self.instance.is_none() {
+            // Try to bind to the system PDFium library
+            // This returns a Result, so we can handle errors gracefully
+            match Pdfium::bind_to_system_library() {
+                Ok(bindings) => {
+                    debug!("PDFium initialized successfully");
+                    let pdfium = Pdfium::new(bindings);
+                    self.instance = Some(Arc::new(pdfium));
+                }
+                Err(e) => {
+                    warn!("PDFium initialization failed: {:?}", e);
+                    self.init_failed = true;
+                    return None;
+                }
+            }
+        }
+
+        // Cheap reference-count bump; the underlying instance is shared.
+        self.instance.clone()
+    }
+}
+
+thread_local! {
+    static PDFIUM_INSTANCE: Mutex<ThreadLocalPdfium> = Mutex::new(ThreadLocalPdfium::new());
+}
+
+/// Get the thread-local PDFium instance, if available.
+///
+/// Returns `None` if PDFium initialization failed (e.g., native library not found).
+fn get_pdfium() -> Option<Arc<Pdfium>> {
+    PDFIUM_INSTANCE
+        .try_with(|instance| {
+            // A poisoned lock only means an earlier call panicked on this thread
+            // while holding the guard. The holder's fields are only ever assigned
+            // whole values, so its state is still consistent: recover the guard
+            // instead of turning every later render into a panic.
+            let mut guard = instance
+                .lock()
+                .unwrap_or_else(|poisoned| poisoned.into_inner());
+            guard.get_or_init()
+        })
+        // `try_with` fails only while the thread-local is being destroyed.
+        .ok()
+        .flatten()
+}
+
+/// Check if the full-render path is usable at runtime.
+///
+/// This function attempts to access PDFium and returns true if successful.
+/// The first call on a thread triggers the attempt to bind the native library;
+/// the outcome (success or failure) is cached for that thread afterwards.
+/// It's used by serve mode to validate `full_render` form-field requests.
+///
+/// # Returns
+///
+/// `true` if PDFium is available and can render pages, `false` otherwise.
+pub fn has_full_render() -> bool {
+    get_pdfium().is_some()
+}
+
+/// Render a PDF page using PDFium.
+///
+/// This function:
+/// 1. Loads the PDF document from bytes
+/// 2. Opens the specified page
+/// 3. Renders the page at the specified DPI
+/// 4. Converts the result to a grayscale image
+///
+/// # Arguments
+///
+/// * `pdf_bytes` - The complete PDF document bytes
+/// * `page_index` - Zero-based page index to render
+/// * `dpi` - Resolution in dots per inch (default 300)
+///
+/// # Returns
+///
+/// The rendered grayscale image, or diagnostics if rendering fails.
+/// +/// # Errors +/// +/// Returns diagnostics if: +/// - PDFium is not available (full-render feature not compiled or initialization failed) +/// - PDFium fails to load the document +/// - The page index is out of bounds +/// - Rendering fails +pub fn render_page_via_pdfium( + pdf_bytes: &[u8], + page_index: usize, + dpi: u32, +) -> Result { + let mut diagnostics = Vec::new(); + + // Get the thread-local PDFium instance + let pdfium = match get_pdfium() { + Some(instance) => instance, + None => { + diagnostics.push(Diagnostic::with_static_no_offset( + DiagCode::StructMissingKey, + "PDFium not available (full-render feature not compiled or initialization failed)", + )); + return Err(diagnostics); + } + }; + + // Load the PDF document from memory + let document = match pdfium.load_pdf_from_byte_slice(pdf_bytes, None) { + Ok(doc) => doc, + Err(e) => { + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructInvalidType, + format!("Failed to load PDF with PDFium: {:?}", e), + )); + return Err(diagnostics); + } + }; + + // Check page count + let page_count = document.pages().len(); + if page_index as i32 >= page_count { + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructMissingKey, + format!("Page index {} out of bounds (document has {} pages)", page_index, page_count), + )); + return Err(diagnostics); + } + + // Open the page + let page = match document.pages().get(page_index as i32) { + Ok(p) => p, + Err(e) => { + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructMissingKey, + format!("Failed to open page {}: {:?}", page_index, e), + )); + return Err(diagnostics); + } + }; + + // Get page dimensions in points (1 point = 1/72 inch) + let page_width = page.width().value; + let page_height = page.height().value; + + // Calculate rendering dimensions based on DPI + // PDF uses 72 points per inch as the base unit + let scale_factor = dpi as f32 / 72.0; + let render_width = (page_width * scale_factor).ceil() as 
i32; + let render_height = (page_height * scale_factor).ceil() as i32; + + // Create render configuration + let render_config = PdfRenderConfig::new() + .set_target_width(render_width) + .set_target_height(render_height); + + // Render the page to a bitmap using the config + let bitmap = match page.render_with_config(&render_config) { + Ok(bitmap) => bitmap, + Err(e) => { + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::ImgUnsupportedFormat, + format!("Failed to render page with PDFium: {:?}", e), + )); + return Err(diagnostics); + } + }; + + // Convert the bitmap to an image::DynamicImage + // The as_image() method returns a DynamicImage + let dynamic_image = match bitmap.as_image() { + Ok(img) => img, + Err(e) => { + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::ImgUnsupportedFormat, + format!("Failed to convert PDFium bitmap to image: {:?}", e), + )); + return Err(diagnostics); + } + }; + + // Convert to grayscale using luminance + let gray_image = dynamic_image.to_luma8(); + + Ok(gray_image) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + #[cfg(feature = "full-render")] + fn test_has_full_render() { + // When full-render feature is enabled, this should return true + // if PDFium native library is available + let result = has_full_render(); + // We don't assert true/false because it depends on runtime environment + // Just verify it doesn't panic + let _ = result; + } + + #[test] + #[cfg(feature = "full-render")] + fn test_render_minimal_pdf() { + // Create a minimal valid PDF + // This is a minimal PDF with one empty page + let minimal_pdf = b"%PDF-1.4\n\ +1 0 obj\n\ +<<\n/Type /Catalog\n/Pages 2 0 R\n\ +>>\n\ +endobj\n\ +2 0 obj\n\ +<<\n/Type /Pages\n/Kids [ 3 0 R ]\n/Count 1\n\ +>>\n\ +endobj\n\ +3 0 obj\n\ +<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [ 0 0 612 792 ]\n/Contents 4 0 R\n\ +>>\n\ +endobj\n\ +4 0 obj\n\ +<<\n/Length 44\n\ +>>\n\ +stream\n\ +BT\n/F1 12 Tf\n100 700 Td\n(Test) Tj\nET\n\ +endstream\n\ 
+endobj\n\ +xref\n\ +0 5\n\ +0000000000 65535 f\n\ +0000000009 00000 n\n\ +0000000058 00000 n\n\ +0000000115 00000 n\n\ +0000000214 00000 n\n\ +trailer\n\ +<<\n/Size 5\n/Root 1 0 R\n\ +>>\n\ +startxref\n\ +310\n\ +%%EOF"; + + // Try to render the page + // This test may fail if PDFium native library is not available + let result = render_page_via_pdfium(minimal_pdf, 0, 72); + + // If PDFium is not available, we expect an error + if !has_full_render() { + assert!(result.is_err()); + } else { + // If PDFium is available, we expect success + assert!(result.is_ok()); + } + } + + #[test] + #[cfg(feature = "full-render")] + fn test_render_invalid_page_index() { + let minimal_pdf = b"%PDF-1.4\n\ +1 0 obj\n\ +<<\n/Type /Catalog\n/Pages 2 0 R\n\ +>>\n\ +endobj\n\ +2 0 obj\n\ +<<\n/Type /Pages\n/Kids [ 3 0 R ]\n/Count 1\n\ +>>\n\ +endobj\n\ +3 0 obj\n\ +<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [ 0 0 612 792 ]\n\ +>>\n\ +endobj\n\ +xref\n\ +0 4\n\ +0000000000 65535 f\n\ +0000000009 00000 n\n\ +0000000058 00000 n\n\ +0000000115 00000 n\n\ +trailer\n\ +<<\n/Size 4\n/Root 1 0 R\n\ +>>\n\ +startxref\n\ +202\n\ +%%EOF"; + + // If PDFium is not available, this test should be skipped + if !has_full_render() { + return; + } + + let result = render_page_via_pdfium(minimal_pdf, 99, 72); + assert!(result.is_err()); + let diags = result.unwrap_err(); + assert!(diags.iter().any(|d| d.code == DiagCode::StructMissingKey)); + } +}