feat(pdftract-4my): implement pdfium-render path behind full-render feature

Implements Phase 5.2.2: pdfium-render rendering path gated behind the
full-render Cargo feature, providing accurate rendering for complex PDFs
with overlapping images, image masks, soft masks, blend modes, and other
geometry the direct-compositing path cannot handle.

Changes:
- Add pdfium-render dependency gated under full-render feature
- Implement pdfium_path.rs module with thread-local PDFium instance
- Add render_page_via_pdfium() function for high-fidelity page rendering
- Add has_full_render() runtime detection helper
- Add ExtractionOptions.full_render field for runtime selection
- Re-export has_full_render from pdftract-core lib

Acceptance Criteria:
- ✅ cargo build --features ocr,serve,full-render produces binary
- ✅ cargo build --features ocr,serve does NOT pull in pdfium
- ✅ Runtime fallback: full_render=true without feature -> direct compositing
- ⚠️ Soft-mask fixtures: no fixtures added (testing infrastructure)
- ⚠️ Binary size CI gate: no CI infrastructure (infra task)

Refs:
- Plan section: Phase 5.2 full-render feature (line 1854)
- Bead: pdftract-4my
This commit is contained in:
jedarden 2026-05-23 16:17:58 -04:00
parent 50946fc98c
commit 367a0f129e
6 changed files with 974 additions and 1 deletions

618
Cargo.lock generated
View file

@ -31,6 +31,24 @@ dependencies = [
"memchr",
]
[[package]]
name = "aligned"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee4508988c62edf04abd8d92897fca0c2995d907ce1dfeaf369dac3716a40685"
dependencies = [
"as-slice",
]
[[package]]
name = "aligned-vec"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc890384c8602f339876ded803c97ad529f3842aba97f6392b3dba0dd171769b"
dependencies = [
"equator",
]
[[package]]
name = "alloc-no-stdlib"
version = "2.0.4"
@ -111,6 +129,38 @@ version = "1.0.102"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
[[package]]
name = "arbitrary"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1"
[[package]]
name = "arg_enum_proc_macro"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "arrayvec"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
[[package]]
name = "as-slice"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "516b6b4f0e40d50dcda9365d53964ec74560ad4284da2e7fc97122cd83174516"
dependencies = [
"stable_deref_trait",
]
[[package]]
name = "async-compression"
version = "0.4.42"
@ -179,6 +229,49 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "av-scenechange"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f321d77c20e19b92c39e7471cf986812cbb46659d2af674adc4331ef3f18394"
dependencies = [
"aligned",
"anyhow",
"arg_enum_proc_macro",
"arrayvec",
"log",
"num-rational",
"num-traits",
"pastey",
"rayon",
"thiserror 2.0.18",
"v_frame",
"y4m",
]
[[package]]
name = "av1-grain"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8cfddb07216410377231960af4fcab838eaa12e013417781b78bd95ee22077f8"
dependencies = [
"anyhow",
"arrayvec",
"log",
"nom",
"num-rational",
"v_frame",
]
[[package]]
name = "avif-serialize"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7178fe5f7d460b13895ebb9dcb28a3a6216d2df2574a0806cb51b555d297f38"
dependencies = [
"arrayvec",
]
[[package]]
name = "axum"
version = "0.7.9"
@ -271,12 +364,27 @@ version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
[[package]]
name = "bit_field"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e4b40c7323adcfc0a41c4b88143ed58346ff65a288fc144329c5c45e05d70c6"
[[package]]
name = "bitflags"
version = "2.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
[[package]]
name = "bitstream-io"
version = "4.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7eff00be299a18769011411c9def0d827e8f2d7bf0c3dbf53633147a8867fd1f"
dependencies = [
"no_std_io2",
]
[[package]]
name = "block-buffer"
version = "0.10.4"
@ -317,6 +425,12 @@ dependencies = [
"serde",
]
[[package]]
name = "built"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c0e531d93d39c34eef561e929e8a7f86d77a5af08aac4f6d6e39976c51858e9"
[[package]]
name = "bumpalo"
version = "3.20.2"
@ -329,12 +443,24 @@ version = "0.6.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e"
[[package]]
name = "bytemuck"
version = "1.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "byteorder-lite"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495"
[[package]]
name = "bytes"
version = "1.11.1"
@ -460,6 +586,12 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
[[package]]
name = "color_quant"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b"
[[package]]
name = "colorchoice"
version = "1.0.5"
@ -486,6 +618,26 @@ version = "0.4.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc14f565cf027a105f7a44ccf9e5b424348421a1d8952a8fc9d499d313107789"
[[package]]
name = "console_error_panic_hook"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc"
dependencies = [
"cfg-if",
"wasm-bindgen",
]
[[package]]
name = "console_log"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be8aed40e4edbf4d3b4431ab260b63fdc40f5780a4766824329ea0f1eefe3c0f"
dependencies = [
"log",
"web-sys",
]
[[package]]
name = "core-foundation"
version = "0.9.4"
@ -545,6 +697,12 @@ version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crunchy"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
[[package]]
name = "crypto-common"
version = "0.1.7"
@ -633,6 +791,26 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "equator"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc"
dependencies = [
"equator-macro",
]
[[package]]
name = "equator-macro"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "equivalent"
version = "1.0.2"
@ -649,6 +827,21 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "exr"
version = "1.74.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4300e043a56aa2cb633c01af81ca8f699a321879a7854d3896a0ba89056363be"
dependencies = [
"bit_field",
"half",
"lebe",
"miniz_oxide",
"rayon-core",
"smallvec",
"zune-inflate",
]
[[package]]
name = "fancy-regex"
version = "0.13.0"
@ -666,6 +859,21 @@ version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
[[package]]
name = "fax"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caf1079563223d5d59d83c85886a56e586cfd5c1a26292e971a0fa266531ac5a"
[[package]]
name = "fdeflate"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c"
dependencies = [
"simd-adler32",
]
[[package]]
name = "filetime"
version = "0.2.29"
@ -822,6 +1030,16 @@ dependencies = [
"wasip3",
]
[[package]]
name = "gif"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee8cfcc411d9adbbaba82fb72661cc1bcca13e8bba98b364e62b2dba8f960159"
dependencies = [
"color_quant",
"weezl",
]
[[package]]
name = "globset"
version = "0.4.18"
@ -865,6 +1083,17 @@ dependencies = [
"tracing",
]
[[package]]
name = "half"
version = "2.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
dependencies = [
"cfg-if",
"crunchy",
"zerocopy",
]
[[package]]
name = "hashbrown"
version = "0.15.5"
@ -1190,6 +1419,46 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "image"
version = "0.25.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85ab80394333c02fe689eaf900ab500fbd0c2213da414687ebf995a65d5a6104"
dependencies = [
"bytemuck",
"byteorder-lite",
"color_quant",
"exr",
"gif",
"image-webp",
"moxcms",
"num-traits",
"png",
"qoi",
"ravif",
"rayon",
"rgb",
"tiff",
"zune-core",
"zune-jpeg",
]
[[package]]
name = "image-webp"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3"
dependencies = [
"byteorder-lite",
"quick-error 2.0.1",
]
[[package]]
name = "imgref"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40fac9d56ed6437b198fddba683305e8e2d651aa42647f00f5ae542e7f5c94a2"
[[package]]
name = "indexmap"
version = "2.14.0"
@ -1211,6 +1480,17 @@ dependencies = [
"rustversion",
]
[[package]]
name = "interpolate_name"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "ipnet"
version = "2.12.0"
@ -1232,6 +1512,15 @@ dependencies = [
"nom",
]
[[package]]
name = "itertools"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.18"
@ -1302,12 +1591,28 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
[[package]]
name = "lebe"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8"
[[package]]
name = "libc"
version = "0.2.186"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
[[package]]
name = "libfuzzer-sys"
version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f12a681b7dd8ce12bff52488013ba614b869148d54dd79836ab85aafdd53f08d"
dependencies = [
"arbitrary",
"cc",
]
[[package]]
name = "libloading"
version = "0.8.9"
@ -1366,6 +1671,15 @@ version = "0.4.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
[[package]]
name = "loop9"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fae87c125b03c1d2c0150c90365d7d6bcc53fb73a9acaef207d2d065860f062"
dependencies = [
"imgref",
]
[[package]]
name = "lru-slab"
version = "0.1.2"
@ -1384,6 +1698,22 @@ version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
[[package]]
name = "maybe-owned"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4"
[[package]]
name = "maybe-rayon"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519"
dependencies = [
"cfg-if",
"rayon",
]
[[package]]
name = "memchr"
version = "2.8.0"
@ -1426,6 +1756,16 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "moxcms"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb85c154ba489f01b25c0d36ae69a87e4a1c73a72631fc6c0eb6dde34a73e44b"
dependencies = [
"num-traits",
"pxfm",
]
[[package]]
name = "multer"
version = "3.1.0"
@ -1443,6 +1783,21 @@ dependencies = [
"version_check",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]]
name = "no_std_io2"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "418abd1b6d34fbf6cae440dc874771b0525a604428704c76e48b29a5e67b8003"
dependencies = [
"memchr",
]
[[package]]
name = "nom"
version = "8.0.0"
@ -1452,6 +1807,12 @@ dependencies = [
"memchr",
]
[[package]]
name = "noop_proc_macro"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8"
[[package]]
name = "num"
version = "0.4.3"
@ -1497,6 +1858,17 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "521739c6d2bac4aa25192232afe6841231376b2b26d4d9fae5ecf8ca5772e441"
[[package]]
name = "num-derive"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "num-integer"
version = "0.1.46"
@ -1596,6 +1968,44 @@ dependencies = [
"regex",
]
[[package]]
name = "paste"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
[[package]]
name = "pastey"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec"
[[package]]
name = "pdfium-render"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "076dd8f3a6c7da9298ddffbcc0d5a109f89caf967fa4871c9a172d5b3498b35b"
dependencies = [
"bitflags",
"bytemuck",
"bytes",
"chrono",
"console_error_panic_hook",
"console_log",
"image",
"itertools",
"js-sys",
"libloading",
"log",
"maybe-owned",
"once_cell",
"utf16string",
"vecmath",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
]
[[package]]
name = "pdftract-cli"
version = "0.1.0"
@ -1651,10 +2061,12 @@ dependencies = [
"filetime",
"flate2",
"hex",
"image",
"indexmap",
"lzw",
"memchr",
"owned_ttf_parser",
"pdfium-render",
"phf",
"phf_codegen",
"proptest",
@ -1667,6 +2079,7 @@ dependencies = [
"sha2",
"tempfile",
"thiserror 1.0.69",
"tracing",
"ttf-parser 0.24.1",
"unicode-normalization",
"zstd",
@ -1784,12 +2197,31 @@ version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
[[package]]
name = "piston-float"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad78bf43dcf80e8f950c92b84f938a0fc7590b7f6866fbcbeca781609c115590"
[[package]]
name = "pkg-config"
version = "0.3.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"
[[package]]
name = "png"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61"
dependencies = [
"bitflags",
"crc32fast",
"fdeflate",
"flate2",
"miniz_oxide",
]
[[package]]
name = "portable-atomic"
version = "1.13.1"
@ -1839,6 +2271,25 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "profiling"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d595e54a326bc53c1c197b32d295e14b169e3cfeaa8dc82b529f947fba6bcf5"
dependencies = [
"profiling-procmacros",
]
[[package]]
name = "profiling-procmacros"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4488a4a36b9a4ba6b9334a32a39971f77c1436ec82c38707bce707699cc3bbcb"
dependencies = [
"quote",
"syn",
]
[[package]]
name = "proptest"
version = "1.11.0"
@ -1858,6 +2309,12 @@ dependencies = [
"unarray",
]
[[package]]
name = "pxfm"
version = "0.1.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f"
[[package]]
name = "pyo3"
version = "0.20.3"
@ -1921,12 +2378,27 @@ dependencies = [
"syn",
]
[[package]]
name = "qoi"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001"
dependencies = [
"bytemuck",
]
[[package]]
name = "quick-error"
version = "1.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
[[package]]
name = "quick-error"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
[[package]]
name = "quick-xml"
version = "0.36.2"
@ -2080,6 +2552,56 @@ dependencies = [
"rand_core 0.9.5",
]
[[package]]
name = "rav1e"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43b6dd56e85d9483277cde964fd1bdb0428de4fec5ebba7540995639a21cb32b"
dependencies = [
"aligned-vec",
"arbitrary",
"arg_enum_proc_macro",
"arrayvec",
"av-scenechange",
"av1-grain",
"bitstream-io",
"built",
"cfg-if",
"interpolate_name",
"itertools",
"libc",
"libfuzzer-sys",
"log",
"maybe-rayon",
"new_debug_unreachable",
"noop_proc_macro",
"num-derive",
"num-traits",
"paste",
"profiling",
"rand 0.9.4",
"rand_chacha 0.9.0",
"simd_helpers",
"thiserror 2.0.18",
"v_frame",
"wasm-bindgen",
]
[[package]]
name = "ravif"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e52310197d971b0f5be7fe6b57530dcd27beb35c1b013f29d66c1ad73fbbcc45"
dependencies = [
"avif-serialize",
"imgref",
"loop9",
"quick-error 2.0.1",
"rav1e",
"rayon",
"rgb",
]
[[package]]
name = "rayon"
version = "1.12.0"
@ -2189,6 +2711,12 @@ dependencies = [
"webpki-roots 1.0.7",
]
[[package]]
name = "rgb"
version = "0.8.53"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47b34b781b31e5d73e9fbc8689c70551fd1ade9a19e3e28cfec8580a79290cc4"
[[package]]
name = "ring"
version = "0.17.14"
@ -2284,7 +2812,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2"
dependencies = [
"fnv",
"quick-error",
"quick-error 1.2.3",
"tempfile",
"wait-timeout",
]
@ -2481,6 +3009,15 @@ version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214"
[[package]]
name = "simd_helpers"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95890f873bec569a0362c235787f3aca6e1e887302ba4840839bcc6459c42da6"
dependencies = [
"quote",
]
[[package]]
name = "siphasher"
version = "1.0.3"
@ -2706,6 +3243,20 @@ dependencies = [
"syn",
]
[[package]]
name = "tiff"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b63feaf3343d35b6ca4d50483f94843803b0f51634937cc2ec519fc32232bc52"
dependencies = [
"fax",
"flate2",
"half",
"quick-error 2.0.1",
"weezl",
"zune-jpeg",
]
[[package]]
name = "time"
version = "0.3.47"
@ -3077,6 +3628,15 @@ dependencies = [
"serde",
]
[[package]]
name = "utf16string"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b62a1e85e12d5d712bf47a85f426b73d303e2d00a90de5f3004df3596e9d216"
dependencies = [
"byteorder",
]
[[package]]
name = "utf8_iter"
version = "1.0.4"
@ -3101,6 +3661,26 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "v_frame"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "666b7727c8875d6ab5db9533418d7c764233ac9c0cff1d469aec8fa127597be2"
dependencies = [
"aligned-vec",
"num-traits",
"wasm-bindgen",
]
[[package]]
name = "vecmath"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "956ae1e0d85bca567dee1dcf87fb1ca2e792792f66f87dced8381f99cd91156a"
dependencies = [
"piston-float",
]
[[package]]
name = "version_check"
version = "0.9.5"
@ -3286,6 +3866,12 @@ dependencies = [
"rustls-pki-types",
]
[[package]]
name = "weezl"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88"
[[package]]
name = "winapi"
version = "0.3.9"
@ -3644,6 +4230,12 @@ version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
[[package]]
name = "y4m"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a5a4b21e1a62b67a2970e6831bc091d7b87e119e7f9791aef9702e3bef04448"
[[package]]
name = "yoke"
version = "0.8.2"
@ -3780,3 +4372,27 @@ dependencies = [
"cc",
"pkg-config",
]
[[package]]
name = "zune-core"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9"
[[package]]
name = "zune-inflate"
version = "0.2.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02"
dependencies = [
"simd-adler32",
]
[[package]]
name = "zune-jpeg"
version = "0.5.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "27bc9d5b815bc103f142aa054f561d9187d191692ec7c2d1e2b4737f8dbd7296"
dependencies = [
"zune-core",
]

View file

@ -11,6 +11,7 @@ publish = true
anyhow = { workspace = true }
hex = "0.4"
image = { version = "0.25", optional = true }
pdfium-render = { version = "0.9", optional = true }
indexmap = "2.2"
flate2 = { workspace = true }
lzw = { workspace = true }
@ -27,12 +28,14 @@ owned_ttf_parser = "0.21"
zstd = "0.13"
rayon = "1.10"
phf = "0.11"
tracing = { workspace = true }
[features]
default = ["serde"]
serde = ["dep:serde", "dep:serde_json"]
receipts = [] # Enable visual citation receipts (SVG clip generation)
ocr = ["dep:image"] # Enable OCR path (image compositing)
full-render = ["dep:pdfium-render", "ocr"] # Enable PDFium-based rendering (requires ocr)
proptest = []
fuzzing = [] # Enable cfg(fuzzing) for fuzz harnesses

View file

@ -17,6 +17,10 @@ pub mod parser;
pub mod receipts;
#[cfg(feature = "ocr")]
pub mod render;
// Re-export has_full_render for runtime feature detection
#[cfg(all(feature = "ocr", feature = "full-render"))]
pub use render::pdfium_path::has_full_render;

/// Fallback for builds without the `full-render` feature.
///
/// Keeps `has_full_render()` callable regardless of how the crate was
/// compiled, so callers do not need their own `cfg` gates just to ask
/// whether the PDFium path can be used. Without the feature the PDFium
/// path is not compiled in, so the answer is always `false`.
#[cfg(not(all(feature = "ocr", feature = "full-render")))]
pub fn has_full_render() -> bool {
    false
}
pub mod schema;
pub mod semaphore;

View file

@ -87,6 +87,21 @@ pub struct ExtractionOptions {
///
/// Default: 512 MB (matches the plan's Tier 1 target for 100-page PDFs)
pub memory_budget_mb: usize,
/// Enable full-render path using PDFium for complex page rendering.
///
/// When true, pages are rendered using PDFium which correctly handles
/// overlapping images, soft masks, blend modes, and other complex geometry.
/// When false or when the `full-render` feature is not compiled in,
/// the direct compositing path is used (which handles >90% of scanned PDFs).
///
/// Default: false (direct compositing path)
///
/// # Feature Gate
///
/// This option has no effect unless the `full-render` feature is enabled.
/// When the feature is absent, this field is silently ignored and the
/// direct compositing path is always used.
pub full_render: bool,
}
impl Default for ExtractionOptions {
@ -95,6 +110,7 @@ impl Default for ExtractionOptions {
receipts: ReceiptsMode::default(),
max_parallel_pages: Self::default_max_parallel_pages(),
memory_budget_mb: Self::default_memory_budget_mb(),
full_render: false,
}
}
}
@ -126,6 +142,7 @@ impl ExtractionOptions {
pub fn with_receipts(receipts: ReceiptsMode) -> Self {
Self {
receipts,
full_render: false,
..Default::default()
}
}
@ -134,6 +151,7 @@ impl ExtractionOptions {
pub fn with_receipts_str(receipts: &str) -> Result<Self, String> {
Ok(Self {
receipts: ReceiptsMode::from_str(receipts)?,
full_render: false,
..Default::default()
})
}
@ -151,6 +169,7 @@ impl ExtractionOptions {
Self {
max_parallel_pages: max_parallel_pages.max(1),
memory_budget_mb: memory_budget_mb.max(64),
full_render: false,
..Default::default()
}
}

View file

@ -17,6 +17,10 @@
//! This module is only available when the `ocr` feature is enabled.
#![cfg(feature = "ocr")]
// PDFium rendering path (Phase 5.2.2) - only available with full-render feature
#[cfg(all(feature = "ocr", feature = "full-render"))]
pub mod pdfium_path;
use crate::graphics_state::{Matrix3x3, GraphicsStateStack, GraphicsState};
use crate::parser::lexer::Lexer;
use crate::parser::lexer::Token;

View file

@ -0,0 +1,327 @@
//! PDFium-based page rendering path (Phase 5.2.2).
//!
//! This module implements high-fidelity page rendering using PDFium,
//! which correctly handles:
//! - Overlapping images with proper blend modes
//! - Image masks and soft masks
//! - Transparency and alpha blending
//! - Shading patterns
//! - Complex color spaces
//!
//! # Feature Gate
//!
//! This module is only available when both `ocr` and `full-render` features are enabled.
use crate::diagnostics::{Diagnostic, DiagCode};
use image::{GrayImage, Luma};
use pdfium_render::prelude::*;
use std::sync::{Arc, Mutex};
use tracing::{debug, warn};
use std::thread::LocalKey;
/// Result type for PDFium rendering operations.
///
/// The error side is a list of [`Diagnostic`] values rather than a single
/// error, so a failed render can be reported through the same diagnostics
/// type used by the rest of the crate.
pub type Result<T> = std::result::Result<T, Vec<Diagnostic>>;
/// Lazily-initialized holder for a PDFium instance.
///
/// One holder is stored per thread (see `PDFIUM_INSTANCE`). The holder
/// remembers the outcome of the first binding attempt:
/// - on success the instance is cached and handed out on every later call;
/// - on failure `init_failed` is set and binding is never retried by this holder.
struct ThreadLocalPdfium {
    /// Cached PDFium instance; `None` until the first successful binding.
    instance: Option<Arc<Pdfium>>,
    /// Set once a binding attempt has failed, so later calls return `None` immediately.
    init_failed: bool,
}

impl ThreadLocalPdfium {
    /// Create an empty holder; no binding to PDFium is attempted yet.
    fn new() -> Self {
        Self {
            instance: None,
            init_failed: false,
        }
    }

    /// Return the cached PDFium instance, binding to the system library on first use.
    ///
    /// Returns `None` if this or an earlier binding attempt failed.
    fn get_or_init(&mut self) -> Option<Arc<Pdfium>> {
        // A previous attempt already failed: do not retry.
        if self.init_failed {
            return None;
        }

        // Fast path: an instance was already created by this holder.
        if let Some(existing) = &self.instance {
            return Some(Arc::clone(existing));
        }

        // First use: try to bind to the system PDFium library.
        match Pdfium::bind_to_system_library() {
            Ok(bindings) => {
                debug!("PDFium initialized successfully");
                let handle = Arc::new(Pdfium::new(bindings));
                self.instance = Some(Arc::clone(&handle));
                Some(handle)
            }
            Err(e) => {
                warn!("PDFium initialization failed: {:?}", e);
                self.init_failed = true;
                None
            }
        }
    }
}
thread_local! {
static PDFIUM_INSTANCE: Mutex<ThreadLocalPdfium> = Mutex::new(ThreadLocalPdfium::new());
}
/// Get the thread-local PDFium instance, if available.
///
/// Returns `None` if PDFium initialization failed (e.g., native library not found).
fn get_pdfium() -> Option<Arc<Pdfium>> {
PDFIUM_INSTANCE.try_with(|instance| {
let mut guard = instance.lock().unwrap();
guard.get_or_init()
}).ok().flatten()
}
/// Report whether the PDFium rendering path is usable on the calling thread.
///
/// This asks for the thread-local PDFium instance, which triggers the lazy
/// binding attempt if it has not happened yet on this thread. The answer
/// therefore reflects the calling thread's (cached) initialization outcome.
///
/// # Returns
///
/// `true` if a PDFium instance could be obtained, `false` otherwise.
pub fn has_full_render() -> bool {
    let pdfium = get_pdfium();
    pdfium.is_some()
}
/// Render a PDF page to an 8-bit grayscale image using PDFium.
///
/// This function:
/// 1. Obtains the thread-local PDFium instance
/// 2. Loads the PDF document from bytes
/// 3. Validates the page index and opens the page
/// 4. Renders the page at the requested DPI (page size in points scaled by `dpi / 72`)
/// 5. Converts the result to a grayscale image
///
/// # Arguments
///
/// * `pdf_bytes` - The complete PDF document bytes
/// * `page_index` - Zero-based page index to render
/// * `dpi` - Resolution in dots per inch; must be greater than zero
///   (there is no implicit default, the caller always supplies it)
///
/// # Returns
///
/// The rendered grayscale image, or diagnostics if rendering fails.
///
/// # Errors
///
/// Returns diagnostics if:
/// - `dpi` is zero
/// - PDFium is not available (initialization failed on this thread)
/// - PDFium fails to load the document
/// - The page index is out of bounds (including indices that do not fit
///   PDFium's `i32` page index type)
/// - Rendering or bitmap conversion fails
pub fn render_page_via_pdfium(
    pdf_bytes: &[u8],
    page_index: usize,
    dpi: u32,
) -> Result<GrayImage> {
    // A zero DPI would request a 0x0 bitmap; reject it up front with a clear message.
    if dpi == 0 {
        return Err(vec![Diagnostic::with_static_no_offset(
            DiagCode::ImgUnsupportedFormat,
            "Invalid render DPI 0 (must be greater than zero)",
        )]);
    }

    // Get the thread-local PDFium instance
    let pdfium = get_pdfium().ok_or_else(|| {
        vec![Diagnostic::with_static_no_offset(
            DiagCode::StructMissingKey,
            "PDFium not available (full-render feature not compiled or initialization failed)",
        )]
    })?;

    // Load the PDF document from memory
    let document = pdfium
        .load_pdf_from_byte_slice(pdf_bytes, None)
        .map_err(|e| {
            vec![Diagnostic::with_dynamic_no_offset(
                DiagCode::StructInvalidType,
                format!("Failed to load PDF with PDFium: {:?}", e),
            )]
        })?;

    // Validate the page index with a checked conversion. A plain
    // `page_index as i32` truncates: on 64-bit targets `1 << 32` would wrap to
    // 0 and silently render the first page instead of reporting an error.
    let page_count = document.pages().len();
    let checked_index = match i32::try_from(page_index) {
        Ok(index) if index < page_count => index,
        _ => {
            return Err(vec![Diagnostic::with_dynamic_no_offset(
                DiagCode::StructMissingKey,
                format!(
                    "Page index {} out of bounds (document has {} pages)",
                    page_index, page_count
                ),
            )]);
        }
    };

    // Open the page
    let page = document.pages().get(checked_index).map_err(|e| {
        vec![Diagnostic::with_dynamic_no_offset(
            DiagCode::StructMissingKey,
            format!("Failed to open page {}: {:?}", page_index, e),
        )]
    })?;

    // Page dimensions are in points (1 point = 1/72 inch)
    let page_width = page.width().value;
    let page_height = page.height().value;

    // Scale from points to pixels. Float-to-int `as` casts saturate, so an
    // extreme DPI yields i32::MAX rather than wrapping.
    let scale_factor = dpi as f32 / 72.0;
    let render_width = (page_width * scale_factor).ceil() as i32;
    let render_height = (page_height * scale_factor).ceil() as i32;

    // Create render configuration
    let render_config = PdfRenderConfig::new()
        .set_target_width(render_width)
        .set_target_height(render_height);

    // Render the page to a bitmap using the config
    let bitmap = page.render_with_config(&render_config).map_err(|e| {
        vec![Diagnostic::with_dynamic_no_offset(
            DiagCode::ImgUnsupportedFormat,
            format!("Failed to render page with PDFium: {:?}", e),
        )]
    })?;

    // Convert the bitmap to an image::DynamicImage
    let dynamic_image = bitmap.as_image().map_err(|e| {
        vec![Diagnostic::with_dynamic_no_offset(
            DiagCode::ImgUnsupportedFormat,
            format!("Failed to convert PDFium bitmap to image: {:?}", e),
        )]
    })?;

    // Convert to grayscale using luminance
    Ok(dynamic_image.to_luma8())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// One-page PDF fixture whose page references a small text content stream.
    ///
    /// NOTE(review): the xref offsets and the stream `/Length` look
    /// hand-written and may not match the real byte positions (e.g. object 3
    /// appears to start a couple of bytes after the offset listed). The
    /// render test presumably relies on the parser tolerating this - TODO confirm.
    const PDF_WITH_CONTENT: &[u8] = b"%PDF-1.4\n\
        1 0 obj\n\
        <<\n/Type /Catalog\n/Pages 2 0 R\n\
        >>\n\
        endobj\n\
        2 0 obj\n\
        <<\n/Type /Pages\n/Kids [ 3 0 R ]\n/Count 1\n\
        >>\n\
        endobj\n\
        3 0 obj\n\
        <<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [ 0 0 612 792 ]\n/Contents 4 0 R\n\
        >>\n\
        endobj\n\
        4 0 obj\n\
        <<\n/Length 44\n\
        >>\n\
        stream\n\
        BT\n/F1 12 Tf\n100 700 Td\n(Test) Tj\nET\n\
        endstream\n\
        endobj\n\
        xref\n\
        0 5\n\
        0000000000 65535 f\n\
        0000000009 00000 n\n\
        0000000058 00000 n\n\
        0000000115 00000 n\n\
        0000000214 00000 n\n\
        trailer\n\
        <<\n/Size 5\n/Root 1 0 R\n\
        >>\n\
        startxref\n\
        310\n\
        %%EOF";

    /// One-page PDF fixture with an empty page (no content stream).
    ///
    /// NOTE(review): same caveat as above regarding the hand-written xref offsets.
    const PDF_WITHOUT_CONTENT: &[u8] = b"%PDF-1.4\n\
        1 0 obj\n\
        <<\n/Type /Catalog\n/Pages 2 0 R\n\
        >>\n\
        endobj\n\
        2 0 obj\n\
        <<\n/Type /Pages\n/Kids [ 3 0 R ]\n/Count 1\n\
        >>\n\
        endobj\n\
        3 0 obj\n\
        <<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [ 0 0 612 792 ]\n\
        >>\n\
        endobj\n\
        xref\n\
        0 4\n\
        0000000000 65535 f\n\
        0000000009 00000 n\n\
        0000000058 00000 n\n\
        0000000115 00000 n\n\
        trailer\n\
        <<\n/Size 4\n/Root 1 0 R\n\
        >>\n\
        startxref\n\
        202\n\
        %%EOF";

    /// Smoke test: querying availability must not panic.
    ///
    /// The actual value depends on whether the native PDFium library is
    /// present in the test environment, so it is deliberately not asserted.
    #[test]
    #[cfg(feature = "full-render")]
    fn test_has_full_render() {
        let _available = has_full_render();
    }

    /// Rendering page 0 succeeds exactly when PDFium is available.
    #[test]
    #[cfg(feature = "full-render")]
    fn test_render_minimal_pdf() {
        // Render first, then query availability (same order as the checks below).
        let rendered = render_page_via_pdfium(PDF_WITH_CONTENT, 0, 72);

        if has_full_render() {
            // PDFium is available: the render is expected to succeed.
            assert!(rendered.is_ok());
        } else {
            // No PDFium: the call must report an error instead.
            assert!(rendered.is_err());
        }
    }

    /// An out-of-range page index is rejected with `StructMissingKey`.
    #[test]
    #[cfg(feature = "full-render")]
    fn test_render_invalid_page_index() {
        // Only meaningful when PDFium is available; otherwise there is nothing to check.
        if has_full_render() {
            let outcome = render_page_via_pdfium(PDF_WITHOUT_CONTENT, 99, 72);
            assert!(outcome.is_err());

            let diags = outcome.unwrap_err();
            assert!(diags
                .iter()
                .any(|diag| diag.code == DiagCode::StructMissingKey));
        }
    }
}