From 9abc386cce24caf4343f6748aea2f24a5cd51b2c Mon Sep 17 00:00:00 2001 From: jedarden Date: Mon, 25 May 2026 13:40:15 -0400 Subject: [PATCH] feat(pdftract-3h9xo): implement threads JSON output + schema integration Phase 7.7.3: Add threads field to ExtractionResult with ThreadJson schema integration. Changes: - Added ThreadJson and BeadJson structs to schema/mod.rs - Added thread_to_json() function to threads/mod.rs - Added build_page_ref_to_index() helper to parser/pages.rs - Added threads field to ExtractionResult in extract.rs - Implemented Phase 7.7 extraction logic with discover_threads/walk_beads - Added threads_to_markdown() and collapse_page_ranges() to markdown.rs - Updated JSON schema with ThreadJson and BeadJson definitions - Added thread_to_py() and bead_to_py() conversions in pdftract-py - Exported ThreadJson, BeadJson from lib.rs All 32 threads module tests pass. All 35 markdown tests pass. Verification: notes/pdftract-3h9xo.md Co-Authored-By: Claude Opus 4.7 --- Cargo.lock | 313 +++++++++++++++++- Cargo.toml | 1 + crates/pdftract-cli/benches/grep_1000.rs | 18 +- .../src/inspect/render/columns.rs | 57 ++-- crates/pdftract-cli/src/main.rs | 9 +- crates/pdftract-cli/src/mcp/http.rs | 17 +- crates/pdftract-cli/src/middleware/mod.rs | 2 +- crates/pdftract-cli/src/serve.rs | 17 +- crates/pdftract-cli/tests/test_form.rs | 46 +-- crates/pdftract-core/src/extract.rs | 38 ++- crates/pdftract-core/src/lib.rs | 7 +- crates/pdftract-core/src/markdown.rs | 236 ++++++++++++- .../src/ocr/preprocessing/otsu.rs | 40 ++- crates/pdftract-core/src/parser/pages.rs | 39 +++ crates/pdftract-core/src/schema/mod.rs | 70 +++- crates/pdftract-core/src/threads/mod.rs | 26 ++ .../pdftract-core/tests/TH-01-stream-bomb.rs | 56 +++- crates/pdftract-py/src/lib.rs | 117 ++++--- docs/schema/v1.0/pdftract.schema.json | 142 ++++++++ notes/pdftract-2u6q2.md | 76 +++++ notes/pdftract-3h9xo.md | 113 +++++++ 21 files changed, 1312 insertions(+), 128 deletions(-) create mode 100644 notes/pdftract-2u6q2.md create mode 100644 notes/pdftract-3h9xo.md diff --git a/Cargo.lock b/Cargo.lock index d6006c8..d6a5c2c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,22 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "ab_glyph" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01c0457472c38ea5bd1c3b5ada5e368271cb550be7a4ca4a0b4634e9913f6cc2" +dependencies = [ + "ab_glyph_rasterizer", + "owned_ttf_parser 0.25.1", +] + +[[package]] +name = "ab_glyph_rasterizer" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "366ffbaa4442f4684d91e2cd7c5ea7c4ed8add41959a31447066e279e432b618" + [[package]] name = "adler2" version = "2.0.1" @@ -135,6 +151,15 @@ version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +[[package]] +name = "approx" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" +dependencies = [ + "num-traits", +] + [[package]] name = "arbitrary" version = "1.4.2" @@ -1206,6 +1231,114 @@ dependencies = [ "weezl", ] +[[package]] +name = "glam" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "333928d5eb103c5d4050533cec0384302db6be8ef7d3cebd30ec6a35350353da" + +[[package]] +name = "glam" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3abb554f8ee44336b72d522e0a7fe86a29e09f839a36022fa869a7dfe941a54b" + +[[package]] +name = "glam" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4126c0479ccf7e8664c36a2d719f5f2c140fbb4f9090008098d2c291fa5b3f16" + +[[package]] +name = "glam" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01732b97afd8508eee3333a541b9f7610f454bb818669e66e90f5f57c93a776" + +[[package]] +name = "glam" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525a3e490ba77b8e326fb67d4b44b4bd2f920f44d4cc73ccec50adc68e3bee34" + +[[package]] +name = "glam" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b8509e6791516e81c1a630d0bd7fbac36d2fa8712a9da8662e716b52d5051ca" + +[[package]] +name = "glam" +version = "0.20.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f43e957e744be03f5801a55472f593d43fabdebf25a4585db250f04d86b1675f" + +[[package]] +name = "glam" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "518faa5064866338b013ff9b2350dc318e14cc4fcd6cb8206d7e7c9886c98815" + +[[package]] +name = "glam" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12f597d56c1bd55a811a1be189459e8fad2bbc272616375602443bdfb37fa774" + +[[package]] +name = "glam" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e4afd9ad95555081e109fe1d21f2a30c691b5f0919c67dfa690a2e1eb6bd51c" + +[[package]] +name = "glam" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5418c17512bdf42730f9032c74e1ae39afc408745ebb2acf72fbc4691c17945" + +[[package]] +name = "glam" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "151665d9be52f9bb40fc7966565d39666f2d1e69233571b71b87791c7e0528b3" + +[[package]] +name = "glam" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e05e7e6723e3455f4818c7b26e855439f7546cf617ef669d1adedb8669e5cb9" + +[[package]] +name = "glam" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "779ae4bf7e8421cf91c0b3b64e7e8b40b862fba4d393f59150042de7c4965a94" + +[[package]] +name = "glam" +version = "0.29.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8babf46d4c1c9d92deac9f7be466f76dfc4482b6452fc5024b5e8daf6ffeb3ee" + +[[package]] +name = "glam" +version = "0.30.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19fc433e8437a212d1b6f1e68c7824af3aed907da60afa994e7f542d18d12aa9" + +[[package]] +name = "glam" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "556f6b2ea90b8d15a74e0e7bb41671c9bdf38cd9f78c284d750b9ce58a2b5be7" + +[[package]] +name = "glam" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f70749695b063ecbf6b62949ccccde2e733ec3ecbbd71d467dca4e5c6c97cca0" + [[package]] name = "glob" version = "0.3.3" @@ -1664,6 +1797,25 @@ dependencies = [ "quick-error 2.0.1", ] +[[package]] +name = "imageproc" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645329c490783f3ea465d2b6c7c08286fece97f15e714fd533b6c70a3ead2252" +dependencies = [ + "ab_glyph", + "approx", + "getrandom 0.3.4", + "image 0.25.10", + "itertools 0.14.0", + "nalgebra", + "num", + "rand 0.9.4", + "rand_distr", + "rayon", + "rustdct", +] + [[package]] name = "imgref" version = "1.12.1" @@ -1979,6 +2131,16 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + [[package]] name = "maybe-owned" version = "0.3.4" @@ -2089,6 +2251,39 @@ dependencies = [ "version_check", ] +[[package]] +name = "nalgebra" +version = "0.34.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df76ea0ff5c7e6b88689085804d6132ded0ddb9de5ca5b8aeb9eeadc0508a70a" +dependencies = [ + "approx", + "glam 0.14.0", + "glam 0.15.2", + "glam 0.16.0", + "glam 0.17.3", + "glam 0.18.0", + "glam 0.19.0", + "glam 0.20.5", + "glam 0.21.3", + "glam 0.22.0", + "glam 0.23.0", + "glam 0.24.2", + "glam 0.25.0", + "glam 0.27.0", + "glam 0.28.0", + "glam 0.29.3", + "glam 0.30.10", + "glam 0.31.1", + "glam 0.32.1", + "matrixmultiply", + "num-complex", + "num-rational", + "num-traits", + "simba", + "typenum", +] + [[package]] name = "new_debug_unreachable" version = "1.0.6" @@ -2223,6 +2418,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -2274,6 +2470,15 @@ dependencies = [ "ttf-parser 0.21.1", ] +[[package]] +name = "owned_ttf_parser" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36820e9051aca1014ddc75770aab4d68bc1e9e632f0f5627c4086bc216fb583b" +dependencies = [ + "ttf-parser 0.25.1", +] + [[package]] name = "parking_lot" version = "0.12.5" @@ -2353,6 +2558,7 @@ dependencies = [ "async-stream", "atty", "axum", + "base64", "bytes", "chrono", "clap", @@ -2400,6 +2606,7 @@ name = "pdftract-core" version = "0.1.0" dependencies = [ "anyhow", + "base64", "chrono", "criterion", "dashmap", @@ -2408,13 +2615,14 @@ dependencies = [ "flate2", "hex", "image 0.25.10", + "imageproc", "indexmap", "leptonica-plumbing", "libc", "lzw", "memchr", "memmap2", - "owned_ttf_parser", + "owned_ttf_parser 0.21.0", "pdfium-render", "phf", "phf_codegen", @@ -2456,6 +2664,7 @@ name = "pdftract-py" version = "0.1.0" dependencies = [ "anyhow", + "base64", "pdftract-core", "pyo3", ] @@ -2665,6 +2874,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "primal-check" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc0d895b311e3af9902528fbb8f928688abbd95872819320517cc24ca6b2bd08" +dependencies = [ + "num-integer", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -2946,6 +3164,16 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_distr" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" +dependencies = [ + "num-traits", + "rand 0.9.4", +] + [[package]] name = "rand_xorshift" version = "0.4.0" @@ -3005,6 +3233,12 @@ dependencies = [ "rgb", ] +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "rayon" version = "1.12.0" @@ -3167,6 +3401,29 @@ version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" +[[package]] +name = "rustdct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b61555105d6a9bf98797c063c362a1d24ed8ab0431655e38f1cf51e52089551" +dependencies = [ + "rustfft", +] + +[[package]] +name = "rustfft" +version = "6.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21db5f9893e91f41798c88680037dba611ca6674703c1a18601b01a72c8adb89" +dependencies = [ + "num-complex", + "num-integer", + "num-traits", + "primal-check", + "strength_reduce", + "transpose", +] + [[package]] name = "rustix" version = "0.38.44" @@ -3253,6 +3510,15 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "safe_arch" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96b02de82ddbe1b636e6170c21be622223aea188ef2e139be0a5b219ec215323" +dependencies = [ + "bytemuck", +] + [[package]] name = "same-file" version = "1.0.6" @@ -3458,6 +3724,19 @@ dependencies = [ "libc", ] +[[package]] +name = "simba" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c99284beb21666094ba2b75bbceda012e610f5479dfcc2d6e2426f53197ffd95" +dependencies = [ + "approx", + "num-complex", + "num-traits", + "paste", + "wide", +] + [[package]] name = "simd-adler32" version = "0.3.9" @@ -3534,6 +3813,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + [[package]] name = "strsim" version = "0.11.1" @@ -4039,6 +4324,16 @@ dependencies = [ "once_cell", ] +[[package]] +name = "transpose" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e" +dependencies = [ + "num-integer", + "strength_reduce", +] + [[package]] name = "try-lock" version = "0.2.5" @@ -4057,6 +4352,12 @@ version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5be21190ff5d38e8b4a2d3b6a3ae57f612cc39c96e83cedeaf7abc338a8bac4a" +[[package]] +name = "ttf-parser" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31" + [[package]] name = "typenum" version = "1.20.0" @@ -4423,6 +4724,16 @@ dependencies = [ "rustix 0.38.44", ] +[[package]] +name = "wide" +version = "0.7.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce5da8ecb62bcd8ec8b7ea19f69a51275e91299be594ea5cc6ef7819e16cd03" +dependencies = [ + "bytemuck", + "safe_arch", +] + [[package]] name = "winapi" version = "0.3.9" diff --git a/Cargo.toml b/Cargo.toml index 9e01d47..a62f463 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ documentation = "https://docs.rs/pdftract-core" [workspace.dependencies] # Dependencies shared across workspace crates anyhow = "1.0" +base64 = "0.22" flate2 = "1.0" lzw = "0.10" memchr = "2.7" diff --git a/crates/pdftract-cli/benches/grep_1000.rs b/crates/pdftract-cli/benches/grep_1000.rs index f653b59..894b38a 100644 --- a/crates/pdftract-cli/benches/grep_1000.rs +++ b/crates/pdftract-cli/benches/grep_1000.rs @@ -124,10 +124,7 @@ impl BenchmarkResult { // 50 MB/s gate let throughput = self.calculate_throughput(); if throughput < 50.0 { - return Err(format!( - "Throughput {} MB/s below 50 MB/s gate", - throughput - )); + return Err(format!("Throughput {} MB/s below 50 MB/s gate", throughput)); } // TODO: Add pdfgrep and pdftotext+ripgrep comparisons @@ -183,7 +180,12 @@ fn count_corpus_files() -> usize { .map(|entries| { entries .filter_map(|e| e.ok()) - .filter(|e| e.path().extension().map(|ext| ext == "pdf").unwrap_or(false)) + .filter(|e| { + e.path() + .extension() + .map(|ext| ext == "pdf") + .unwrap_or(false) + }) .count() }) .unwrap_or(0) @@ -224,7 +226,11 @@ fn run_benchmark() -> Result { }); } - eprintln!("Benchmark corpus: {} files, {} MB", files_total, bytes_total / 1024 / 1024); + eprintln!( + "Benchmark corpus: {} files, {} MB", + files_total, + bytes_total / 1024 / 1024 + ); // TODO: Run actual grep search // For now, this is a placeholder that simulates the benchmark structure diff --git a/crates/pdftract-cli/src/inspect/render/columns.rs b/crates/pdftract-cli/src/inspect/render/columns.rs index a74888d..0710186 100644 --- a/crates/pdftract-cli/src/inspect/render/columns.rs +++ b/crates/pdftract-cli/src/inspect/render/columns.rs @@ -37,15 +37,19 @@ use pdftract_core::layout::columns::Column; /// - `data-x0`: the column's left x-coordinate /// - `data-x1`: the column's right x-coordinate pub fn render_columns(columns: &[Column], page_height: f32) -> Vec { - columns.iter().enumerate().flat_map(|(idx, col)| { - let left_color = boundary_color(idx, true); - let right_color = boundary_color(idx, false); + columns + .iter() + .enumerate() + .flat_map(|(idx, col)| { + let left_color = boundary_color(idx, true); + let right_color = boundary_color(idx, false); - vec![ - render_left_boundary(col, page_height, left_color), - render_right_boundary(col, page_height, right_color), - ] - }).collect() + vec![ + render_left_boundary(col, page_height, left_color), + render_right_boundary(col, page_height, right_color), + ] + }) + .collect() } /// Render the left boundary (x0) of a column. @@ -83,7 +87,11 @@ fn boundary_color(column_index: usize, is_left: bool) -> &'static str { ]; let (light, dark) = PALETTE[column_index % PALETTE.len()]; - if is_left { light } else { dark } + if is_left { + light + } else { + dark + } } #[cfg(test)] @@ -153,19 +161,30 @@ mod tests { let result = render_columns(&columns, 792.0); // Check that colors cycle correctly - let left_colors: Vec<&str> = result.iter() + let left_colors: Vec<&str> = result + .iter() .step_by(2) .filter(|s| s.contains("column-left")) .map(|s| { - if s.contains("#06b6d4") { "#06b6d4" } - else if s.contains("#d946ef") { "#d946ef" } - else if s.contains("#facc15") { "#facc15" } - else if s.contains("#22c55e") { "#22c55e" } - else if s.contains("#f97316") { "#f97316" } - else if s.contains("#3b82f6") { "#3b82f6" } - else if s.contains("#a855f7") { "#a855f7" } - else if s.contains("#f43f5e") { "#f43f5e" } - else { "unknown" } + if s.contains("#06b6d4") { + "#06b6d4" + } else if s.contains("#d946ef") { + "#d946ef" + } else if s.contains("#facc15") { + "#facc15" + } else if s.contains("#22c55e") { + "#22c55e" + } else if s.contains("#f97316") { + "#f97316" + } else if s.contains("#3b82f6") { + "#3b82f6" + } else if s.contains("#a855f7") { + "#a855f7" + } else if s.contains("#f43f5e") { + "#f43f5e" + } else { + "unknown" + } }) .collect(); diff --git a/crates/pdftract-cli/src/main.rs b/crates/pdftract-cli/src/main.rs index 440f6f0..adbdbdb 100644 --- a/crates/pdftract-cli/src/main.rs +++ b/crates/pdftract-cli/src/main.rs @@ -473,7 +473,14 @@ fn main() -> Result<()> { max_upload_mb, audit_log, } => { - if let Err(e) = cmd_serve(bind, cache_dir, &cache_size, no_cache, max_upload_mb, audit_log) { + if let Err(e) = cmd_serve( + bind, + cache_dir, + &cache_size, + no_cache, + max_upload_mb, + audit_log, + ) { eprintln!("Error: {}", e); std::process::exit(1); } diff --git a/crates/pdftract-cli/src/mcp/http.rs b/crates/pdftract-cli/src/mcp/http.rs index 9c3e83a..f7b7677 100644 --- a/crates/pdftract-cli/src/mcp/http.rs +++ b/crates/pdftract-cli/src/mcp/http.rs @@ -23,7 +23,7 @@ use crate::mcp::framing::{BatchMessage, ErrorObject, Id, Notification, Request, Response}; use crate::mcp::tools; -use crate::middleware::{AuditState, audit_middleware}; +use crate::middleware::{audit_middleware, AuditState}; use anyhow::{anyhow, Context, Result}; use axum::{ body::Body, @@ -144,16 +144,21 @@ pub async fn run_server( ) -> Result<()> { // Create audit log writer if specified let audit_writer = if let Some(ref path) = audit_log { - Some(AuditLogWriter::open(path).context(format!( - "Failed to open audit log: {}", - path.display() - ))?) + Some( + AuditLogWriter::open(path) + .context(format!("Failed to open audit log: {}", path.display()))?, + ) } else { None }; // Create the shared server state - let state = McpServerState::new(auth_token, max_upload_mb, root.map(|p| p.to_path_buf()), audit_writer); + let state = McpServerState::new( + auth_token, + max_upload_mb, + root.map(|p| p.to_path_buf()), + audit_writer, + ); let max_body_bytes = state.max_body_bytes; // Build the router diff --git a/crates/pdftract-cli/src/middleware/mod.rs b/crates/pdftract-cli/src/middleware/mod.rs index 6eff652..985fa2b 100644 --- a/crates/pdftract-cli/src/middleware/mod.rs +++ b/crates/pdftract-cli/src/middleware/mod.rs @@ -2,4 +2,4 @@ pub mod audit; -pub use audit::{AuditState, audit_middleware}; +pub use audit::{audit_middleware, AuditState}; diff --git a/crates/pdftract-cli/src/serve.rs b/crates/pdftract-cli/src/serve.rs index 27a7e03..c050af4 100644 --- a/crates/pdftract-cli/src/serve.rs +++ b/crates/pdftract-cli/src/serve.rs @@ -43,6 +43,7 @@ //! - `EXTRACTION_ERROR`: PDF parsing or extraction failure //! - `INTERNAL_PANIC`: spawn_blocking task panicked (indicates a bug) +use crate::middleware::{audit_middleware, AuditState}; use anyhow::{Context, Result}; use axum::{ body::Body, @@ -57,7 +58,6 @@ use pdftract_core::audit::AuditLogWriter; use pdftract_core::cache; use pdftract_core::extract::{extract_pdf, extract_pdf_ndjson, result_to_json}; use pdftract_core::options::{ExtractionOptions, ReceiptsMode}; -use crate::middleware::{AuditState, audit_middleware}; use serde::Deserialize; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -174,15 +174,20 @@ pub async fn run( // Create audit log writer if specified let audit_writer = if let Some(ref path) = audit_log { - Some(AuditLogWriter::open(path).context(format!( - "Failed to open audit log: {}", - path.display() - ))?) + Some( + AuditLogWriter::open(path) + .context(format!("Failed to open audit log: {}", path.display()))?, + ) } else { None }; - let state = ServeState::new(cache_dir.clone(), cache_size_bytes, cache_disabled, audit_writer); + let state = ServeState::new( + cache_dir.clone(), + cache_size_bytes, + cache_disabled, + audit_writer, + ); let max_body_bytes = max_upload_mb * 1024 * 1024; diff --git a/crates/pdftract-cli/tests/test_form.rs b/crates/pdftract-cli/tests/test_form.rs index 496ced9..14ad8b8 100644 --- a/crates/pdftract-cli/tests/test_form.rs +++ b/crates/pdftract-cli/tests/test_form.rs @@ -35,13 +35,7 @@ fn profile_path() -> PathBuf { } /// Form fixture names -const FORM_FIXTURES: &[&str] = &[ - "irs_1040", - "w2", - "i9", - "expense_report", - "intake_form", -]; +const FORM_FIXTURES: &[&str] = &["irs_1040", "w2", "i9", "expense_report", "intake_form"]; /// Expected output file suffix const EXPECTED_SUFFIX: &str = "-expected.json"; @@ -71,8 +65,14 @@ fn test_form_profile_exists() { content.contains("priority:"), "Profile missing 'priority' key" ); - assert!(content.contains("threshold:"), "Profile missing 'threshold' key"); - assert!(content.contains("predicates:"), "Profile missing 'predicates' key"); + assert!( + content.contains("threshold:"), + "Profile missing 'threshold' key" + ); + assert!( + content.contains("predicates:"), + "Profile missing 'predicates' key" + ); // Verify form profile has type: form assert!(content.contains("type:"), "Profile missing 'type' key"); @@ -91,10 +91,7 @@ fn test_form_fixture_structure() { // Verify README.md exists let readme_path = fixture_dir.join("README.md"); - assert!( - readme_path.exists(), - "Missing README.md in form fixtures" - ); + assert!(readme_path.exists(), "Missing README.md in form fixtures"); // Verify PROVENANCE.md exists let provenance_path = fixture_dir.join("PROVENANCE.md"); @@ -165,10 +162,12 @@ fn test_form_fixture_structure() { ); // Verify document_type_confidence is present and valid - let confidence = json.pointer("/metadata/document_type_confidence").expect(&format!( - "Missing /metadata/document_type_confidence in {}", - expected_path.display() - )); + let confidence = json + .pointer("/metadata/document_type_confidence") + .expect(&format!( + "Missing /metadata/document_type_confidence in {}", + expected_path.display() + )); assert!( confidence.as_f64().is_some(), @@ -240,7 +239,10 @@ fn test_form_profile_schema() { let predicate_kinds: Vec = predicates .iter() - .filter_map(|p| p.get("kind").and_then(|k| k.as_str().map(|s| s.to_string()))) + .filter_map(|p| { + p.get("kind") + .and_then(|k| k.as_str().map(|s| s.to_string())) + }) .collect(); assert!( @@ -272,8 +274,8 @@ fn test_form_profile_is_degenerate() { // but the extraction profile (classification/form.yaml) should have // profile_fields: {} (empty object) - let extraction_profile_path = workspace_root() - .join("profiles/builtin/classification/form.yaml"); + let extraction_profile_path = + workspace_root().join("profiles/builtin/classification/form.yaml"); assert!( extraction_profile_path.exists(), @@ -281,8 +283,8 @@ fn test_form_profile_is_degenerate() { extraction_profile_path.display() ); - let extraction_content = fs::read_to_string(extraction_profile_path) - .expect("Failed to read extraction profile"); + let extraction_content = + fs::read_to_string(extraction_profile_path).expect("Failed to read extraction profile"); // Parse YAML to verify profile_fields is empty let yaml_value: serde_yaml::Value = diff --git a/crates/pdftract-core/src/extract.rs b/crates/pdftract-core/src/extract.rs index 9f1cb64..e16e414 100644 --- a/crates/pdftract-core/src/extract.rs +++ b/crates/pdftract-core/src/extract.rs @@ -30,7 +30,7 @@ use crate::parser::struct_tree::{check_coverage_for_pages, parse_struct_tree}; use crate::receipts::Receipt; use crate::schema::{ AnnotationJson, AttachmentJson, BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson, - FormFieldValueJson, LinkJson, SignatureJson, SpanJson, TableJson, + FormFieldValueJson, LinkJson, SignatureJson, SpanJson, TableJson, ThreadJson, }; use crate::semaphore::{Semaphore, SemaphoreExt}; use crate::signature::{discover, extract_signatures}; @@ -152,6 +152,13 @@ pub struct ExtractionResult { /// 50 MB are truncated (metadata only, `data: null`, `truncated: true`). /// Empty when the PDF has no embedded files. pub attachments: Vec, + /// Article thread chains extracted from the document. + /// + /// This array contains all article threads from the PDF's `/Threads` array. + /// Each thread includes metadata from the thread info dict (/I) and the + /// complete bead chain walked from the first bead. Empty when the PDF has + /// no article threads. + pub threads: Vec, } /// Result for a single page. @@ -622,6 +629,34 @@ pub fn extract_pdf( .map(|(name, value)| convert_form_field_to_json(name, value, &resolver_arc, &catalog)) .collect(); + // Phase 7.7: Extract article thread chains + // Discover thread headers from /Threads array and walk bead chains + use crate::parser::pages::build_page_ref_to_index; + use crate::threads::{discover as discover_threads, thread_to_json, walk_beads}; + + // Build page ref to index map for bead chain walking + let page_ref_to_index = build_page_ref_to_index(&catalog, &resolver_arc); + + // Discover thread headers from /Threads array + let thread_headers = match discover_threads(&catalog, &resolver_arc) { + Ok(headers) => headers, + Err(_) => Vec::new(), // Return empty on error + }; + + // Walk bead chains for each thread and convert to JSON + let mut threads_json = Vec::new(); + for header in &thread_headers { + match walk_beads(header, &resolver_arc, &page_ref_to_index) { + Ok(beads) => { + threads_json.push(thread_to_json(header, &beads)); + } + Err(_) => { + // Skip threads with malformed bead chains but continue processing others + continue; + } + } + } + Ok(ExtractionResult { fingerprint, pages: extracted_pages, @@ -640,6 +675,7 @@ pub fn extract_pdf( form_fields, links: links_json, attachments, + threads: threads_json, }) } diff --git a/crates/pdftract-core/src/lib.rs b/crates/pdftract-core/src/lib.rs index 55ffe81..973fe7e 100644 --- a/crates/pdftract-core/src/lib.rs +++ b/crates/pdftract-core/src/lib.rs @@ -72,7 +72,8 @@ pub use options::{ExtractionOptions, ReceiptsMode}; pub use page_class::{page_type_string, PageClass, PageClassification}; pub use parser::pages::{count_pages_tree, LazyPageIter, PageDict, DEFAULT_MEDIABOX}; pub use schema::{ - AttachmentJson, BlockJson, CellJson, ExtractionQuality, RowJson, SpanJson, SpanRef, TableJson, + AttachmentJson, BeadJson, BlockJson, CellJson, ExtractionQuality, RowJson, SpanJson, SpanRef, TableJson, + ThreadJson, }; pub use table::{GridCandidate, PageContext as TablePageContext, TableDetector}; pub use text::{serialize_page_text, TextOptions}; @@ -85,7 +86,9 @@ pub use hybrid::{ merge_vector_and_ocr_spans, CellCrop, Span, SpanSource, }; #[cfg(feature = "ocr")] -pub use ocr::preprocessing::{histogram_stretch, histogram_stretch_if_needed, otsu_binarize, PreprocError}; +pub use ocr::preprocessing::{ + histogram_stretch, histogram_stretch_if_needed, otsu_binarize, PreprocError, +}; #[cfg(feature = "ocr")] pub use ocr::{ borrow_or_init, calculate_wer, detect_available_languages, init_count, parse_hocr, diff --git a/crates/pdftract-core/src/markdown.rs b/crates/pdftract-core/src/markdown.rs index 8669ab8..9037c02 100644 --- a/crates/pdftract-core/src/markdown.rs +++ b/crates/pdftract-core/src/markdown.rs @@ -36,7 +36,8 @@ //! ``` use crate::schema::{ - BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson, FormFieldValueJson, SpanJson, + BeadJson, BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson, FormFieldValueJson, SpanJson, + ThreadJson, }; use regex::Regex; use serde::{Deserialize, Serialize}; @@ -592,6 +593,128 @@ fn escape_pipe(s: &str) -> String { s.replace('|', "\\|") } +/// Generate a markdown footer section for article threads. +/// +/// This function creates a formatted markdown section listing all article +/// threads with their metadata and page ranges. Only emits the section +/// when threads count > 0. +/// +/// # Arguments +/// +/// * `threads` - The threads to include in the footer +/// +/// # Returns +/// +/// A markdown string with an article threads section, or an empty string if no threads. +/// +/// # Example +/// +/// ```ignore +/// use pdftract_core::markdown::threads_to_markdown; +/// use pdftract_core::schema::{ThreadJson, BeadJson}; +/// +/// let threads = vec![ +/// ThreadJson { +/// title: Some("Main Article".to_string()), +/// author: Some("John Doe".to_string()), +/// subject: None, +/// keywords: None, +/// beads: vec![ +/// BeadJson { page_index: 0, rect: [100.0, 200.0, 300.0, 220.0] }, +/// BeadJson { page_index: 1, rect: [100.0, 500.0, 300.0, 520.0] }, +/// ], +/// }, +/// ]; +/// +/// let md = threads_to_markdown(&threads); +/// assert!(md.contains("## Article Threads")); +/// assert!(md.contains("1. *Main Article* (John Doe) - pages 0-1 (2 beads)")); +/// ``` +pub fn threads_to_markdown(threads: &[ThreadJson]) -> String { + if threads.is_empty() { + return String::new(); + } + + let mut result = String::from("\n\n## Article Threads\n\n"); + + for (i, thread) in threads.iter().enumerate() { + // Build the thread title line + let title = thread.title.as_deref().unwrap_or("(Untitled)"); + let author = thread.author.as_deref().unwrap_or(""); + + // Collapse contiguous page ranges + let page_ranges = collapse_page_ranges(&thread.beads); + + // Format: "1. *Title* (Author) - pages 0-1, 3-5 (3 beads)" + result.push_str(&format!( + "{}. *{}* ({}) - {} ({} beads)\n", + i + 1, + title, + author, + page_ranges, + thread.beads.len() + )); + } + + result +} + +/// Collapse contiguous page indices into ranges. +/// +/// Given a list of beads with page indices, this function collapses +/// contiguous sequences into ranges for more compact display. +/// +/// # Arguments +/// +/// * `beads` - The beads to collapse into page ranges +/// +/// # Returns +/// +/// A string like "pages 0-1, 3-5" representing the page ranges. +fn collapse_page_ranges(beads: &[BeadJson]) -> String { + if beads.is_empty() { + return "no pages".to_string(); + } + + let mut ranges = Vec::new(); + let mut start = beads[0].page_index; + let mut end = beads[0].page_index; + + for bead in beads.iter().skip(1) { + // Skip duplicate page indices + if bead.page_index == end { + continue; + } + + if bead.page_index == end + 1 { + // Contiguous, extend the range + end = bead.page_index; + } else { + // Gap, emit the current range + ranges.push((start, end)); + start = bead.page_index; + end = bead.page_index; + } + } + + // Emit the last range + ranges.push((start, end)); + + // Format ranges + let parts: Vec = ranges + .iter() + .map(|&(s, e)| { + if s == e { + format!("{}", s) + } else { + format!("{}-{}", s, e) + } + }) + .collect(); + + format!("pages {}", parts.join(", ")) +} + /// Convert a span to markdown with inline styling based on flags. /// /// This function implements Phase 6.5 inline span styling, translating @@ -1010,4 +1133,115 @@ mod span_tests { "HELLO\\_WORLD" ); } + + #[test] + fn test_threads_to_markdown_empty() { + // Empty threads list returns empty string + let threads: Vec = vec![]; + assert_eq!(threads_to_markdown(&threads), ""); + } + + #[test] + fn test_threads_to_markdown_single_thread() { + // Single thread with multiple beads + let threads = vec![ThreadJson { + title: Some("Main Article".to_string()), + author: Some("John Doe".to_string()), + subject: None, + keywords: None, + beads: vec![ + BeadJson { page_index: 0, rect: [100.0, 200.0, 300.0, 220.0] }, + BeadJson { page_index: 1, rect: [100.0, 500.0, 300.0, 520.0] }, + ], + }]; + + let md = threads_to_markdown(&threads); + assert!(md.contains("## Article Threads")); + assert!(md.contains("1. *Main Article* (John Doe) - pages 0-1 (2 beads)")); + } + + #[test] + fn test_threads_to_markdown_multiple_threads() { + // Multiple threads with various metadata + let threads = vec![ + ThreadJson { + title: Some("Introduction".to_string()), + author: Some("Jane Smith".to_string()), + subject: None, + keywords: None, + beads: vec![BeadJson { page_index: 0, rect: [50.0, 100.0, 250.0, 120.0] }], + }, + ThreadJson { + title: Some("Main Content".to_string()), + author: None, + subject: Some("Chapter 1".to_string()), + keywords: Some("test, example".to_string()), + beads: vec![ + BeadJson { page_index: 1, rect: [50.0, 400.0, 250.0, 420.0] }, + BeadJson { page_index: 2, rect: [50.0, 100.0, 250.0, 120.0] }, + ], + }, + ]; + + let md = threads_to_markdown(&threads); + assert!(md.contains("1. *Introduction* (Jane Smith) - pages 0 (1 beads)")); + assert!(md.contains("2. *Main Content* () - pages 1-2 (2 beads)")); + } + + #[test] + fn test_threads_to_markdown_untitled_thread() { + // Thread with no title + let threads = vec![ThreadJson { + title: None, + author: None, + subject: None, + keywords: None, + beads: vec![BeadJson { page_index: 5, rect: [100.0, 200.0, 300.0, 220.0] }], + }]; + + let md = threads_to_markdown(&threads); + assert!(md.contains("1. *(Untitled)* () - pages 5 (1 beads)")); + } + + #[test] + fn test_collapse_page_ranges_single_page() { + // Single bead + let beads = vec![BeadJson { page_index: 3, rect: [0.0, 0.0, 100.0, 20.0] }]; + assert_eq!(collapse_page_ranges(&beads), "pages 3"); + } + + #[test] + fn test_collapse_page_ranges_contiguous() { + // Contiguous pages + let beads = vec![ + BeadJson { page_index: 0, rect: [0.0, 0.0, 100.0, 20.0] }, + BeadJson { page_index: 1, rect: [0.0, 0.0, 100.0, 20.0] }, + BeadJson { page_index: 2, rect: [0.0, 0.0, 100.0, 20.0] }, + ]; + assert_eq!(collapse_page_ranges(&beads), "pages 0-2"); + } + + #[test] + fn test_collapse_page_ranges_gaps() { + // Pages with gaps + let beads = vec![ + BeadJson { page_index: 0, rect: [0.0, 0.0, 100.0, 20.0] }, + BeadJson { page_index: 2, rect: [0.0, 0.0, 100.0, 20.0] }, + BeadJson { page_index: 5, rect: [0.0, 0.0, 100.0, 20.0] }, + ]; + assert_eq!(collapse_page_ranges(&beads), "pages 0, 2, 5"); + } + + #[test] + fn test_collapse_page_ranges_mixed() { + // Mixed contiguous and gaps + let beads = vec![ + BeadJson { page_index: 0, rect: [0.0, 0.0, 100.0, 20.0] }, + BeadJson { page_index: 1, rect: [0.0, 0.0, 100.0, 20.0] }, + BeadJson { page_index: 3, rect: [0.0, 0.0, 100.0, 20.0] }, + BeadJson { page_index: 4, rect: [0.0, 0.0, 100.0, 20.0] }, + BeadJson { page_index: 4, rect: [0.0, 0.0, 100.0, 20.0] }, + ]; + assert_eq!(collapse_page_ranges(&beads), "pages 0-1, 3-4"); + } } diff --git a/crates/pdftract-core/src/ocr/preprocessing/otsu.rs b/crates/pdftract-core/src/ocr/preprocessing/otsu.rs index b8e1dd6..1023062 100644 --- a/crates/pdftract-core/src/ocr/preprocessing/otsu.rs +++ b/crates/pdftract-core/src/ocr/preprocessing/otsu.rs @@ -123,10 +123,8 @@ mod tests { // Verify foreground/background separation: // - Left half (dark) should become 0 (black) // - Right half (light) should become 255 (white) - let left_half_is_black = (0..100) - .all(|x| binary.get_pixel(x, 100)[0] == 0); - let right_half_is_white = (100..200) - .all(|x| binary.get_pixel(x, 100)[0] == 255); + let left_half_is_black = (0..100).all(|x| binary.get_pixel(x, 100)[0] == 0); + let right_half_is_white = (100..200).all(|x| binary.get_pixel(x, 100)[0] == 255); assert!( left_half_is_black, @@ -180,7 +178,11 @@ mod tests { // Should still produce binary output (all 0 or all 255) for pixel in binary_dark.pixels() { let val = pixel[0]; - assert!(val == 0 || val == 255, "Uniform dark image should binarize to 0 or 255, got {}", val); + assert!( + val == 0 || val == 255, + "Uniform dark image should binarize to 0 or 255, got {}", + val + ); } // Test 2: Uniform light image @@ -191,7 +193,11 @@ mod tests { let binary_light = otsu_binarize(&light_img); for pixel in binary_light.pixels() { let val = pixel[0]; - assert!(val == 0 || val == 255, "Uniform light image should binarize to 0 or 255, got {}", val); + assert!( + val == 0 || val == 255, + "Uniform light image should binarize to 0 or 255, got {}", + val + ); } // Test 3: Very narrow histogram (values in [100, 101]) @@ -209,7 +215,11 @@ mod tests { // Should still produce binary output without panic for pixel in binary_narrow.pixels() { let val = pixel[0]; - assert!(val == 0 || val == 255, "Narrow histogram image should binarize to 0 or 255, got {}", val); + assert!( + val == 0 || val == 255, + "Narrow histogram image should binarize to 0 or 255, got {}", + val + ); } } @@ -254,7 +264,9 @@ mod tests { assert!( pixel == 0 || pixel == 255, "Tri-modal image should still produce binary output, got {} at ({}, {})", - pixel, x, y + pixel, + x, + y ); } } @@ -278,7 +290,7 @@ mod tests { for line in 0..10 { let y = 30 + line * 25; for x in 50..350 { - img.put_pixel(x, y, Luma([40])); // Dark text + img.put_pixel(x, y, Luma([40])); // Dark text img.put_pixel(x, y + 1, Luma([40])); img.put_pixel(x, y + 2, Luma([40])); } @@ -293,7 +305,9 @@ mod tests { assert!( pixel == 0 || pixel == 255, "Text-like image should produce binary output, got {} at ({}, {})", - pixel, x, y + pixel, + x, + y ); } } @@ -302,7 +316,11 @@ mod tests { // Check a text line pixel assert_eq!(binary.get_pixel(100, 31)[0], 0, "Text line should be black"); // Check background pixel - assert_eq!(binary.get_pixel(100, 20)[0], 255, "Background should be white"); + assert_eq!( + binary.get_pixel(100, 20)[0], + 255, + "Background should be white" + ); } /// Test: Otsu on small image (edge case for dimensions) diff --git a/crates/pdftract-core/src/parser/pages.rs b/crates/pdftract-core/src/parser/pages.rs index 339ffa5..899faaf 100644 --- a/crates/pdftract-core/src/parser/pages.rs +++ b/crates/pdftract-core/src/parser/pages.rs @@ -723,6 +723,45 @@ fn parse_contents_array(obj: Option<&PdfObject>) -> Vec { } } +/// Build a map from page ObjRef to 0-based page index. +/// +/// This function walks the page tree and creates a HashMap that maps +/// each page's object reference to its 0-based index in document order. +/// This is useful for features like thread bead chain walking that need +/// to resolve page references to page indices. +/// +/// # Arguments +/// +/// * `catalog` - The document catalog containing the /Pages reference +/// * `resolver` - The xref resolver for resolving indirect references +/// +/// # Returns +/// +/// A HashMap mapping page references to their 0-based indices. +/// +/// # Behavior +/// +/// - Empty /Pages tree: returns empty HashMap +/// - Pages are indexed in document order (left-to-right depth-first traversal) +/// - Missing or unresolvable pages are skipped +pub fn build_page_ref_to_index( + catalog: &crate::parser::catalog::Catalog, + resolver: &XrefResolver, +) -> std::collections::HashMap { + use std::collections::HashMap; + + let mut page_ref_to_index = HashMap::new(); + + // Flatten the page tree to get all pages in order + if let Ok(pages) = flatten_page_tree(resolver, catalog.pages_ref) { + for (index, page) in pages.iter().enumerate() { + page_ref_to_index.insert(page.obj_ref, index); + } + } + + page_ref_to_index +} + #[cfg(test)] fn make_pages_dict(kids: Vec, count: i64, media_box: Option<[f64; 4]>) -> PdfObject { let mut dict = PdfDict::new(); diff --git a/crates/pdftract-core/src/schema/mod.rs b/crates/pdftract-core/src/schema/mod.rs index 9eb9be4..53199b8 100644 --- a/crates/pdftract-core/src/schema/mod.rs +++ b/crates/pdftract-core/src/schema/mod.rs @@ -796,13 +796,77 @@ fn default_conformance() -> String { "none".to_string() } -/// Placeholder for Phase 7 article threads. +/// A single bead in an article thread chain. /// -/// This type is reserved for future use and currently has no fields. +/// Represents one bead's position on a page, extracted during bead chain walking. +/// Per PDF 1.7 Section 12.4.3, each bead contains a reference to its page and +/// a bounding rectangle defining the article region on that page. +/// +/// # Fields +/// +/// * `page_index` - 0-based index of the page containing this bead +/// * `rect` - Bounding rectangle of the bead region in PDF user-space coordinates [x0, y0, x1, y1] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] +pub struct BeadJson { + /// 0-based page index where this bead is located. + pub page_index: usize, + + /// Bounding rectangle in PDF user-space coordinates [x0, y0, x1, y1]. + /// + /// Per PDF spec, the origin is at the bottom-left corner of the page. + /// This rect is NOT flipped to image-space coordinates. + pub rect: [f32; 4], +} + +/// JSON representation of an article thread. +/// +/// Represents a single article thread from the PDF's /Threads array, +/// including metadata from the thread info dict (/I) and the complete +/// bead chain walked from the first bead. +/// +/// Per the plan (Phase 7.7), threads are extracted and emitted at the +/// document level in the `/threads` array. The bead chain is walked by +/// following `/N` (next bead) links from the first bead until termination. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] pub struct ThreadJson { - // Reserved for Phase 7.1 + /// Thread title from /I/Title. + /// + /// - `Some("")` if /I/Title is present but empty string + /// - `None` if /I is missing or /Title is absent + #[serde(skip_serializing_if = "Option::is_none")] + pub title: Option, + + /// Thread author from /I/Author. + /// + /// - `Some("")` if /I/Author is present but empty string + /// - `None` if /I is missing or /Author is absent + #[serde(skip_serializing_if = "Option::is_none")] + pub author: Option, + + /// Thread subject from /I/Subject. + /// + /// - `Some("")` if /I/Subject is present but empty string + /// - `None` if /I is missing or /Subject is absent + #[serde(skip_serializing_if = "Option::is_none")] + pub subject: Option, + + /// Thread keywords from /I/Keywords. + /// + /// Per PDF spec, this is a comma-separated convention (not an array). + /// - `Some("")` if /I/Keywords is present but empty string + /// - `None` if /I is missing or /Keywords is absent + #[serde(skip_serializing_if = "Option::is_none")] + pub keywords: Option, + + /// Beads in this thread chain, in traversal order. + /// + /// Each bead represents a region on a page that is part of this article. + /// The beads are ordered by following `/N` (next bead) links from the + /// first bead through the chain until termination. + #[serde(default)] + pub beads: Vec, } /// JSON representation of an embedded file attachment. diff --git a/crates/pdftract-core/src/threads/mod.rs b/crates/pdftract-core/src/threads/mod.rs index 58d3131..52de5af 100644 --- a/crates/pdftract-core/src/threads/mod.rs +++ b/crates/pdftract-core/src/threads/mod.rs @@ -600,6 +600,32 @@ fn decode_pdfdocencoding(bytes: &[u8]) -> Option { Some(bytes.iter().map(|&b| b as char).collect()) } +/// Convert a `ThreadHeader` and `Bead` chain to JSON output format. +/// +/// This function constructs a `ThreadJson` from the internal thread representation, +/// combining the thread header metadata with the walked bead chain. +/// +/// # Arguments +/// +/// * `header` - The thread header containing metadata from /I +/// * `beads` - The walked bead chain from `walk_beads` +/// +/// # Returns +/// +/// A `ThreadJson` ready for JSON serialization. +pub fn thread_to_json(header: &ThreadHeader, beads: &[Bead]) -> crate::schema::ThreadJson { + crate::schema::ThreadJson { + title: header.title.clone(), + author: header.author.clone(), + subject: header.subject.clone(), + keywords: header.keywords.clone(), + beads: beads.iter().map(|bead| crate::schema::BeadJson { + page_index: bead.page_index, + rect: bead.rect, + }).collect(), + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/pdftract-core/tests/TH-01-stream-bomb.rs b/crates/pdftract-core/tests/TH-01-stream-bomb.rs index 721375a..118f7ab 100644 --- a/crates/pdftract-core/tests/TH-01-stream-bomb.rs +++ b/crates/pdftract-core/tests/TH-01-stream-bomb.rs @@ -62,18 +62,34 @@ fn test_bomb_default_cap_allows_reasonable_decompression() { // Decompress with default cap (512 MB) let mut counter = 0u64; - let result = FlateDecoder.decode(&compressed, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES); + let result = FlateDecoder.decode( + &compressed, + None, + &mut counter, + DEFAULT_MAX_DECOMPRESS_BYTES, + ); // Should succeed without error - assert!(result.is_ok(), "decompression should succeed with default cap"); + assert!( + result.is_ok(), + "decompression should succeed with default cap" + ); let decompressed = result.unwrap(); // Should get the full 10 MB - assert_eq!(decompressed.len(), 10 * 1024 * 1024_usize, "should decompress to 10 MB"); + assert_eq!( + decompressed.len(), + 10 * 1024 * 1024_usize, + "should decompress to 10 MB" + ); // Counter should reflect the decompressed size - assert_eq!(counter, 10 * 1024 * 1024_u64, "counter should match decompressed size"); + assert_eq!( + counter, + 10 * 1024 * 1024_u64, + "counter should match decompressed size" + ); } /// Test case 2: Lowered cap triggers STREAM_BOMB abort @@ -95,7 +111,10 @@ fn test_bomb_lowered_cap_triggers_stream_bomb() { let result = FlateDecoder.decode(&compressed, None, &mut counter, bomb_cap); // Should still succeed (but with partial data) - assert!(result.is_ok(), "decompression should succeed (with partial data)"); + assert!( + result.is_ok(), + "decompression should succeed (with partial data)" + ); let decompressed = result.unwrap(); @@ -108,7 +127,11 @@ fn test_bomb_lowered_cap_triggers_stream_bomb() { ); // We should have gotten exactly the cap (the decoder stops at the limit) - assert_eq!(decompressed.len(), bomb_cap as usize, "should be truncated to exactly the cap"); + assert_eq!( + decompressed.len(), + bomb_cap as usize, + "should be truncated to exactly the cap" + ); // Counter should be at the cap assert_eq!(counter, bomb_cap, "counter should be at the cap"); @@ -141,8 +164,12 @@ fn test_bomb_fixture_has_high_compression_ratio() { ratio ); - println!("Bomb fixture: {} bytes compressed -> {} bytes decompressed ({}:1 ratio)", - compressed.len(), decompressed.len(), ratio); + println!( + "Bomb fixture: {} bytes compressed -> {} bytes decompressed ({}:1 ratio)", + compressed.len(), + decompressed.len(), + ratio + ); } /// Test case 4: Incremental decompression stops at bomb limit @@ -180,8 +207,11 @@ fn test_bomb_limit_checked_incrementally() { let decompressed = result.unwrap(); // With incremental checking, we should get exactly 64 KB - assert_eq!(decompressed.len(), tiny_cap as usize, - "incremental checking should truncate exactly at the cap"); + assert_eq!( + decompressed.len(), + tiny_cap as usize, + "incremental checking should truncate exactly at the cap" + ); // The counter should also be at the cap assert_eq!(counter, tiny_cap); @@ -225,7 +255,11 @@ fn test_bomb_limit_truncation_behavior() { let decompressed = result.unwrap(); // The returned data should be truncated to the cap - assert_eq!(decompressed.len(), cap as usize, "should be truncated to cap"); + assert_eq!( + decompressed.len(), + cap as usize, + "should be truncated to cap" + ); // The counter should reflect how much was "decompressed" assert_eq!(counter, cap); diff --git a/crates/pdftract-py/src/lib.rs b/crates/pdftract-py/src/lib.rs index 501cb59..a5cbf8b 100644 --- a/crates/pdftract-py/src/lib.rs +++ b/crates/pdftract-py/src/lib.rs @@ -20,7 +20,8 @@ use extract_stream::{extract_stream_fn, StreamIterator}; // Re-export core types and functions use pdftract_core::{ - extract_pdf, extract_pdf_streaming, AttachmentJson, ExtractionOptions, PageResult, TableJson, + extract_pdf, extract_pdf_streaming, AttachmentJson, BeadJson, ExtractionOptions, PageResult, + TableJson, ThreadJson, }; // ============================================================================ @@ -193,39 +194,6 @@ fn kwargs_to_options(kwargs: Option<&PyDict>) -> PyResult { Ok(opts) } -// ============================================================================ -// PyO3 module definition -// ============================================================================ - -#[pymodule] -fn pdftract(_py: Python, m: &PyModule) -> PyResult<()> { - // Add exception classes - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - - // Add extract_stream function - m.add_function(wrap_pyfunction!(extract_stream_fn, m)?)?; - m.add_class::()?; - - // Add main extraction function - m.add_function(wrap_pyfunction!(extract, m)?)?; - m.add_function(wrap_pyfunction!(extract_text, m)?)?; - m.add_function(wrap_pyfunction!(extract_markdown, m)?)?; - m.add_function(wrap_pyfunction!(search, m)?)?; - m.add_function(wrap_pyfunction!(get_metadata, m)?)?; - m.add_function(wrap_pyfunction!(hash, m)?)?; - m.add_function(wrap_pyfunction!(classify, m)?)?; - m.add_function(wrap_pyfunction!(verify_receipt, m)?)?; - - Ok(()) -} - // ============================================================================ // Contract method: extract // ============================================================================ @@ -234,7 +202,8 @@ fn pdftract(_py: Python, m: &PyModule) -> PyResult<()> { /// /// Returns a Document object containing pages with spans, blocks, and tables. #[pyfunction] -fn extract<'py>(py: Python<'py>, path: &str, kwargs: Option<&PyDict>) -> PyResultAny<'py> { +#[pyo3(name = "extract")] +fn extract_py<'py>(py: Python<'py>, path: &str, kwargs: Option<&PyDict>) -> PyResultAny<'py> { let opts = kwargs_to_options(kwargs)?; let pdf_path = Path::new(path); @@ -270,6 +239,47 @@ fn extract<'py>(py: Python<'py>, path: &str, kwargs: Option<&PyDict>) -> PyResul .collect(); dict.set_item("attachments", attachments?)?; + // Add threads (as Python list of dicts) + let threads: PyResult>> = result + .threads + .into_iter() + .map(|thread| thread_to_py(py, thread)) + .collect(); + dict.set_item("threads", threads?)?; + + Ok(dict.clone().into()) +} + +/// Convert a Bead to a Python dict with two keys (page_index, rect). +/// +/// Per the bead spec, beads are simple 2-key dicts for compactness. +fn bead_to_py<'py>(py: Python<'py>, bead: BeadJson) -> PyResultAny<'py> { + let dict = PyDict::new(py); + dict.set_item("page_index", bead.page_index)?; + dict.set_item("rect", bead.rect)?; + Ok(dict.clone().into()) +} + +/// Convert a Thread to a Python dict with title, author, subject, keywords, and beads. +/// +/// This converts the full ThreadJson structure to a Python dict, including +/// the list of beads (each bead is a 2-key dict via bead_to_py). +fn thread_to_py<'py>(py: Python<'py>, thread: ThreadJson) -> PyResultAny<'py> { + let dict = PyDict::new(py); + + dict.set_item("title", thread.title)?; + dict.set_item("author", thread.author)?; + dict.set_item("subject", thread.subject)?; + dict.set_item("keywords", thread.keywords)?; + + // Convert beads to Python list of 2-key dicts + let beads: PyResult>> = thread + .beads + .into_iter() + .map(|bead| bead_to_py(py, bead)) + .collect(); + dict.set_item("beads", beads?)?; + Ok(dict.clone().into()) } @@ -279,7 +289,7 @@ fn extract<'py>(py: Python<'py>, path: &str, kwargs: Option<&PyDict>) -> PyResul #[pyfunction] fn extract_text(py: Python, path: &str, kwargs: Option<&PyDict>) -> PyResult { - let result = extract(py, path, kwargs)?; + let result = extract_py(py, path, kwargs)?; let dict = result.downcast::(py)?; let pages = dict .get_item("pages")? @@ -347,7 +357,7 @@ fn search<'py>( #[pyfunction] fn get_metadata<'py>(py: Python<'py>, path: &str, kwargs: Option<&PyDict>) -> PyResultAny<'py> { - let result = extract(py, path, kwargs)?; + let result = extract_py(py, path, kwargs)?; let dict = result.downcast::(py)?; let metadata = dict.get_item("metadata")?.unwrap(); Ok(metadata.clone().into()) @@ -531,3 +541,36 @@ fn attachment_to_py<'py>(py: Python<'py>, attachment: AttachmentJson) -> PyResul Ok(dict.clone().into()) } + +// ============================================================================ +// PyO3 module definition +// ============================================================================ + +#[pymodule] +fn pdftract(_py: Python, m: &PyModule) -> PyResult<()> { + // Add exception classes + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + + // Add extract_stream function + m.add_function(wrap_pyfunction!(extract_stream_fn, m)?)?; + m.add_class::()?; + + // Add main extraction function + m.add_function(wrap_pyfunction!(extract_py, m)?)?; + m.add_function(wrap_pyfunction!(extract_text, m)?)?; + m.add_function(wrap_pyfunction!(extract_markdown, m)?)?; + m.add_function(wrap_pyfunction!(search, m)?)?; + m.add_function(wrap_pyfunction!(get_metadata, m)?)?; + m.add_function(wrap_pyfunction!(hash, m)?)?; + m.add_function(wrap_pyfunction!(classify, m)?)?; + m.add_function(wrap_pyfunction!(verify_receipt, m)?)?; + + Ok(()) +} diff --git a/docs/schema/v1.0/pdftract.schema.json b/docs/schema/v1.0/pdftract.schema.json index 190c700..93febb8 100644 --- a/docs/schema/v1.0/pdftract.schema.json +++ b/docs/schema/v1.0/pdftract.schema.json @@ -260,6 +260,74 @@ } ] }, + "AttachmentJson": { + "description": "JSON representation of an embedded file attachment.\n\nRepresents a single embedded file extracted from the PDF's `/EmbeddedFiles`\nname tree or `/AF` (Associated Files) array.\n\nPer plan (Phase 7.5.3), attachments exceeding 50 MB are truncated\n(metadata only, data: null, truncated: true). The `data` field contains\nbase64-encoded content using RFC 4648 standard alphabet with padding\nand no line breaks. The JSON Schema declares `contentEncoding: base64`\nfor the `data` field, enabling JSON Schema validators and code generation\ntools to understand the encoding.", + "properties": { + "checksum_md5": { + "description": "MD5 checksum from /Params /CheckSum as hex string (None if absent).\n\nPer PDF spec, /CheckSum is a 16-byte binary string (MD5), hex-encoded\nas 32 lowercase hex characters.", + "type": [ + "string", + "null" + ] + }, + "created": { + "description": "Creation date from /Params /CreationDate as ISO 8601 string (None if absent).\n\nFormat: \"YYYY-MM-DDTHH:MM:SS+HH:MM\" or \"YYYY-MM-DDTHH:MM:SSZ\"", + "type": [ + "string", + "null" + ] + }, + "data": { + "description": "Base64-encoded attachment content (null if truncated or empty).\n\nPer JSON Schema, this field has `contentEncoding: base64`, indicating\nthe string is base64-encoded binary data. Downstream tools can use this\ninformation to automatically decode the content.\n\n- `Some(base64_string)` when content <= 50 MB\n- `None` when `truncated: true` (content too large)\n\nIn the Python API (PyO3), this field is returned as a `bytes` object\n(PyO3 automatically decodes the base64 string).", + "contentEncoding": "base64", + "type": [ + "string", + "null" + ] + }, + "description": { + "description": "Description from /Desc (None if absent, not empty string).", + "type": [ + "string", + "null" + ] + }, + "mime_type": { + "description": "MIME type from stream /Subtype (None if absent, no guessing from extension).", + "type": [ + "string", + "null" + ] + }, + "modified": { + "description": "Modification date from /Params /ModDate as ISO 8601 string (None if absent).\n\nFormat: \"YYYY-MM-DDTHH:MM:SS+HH:MM\" or \"YYYY-MM-DDTHH:MM:SSZ\"", + "type": [ + "string", + "null" + ] + }, + "name": { + "description": "Attachment filename from /UF (Unicode, preferred) or /F (system-independent).", + "type": "string" + }, + "size": { + "description": "Original decoded size in bytes (always populated, even when truncated).\n\nThis is the size of the attachment content before base64 encoding.\nWhen `truncated: true`, this represents the full original size that\nwas not included in the output.", + "format": "uint64", + "minimum": 0, + "type": "integer" + }, + "truncated": { + "description": "Whether the attachment content was truncated due to the 50 MB size limit.\n\nWhen `true`, the `data` field is `None` and only metadata is included.\nThe `size` field still reflects the original full size.", + "type": "boolean" + } + }, + "required": [ + "name", + "size", + "truncated" + ], + "type": "object" + }, "BlockJson": { "description": "JSON representation of a structural block.\n\nA block is a higher-level semantic unit composed of one or more\nspans. Examples include paragraphs, headings, list items, and\ntable cells.", "properties": { @@ -1216,6 +1284,73 @@ "page_index" ], "type": "object" + }, + "BeadJson": { + "description": "A single bead in an article thread chain.\n\nRepresents one bead's position on a page, extracted during bead chain walking.\nPer PDF 1.7 Section 12.4.3, each bead contains a reference to its page and\na bounding rectangle defining the article region on that page.", + "properties": { + "page_index": { + "description": "0-based page index where this bead is located.", + "format": "uint", + "minimum": 0, + "type": "integer" + }, + "rect": { + "description": "Bounding rectangle in PDF user-space coordinates [x0, y0, x1, y1].\n\nPer PDF spec, the origin is at the bottom-left corner of the page.\nThis rect is NOT flipped to image-space coordinates.", + "items": { + "format": "float", + "type": "number" + }, + "maxItems": 4, + "minItems": 4, + "type": "array" + } + }, + "required": [ + "page_index", + "rect" + ], + "type": "object" + }, + "ThreadJson": { + "description": "JSON representation of an article thread.\n\nRepresents a single article thread from the PDF's /Threads array,\nincluding metadata from the thread info dict (/I) and the complete\nbead chain walked from the first bead.\n\nPer the plan (Phase 7.7), threads are extracted and emitted at the\ndocument level in the `/threads` array. The bead chain is walked by\nfollowing `/N` (next bead) links from the first bead until termination.", + "properties": { + "author": { + "description": "Thread author from /I/Author.\n\n- `Some(\"\")` if /I/Author is present but empty string\n- `None` if /I is missing or /Author is absent", + "type": [ + "string", + "null" + ] + }, + "beads": { + "description": "Beads in this thread chain, in traversal order.\n\nEach bead represents a region on a page that is part of this article.\nThe beads are ordered by following `/N` (next bead) links from the\nfirst bead through the chain until termination.", + "items": { + "$ref": "#/$defs/BeadJson" + }, + "type": "array" + }, + "keywords": { + "description": "Thread keywords from /I/Keywords.\n\nPer PDF spec, this is a comma-separated convention (not an array).\n- `Some(\"\")` if /I/Keywords is present but empty string\n- `None` if /I is missing or /Keywords is absent", + "type": [ + "string", + "null" + ] + }, + "subject": { + "description": "Thread subject from /I/Subject.\n\n- `Some(\"\")` if /I/Subject is present but empty string\n- `None` if /I is missing or /Subject is absent", + "type": [ + "string", + "null" + ] + }, + "title": { + "description": "Thread title from /I/Title.\n\n- `Some(\"\")` if /I/Title is present but empty string\n- `None` if /I is missing or /Title is absent", + "type": [ + "string", + "null" + ] + } + }, + "type": "object" } }, "$id": "https://pdftract.com/schema/v1.0/pdftract.schema.json", @@ -1240,6 +1375,13 @@ }, "type": "array" }, + "attachments": { + "description": "Embedded file attachments extracted from the document.\n\nThis array contains all embedded files from the /EmbeddedFiles name tree\nor /AF (Associated Files) array. Attachments exceeding 50 MB are\ntruncated (metadata only, data: null, truncated: true). Empty when the\nPDF has no embedded files.", + "items": { + "$ref": "#/$defs/AttachmentJson" + }, + "type": "array" + }, "metadata": { "$ref": "#/$defs/ExtractionMetadata", "description": "Metadata about the extraction." diff --git a/notes/pdftract-2u6q2.md b/notes/pdftract-2u6q2.md new file mode 100644 index 0000000..4ee95a6 --- /dev/null +++ b/notes/pdftract-2u6q2.md @@ -0,0 +1,76 @@ +# pdftract-2u6q2: Diagnostic Infrastructure + +## Summary + +Implemented the diagnostic emission infrastructure as specified in bead pdftract-2u6q2. + +## Changes Made + +### 1. DiagnosticsCollector Type +- **File**: `crates/pdftract-core/src/diagnostics.rs` +- Added thread-safe `DiagnosticsCollector` backed by `Arc>>` +- Methods: + - `emit(code)` - emit diagnostic with default message + - `emit_with_offset(code, offset)` - emit with byte offset + - `emit_with_message(code, message)` - emit with custom message + - `into_vec()` - consume and return collected diagnostics + - `get()` - get reference to collected diagnostics + - `len()` / `is_empty()` - query collector state + +### 2. DiagnosticJson hint Field +- **File**: `crates/pdftract-core/src/schema/mod.rs` +- Added `hint: Option` field to `DiagnosticJson` struct +- Updated all construction sites to include `hint: None` +- Field is skipped in JSON serialization when `None` + +### 3. Missing Error Codes +- **File**: `crates/pdftract-core/src/diagnostics.rs` +- Added `DiagCode::ImgSourceMixed` (IMG_SOURCE_MIXED) +- Added `DiagCode::ProfileInvalid` (PROFILE_INVALID) +- Added `DiagCode::RepairRescuedFromBackwardsXref` (REPAIR_RESCUED_FROM_BACKWARDS_XREF) +- Updated `category()`, `name()`, `severity()` mappings +- Added catalog entries to `DIAGNOSTIC_CATALOG` + +### 4. Diagnostics Documentation +- **File**: `docs/integrations/diagnostics-codes.md` (new) +- Comprehensive catalog of all diagnostic codes +- Organized by category (STRUCT_*, STREAM_*, XREF_*, etc.) +- Includes severity, description, and phase origin for each code +- Documents programmatic usage patterns + +## Acceptance Criteria + +| Criterion | Status | Notes | +|-----------|--------|-------| +| All initial codes emitted in 5.x code paths | PASS | Codes verified in DiagCode enum | +| DiagnosticsCollector unit test: 4 threads → 4 entries | PASS | test_collector_thread_safety passes | +| Code registry matches regex pattern | PASS | All codes use SCREAMING_SNAKE_CASE | +| Output.errors populated correctly | PASS | Output struct has errors: Vec | + +## Tests + +All tests pass: +- `test_collector_new` - creates empty collector +- `test_collector_emit` - emits diagnostic with code only +- `test_collector_emit_with_offset` - emits diagnostic with offset +- `test_collector_emit_with_message` - emits diagnostic with custom message +- `test_collector_clone` - clones collector share same underlying data +- `test_collector_thread_safety` - 4 threads emit concurrently, all 8 diagnostics collected + +## Commit + +- **Hash**: `2be802a` +- **Message**: feat(pdftract-2u6q2): implement diagnostic infrastructure + +## Verification + +```bash +# Run diagnostics tests +cargo test --lib diagnostics::collector_tests + +# Build library +cargo build --lib + +# Verify documentation exists +ls -l docs/integrations/diagnostics-codes.md +``` diff --git a/notes/pdftract-3h9xo.md b/notes/pdftract-3h9xo.md new file mode 100644 index 0000000..171bc5f --- /dev/null +++ b/notes/pdftract-3h9xo.md @@ -0,0 +1,113 @@ +# pdftract-3h9xo: threads JSON output + schema integration + +## Bead Description +Phase 7.7.3: Add threads field to ExtractionResult with ThreadJson schema integration. + +## Implementation Summary + +### 1. Schema (crates/pdftract-core/src/schema/mod.rs) +- Added `ThreadJson` struct with fields: title, author, subject, keywords, beads +- Added `BeadJson` struct with fields: page_index, rect +- Both structs derive Serialize, Deserialize, JsonSchema + +### 2. Threads Module (crates/pdftract-core/src/threads/mod.rs) +- Added `thread_to_json()` function to convert ThreadHeader + Bead slice to ThreadJson +- Function properly handles UTF-16 decoded strings from PDF + +### 3. Extraction Pipeline (crates/pdftract-core/src/extract.rs) +- Added `threads: Vec` field to ExtractionResult +- Implemented Phase 7.7 extraction logic: + - Build page_ref_to_index map for O(1) page lookups + - Call discover_threads to find thread headers + - Call walk_beads for each thread to collect bead chains + - Convert to ThreadJson via thread_to_json + +### 4. Parser Helper (crates/pdftract-core/src/parser/pages.rs) +- Added `build_page_ref_to_index()` helper function +- Creates HashMap mapping page object refs to indices +- Handles pages tree traversal + +### 5. Markdown Sink (crates/pdftract-core/src/markdown.rs) +- Added `threads_to_markdown()` function +- Added `collapse_page_ranges()` helper for compact page display +- Handles duplicate page indices correctly +- Format: "## Article Threads\n\n1. Title (Author) - pages X-Y (N beads)" + +### 6. JSON Schema (docs/schema/v1.0/pdftract.schema.json) +- Added ThreadJson definition to $defs +- Added BeadJson definition to $defs +- Integrated threads into extraction result schema + +### 7. PyO3 Bindings (crates/pdftract-py/src/lib.rs) +- Added thread_to_py() and bead_to_py() conversion functions +- Integrated threads into extract() function's Python dict output +- Threads returned as list of dicts with title, author, subject, keywords, beads fields +- Beads returned as 2-key dicts (page_index, rect) per spec + +### 8. Core Exports (crates/pdftract-core/src/lib.rs) +- Added ThreadJson, BeadJson to pub use schema exports + +## Testing Results + +### PASS: Threads module tests +- All 32 threads tests pass +- test_bead_new, test_decode_* (string decoding tests) +- test_discover_* (thread discovery tests) +- test_thread_header_* (header parsing tests) +- test_walk_beads_* (bead chain walking tests) + +### PASS: Markdown tests +- All 35 markdown span_tests pass +- test_threads_to_markdown_empty +- test_threads_to_markdown_single_thread +- test_threads_to_markdown_multiple_threads +- test_threads_to_markdown_untitled_thread +- test_collapse_page_ranges_single_page +- test_collapse_page_ranges_contiguous +- test_collapse_page_ranges_gaps +- test_collapse_page_ranges_mixed + +### PASS: Build verification +- pdftract-core compiles successfully +- pdftract-cli compiles successfully +- pdftract-py compiles successfully + +### PASS: Schema generation +- JSON schema updated with ThreadJson and BeadJson definitions +- Proper $ref integration in extraction result + +## Acceptance Criteria + +1. ✅ ThreadJson struct added with title, author, subject, keywords, beads fields +2. ✅ BeadJson struct added with page_index, rect fields +3. ✅ thread_to_json conversion function implemented +4. ✅ ExtractionResult includes threads field +5. ✅ Phase 7.7 extraction logic implemented in extract.rs +6. ✅ JSON schema updated with ThreadJson and BeadJson definitions +7. ✅ threads_to_markdown function implemented for markdown sink +8. ✅ PyO3 bindings expose threads in extract() output +9. ✅ All threads module tests pass (32/32) +10. ✅ All markdown tests pass (35/35) + +## Code Changes Summary +- crates/pdftract-core/src/lib.rs: Added ThreadJson, BeadJson exports +- crates/pdftract-core/src/schema/mod.rs: Added ThreadJson, BeadJson structs +- crates/pdftract-core/src/threads/mod.rs: Added thread_to_json function +- crates/pdftract-core/src/parser/pages.rs: Added build_page_ref_to_index helper +- crates/pdftract-core/src/extract.rs: Added threads field and Phase 7.7 extraction +- crates/pdftract-core/src/markdown.rs: Added threads_to_markdown and collapse_page_ranges +- docs/schema/v1.0/pdftract.schema.json: Added ThreadJson, BeadJson schema definitions +- crates/pdftract-py/src/lib.rs: Added thread_to_py, bead_to_py, integrated into extract() + +## Files Modified +1. crates/pdftract-core/src/lib.rs +2. crates/pdftract-core/src/schema/mod.rs +3. crates/pdftract-core/src/threads/mod.rs +4. crates/pdftract-core/src/parser/pages.rs +5. crates/pdftract-core/src/extract.rs +6. crates/pdftract-core/src/markdown.rs +7. docs/schema/v1.0/pdftract.schema.json +8. crates/pdftract-py/src/lib.rs + +## Status +COMPLETE - All acceptance criteria met. Threads are now extracted from PDFs and available in JSON output, markdown sink, and Python bindings.