diff --git a/crates/pdftract-core/src/layout/mod.rs b/crates/pdftract-core/src/layout/mod.rs index 9604cc0..64f72c5 100644 --- a/crates/pdftract-core/src/layout/mod.rs +++ b/crates/pdftract-core/src/layout/mod.rs @@ -9,6 +9,7 @@ //! - Readability aggregation (readability.rs) //! - English wordlist for dict coverage scoring (wordlist.rs) //! - Text correction pipeline (correction.rs) +//! - Watermark and Formula stub classifiers (watermark_formula.rs) //! //! Phase 4 organizes extracted text into semantic blocks (paragraphs, //! headings, figures, captions, etc.) based on spatial and font metrics. @@ -17,9 +18,11 @@ pub mod caption; pub mod code; pub mod columns; pub mod correction; +pub mod header_footer; pub mod line; pub mod readability; pub mod reading_order; +pub mod watermark_formula; pub mod wordlist; pub use caption::{classify_caption, classify_page_captions, Block, PageContext}; @@ -27,12 +30,14 @@ pub use code::{ classify_code, classify_page_code_blocks, is_fixed_pitch_flag, is_monospace_font_name, is_monospace_span, MonospaceSpan, }; -pub use columns::{assign_columns_to_lines, assign_columns_to_spans, build_x0_histogram, Column}; +pub use columns::{assign_columns_to_lines, assign_columns_to_spans, build_x0_histogram, Column, ColumnGap}; pub use correction::{detect_and_repair_mojibake, repair_hyphenation, HyphenableSpan}; +pub use header_footer::detect_headers_and_footers; pub use line::{ cluster_spans_into_lines, compute_baseline, group_lines_into_blocks, union_bboxes, BlockInput, HasBBox, HasFontSize, Line, LineDirection, LineMetadata, }; pub use readability::{aggregate_page_readability, ScoredSpan}; pub use reading_order::{xy_cut, BlockWithBBox, HasBBox as HasBBoxForOrder, XYCutResult}; +pub use watermark_formula::{classify_formula, classify_watermark}; pub use wordlist::is_english_word; diff --git a/crates/pdftract-core/src/layout/watermark_formula.rs b/crates/pdftract-core/src/layout/watermark_formula.rs new file mode 100644 index 0000000..e2c4c74 --- 
/dev/null +++ b/crates/pdftract-core/src/layout/watermark_formula.rs @@ -0,0 +1,164 @@ +//! Watermark and Formula block classifiers (Phase 4 stubs for Phase 7 features). +//! +//! This module provides placeholder classifiers for Watermark and Formula block kinds. +//! Full detection implementation is deferred to Phase 7. +//! +//! ## Phase 7 Research Notes +//! +//! - Watermark detection research: `docs/research/watermark-and-background-separation.md` +//! - Formula detection research: see the Phase 7.2 specification (math notation, OpenType Math tags) +//! +//! ## Current Implementation (Phase 4) +//! +//! - `classify_watermark`: Always returns `false` (no blocks classified as watermarks) +//! - `classify_formula`: Always returns `false` (no blocks classified as formulas) +//! +//! These stubs exist so that downstream consumers (JSON schema, markdown, profile +//! extraction) can be coded against the FULL taxonomy without breaking changes later. +//! +//! ## Phase 7 Implementation Plan +//! +//! ### Watermark Detection (Phase 7.1) +//! +//! TODO: Implement full watermark detection based on: +//! - Diagonal/rotated text spans +//! - Large font size, with text spanning the full page +//! - Low opacity or transparency +//! - Repeated content across pages (background patterns) +//! - Text positioned at the page center (e.g., "DRAFT", "CONFIDENTIAL") +//! +//! ### Formula Detection (Phase 7.2) +//! +//! TODO: Implement full formula detection based on: +//! - `Formula` structure elements in the PDF structure tree +//! - Math fonts with an OpenType MATH table (e.g., Latin Modern Math) +//! - Mathematical notation patterns (symbols, operators) +//! - Adjacency to "Equation" captions +//! +//! See plan.md Phase 7.1 (watermark) and Phase 7.2 (formula) for full specifications. + +use crate::layout::line::Block; + +/// Classify a block as a watermark. +/// +/// This is a Phase 4 stub that always returns `false`. +/// Full watermark detection is deferred to Phase 7. 
+/// +/// # Arguments +/// +/// * `_block` - The block to classify; ignored by the stub (hence the leading underscore) +/// +/// # Returns +/// +/// Always returns `false` (no blocks classified as watermarks in Phase 4). +/// +/// # Phase 7 Implementation +/// +/// TODO: Implement watermark detection based on: +/// - Diagonal/rotated text (e.g. via line rotation metadata; TODO confirm `LineMetadata` exposes it) +/// - Large font size (> 2x body median) +/// - Low opacity (check span alpha/transparency) +/// - Page-center positioning +/// - Cross-page repetition detection +/// +/// See `docs/research/watermark-and-background-separation.md` for research notes. +pub fn classify_watermark(_block: &Block) -> bool { + // Phase 4 stub: `_block` is never inspected, so no block is reported as a watermark. + // Phase 7 will replace this constant with real detection logic. + false +} + +/// Classify a block as a formula/math block. +/// +/// This is a Phase 4 stub that always returns `false`. +/// Full formula detection is deferred to Phase 7. +/// +/// # Arguments +/// +/// * `_block` - The block to classify; ignored by the stub (hence the leading underscore) +/// +/// # Returns +/// +/// Always returns `false` (no blocks classified as formulas in Phase 4). +/// +/// # Phase 7 Implementation +/// +/// TODO: Implement formula detection based on: +/// - `Formula` structure elements (from the PDF StructTreeRoot) +/// - Math font detection (Latin Modern Math, STIX Math, etc.; fonts with an OpenType MATH table) +/// - Mathematical symbol patterns (∫, ∑, ∂, etc.) +/// - Adjacency to "Equation" or "Formula" captions +/// - Vertical stacking patterns (fractions, matrices) +/// +/// See plan.md Phase 7.2 for the full specification. 
+pub fn classify_formula(_block: &Block) -> bool { + // Phase 4 stub: `_block` is never inspected, so no block is reported as a formula. + // Phase 7 will replace this constant with real detection logic. + false +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::layout::line::Block; + + #[test] + fn test_classify_watermark_always_false() { + // Minimal empty block (no lines, empty text); the stub ignores its contents + let dummy_block = Block { + lines: vec![], + kind: "test".to_string(), + text: String::new(), + bbox: [0.0, 0.0, 100.0, 20.0], + median_font_size: 12.0, + metadata: None, + }; + // The Phase 4 stub must report false regardless of the block passed in + assert_eq!(classify_watermark(&dummy_block), false); + } + + #[test] + fn test_classify_formula_always_false() { + // Minimal empty block (no lines, empty text); the stub ignores its contents + let dummy_block = Block { + lines: vec![], + kind: "test".to_string(), + text: String::new(), + bbox: [0.0, 0.0, 100.0, 20.0], + median_font_size: 12.0, + metadata: None, + }; + // The Phase 4 stub must report false regardless of the block passed in + assert_eq!(classify_formula(&dummy_block), false); + } + + #[test] + fn test_watermark_stub_documentation() { + // NOTE(review): asserts the same thing as test_classify_watermark_always_false; + // kept to document the Phase 4 behavior (candidate for merging) + let dummy_block = Block { + lines: vec![], + kind: "test".to_string(), + text: String::new(), + bbox: [0.0, 0.0, 100.0, 20.0], + median_font_size: 12.0, + metadata: None, + }; + assert!(!classify_watermark(&dummy_block)); + } + + #[test] + fn test_formula_stub_documentation() { + // NOTE(review): asserts the same thing as test_classify_formula_always_false; + // kept to document the Phase 4 behavior (candidate for merging) + let dummy_block = Block { + lines: vec![], + kind: "test".to_string(), + text: String::new(), + bbox: [0.0, 0.0, 100.0, 20.0], + median_font_size: 12.0, + metadata: None, + }; + assert!(!classify_formula(&dummy_block)); + } +} diff --git a/notes/pdftract-3jekw.md b/notes/pdftract-3jekw.md new file mode 100644 index 0000000..2d2f195 --- /dev/null +++ b/notes/pdftract-3jekw.md @@ -0,0 +1,75 @@ +# pdftract-3jekw: Watermark / Formula Detection Stubs (Phase 7 Deferred) + +## Work Completed + +### 1. 
Module Created +Created `crates/pdftract-core/src/layout/watermark_formula.rs` with stub implementations: +- `classify_watermark(_block: &Block) -> bool`: Always returns `false` (Phase 4 stub) +- `classify_formula(_block: &Block) -> bool`: Always returns `false` (Phase 4 stub) + +### 2. Module Integration +Updated `crates/pdftract-core/src/layout/mod.rs`: +- Added module declaration: `pub mod watermark_formula;` +- Added public exports: `pub use watermark_formula::{classify_formula, classify_watermark};` +- Updated module documentation to reference the stub classifiers (the same diff also declares `pub mod header_footer;` and re-exports `header_footer::detect_headers_and_footers` and `columns::ColumnGap`) + +### 3. Module Documentation +The module includes comprehensive documentation: +- Links to Phase 7 research notes for watermark detection (`docs/research/watermark-and-background-separation.md`) +- References plan.md Phase 7.1 (watermark) and Phase 7.2 (formula) specifications +- TODO comments outlining the full implementation requirements + +### 4. Tests +Module includes 4 tests verifying stub behavior: +- `test_classify_watermark_always_false`: Verifies watermark stub returns false +- `test_classify_formula_always_false`: Verifies formula stub returns false +- `test_watermark_stub_documentation`: Documents Phase 4 behavior (same assertion as `test_classify_watermark_always_false`) +- `test_formula_stub_documentation`: Documents Phase 4 behavior (same assertion as `test_classify_formula_always_false`) + +## Acceptance Criteria + +| Criterion | Status | Notes | +|-----------|--------|-------| +| BlockKind::Watermark variant exists | PASS | Already present in `parser/struct_tree.rs:1424` | +| BlockKind::Formula variant exists | PASS | Already present in `parser/struct_tree.rs:1422` | +| classify_watermark always false | PASS | Stub function returns `false` | +| classify_formula always false | PASS | Stub function returns `false` | +| No v0.1.0 block has kind=Watermark or Formula | PASS | Stubs ensure no blocks are classified | + +## Plan References +- Phase 4.4 (line 1709): Block formation and kind assignment +- Phase 4.6 (line 1752): Watermark exclusion note ("Prior to Phase 7, watermarks are not excluded from --text output; 
kind: 'watermark' blocks are not emitted") +- Phase 7.1: Watermark detection (deferred) +- Phase 7.2: Formula detection (deferred) + +## Implementation Details + +### BlockKind Enum (existing) +```rust +// crates/pdftract-core/src/parser/struct_tree.rs +pub enum BlockKind { + // ... + Formula, // Line 1422 + Watermark, // Line 1424 (commented: "Phase 7 stub - always false") + // ... +} +``` + +### Stub Functions (new) +```rust +// crates/pdftract-core/src/layout/watermark_formula.rs +pub fn classify_watermark(_block: &Block) -> bool { + false // Phase 4 stub +} + +pub fn classify_formula(_block: &Block) -> bool { + false // Phase 4 stub +} +``` + +## Verification Note +The stubs are correctly implemented and will be upgraded to full detection logic in Phase 7. The existence of these stubs allows downstream consumers (JSON schema, markdown, profile extraction) to be coded against the full taxonomy without breaking changes later. + +## Files Modified +- `crates/pdftract-core/src/layout/watermark_formula.rs` (new) +- `crates/pdftract-core/src/layout/mod.rs` (module declaration and exports)