Fix the critical 5x3 bordered table test to match acceptance criteria (5 rows × 3 columns = row_ys.len() == 6, col_xs.len() == 4). Add missing unit tests: - test_detect_nested_rectangles: tests handling of nested rectangles - test_detect_disjoint_tables: tests detection of multiple disjoint tables Add Criterion benchmark for table detection performance. Results: ~772 µs for 1000 segments (well under 5 ms requirement). All 35 table module tests pass. Acceptance criteria: - ✅ Detector emits GridCandidate for every closed grid of >= 4 cells - ✅ Critical test: 5x3 bordered table with row_ys.len()==6, col_xs.len()==4 - ✅ Unit tests: single rectangle, nested rectangles, mixed text+rules, glyph-path noise - ✅ Public TableDetector::detect_line_based(&PageContext) -> Vec<GridCandidate> - ✅ Benchmark: < 5 ms on 1000-segment page Refs: pdftract-88sk, plan section 7.2 line 2571 Co-Authored-By: Claude Code <noreply@anthropic.com>
46 lines
1.6 KiB
Rust
46 lines
1.6 KiB
Rust
//! pdftract-core — Core PDF parsing and text extraction primitives.
|
|
//!
|
|
//! This crate provides the foundational data structures and parsers for
|
|
//! processing PDF documents, including the lexer, object parser, and
|
|
//! text extraction engines.
|
|
|
|
pub mod cache;
|
|
pub mod classify;
|
|
pub mod diagnostics;
|
|
#[cfg(feature = "ocr")]
|
|
pub mod dpi;
|
|
pub mod document;
|
|
#[cfg(feature = "ocr")]
|
|
pub mod preprocess;
|
|
pub mod extract;
|
|
pub mod fingerprint;
|
|
pub mod font;
|
|
pub mod graphics_state;
|
|
#[cfg(feature = "ocr")]
|
|
pub mod hybrid;
|
|
pub mod options;
|
|
pub mod parser;
|
|
pub mod receipts;
|
|
#[cfg(feature = "ocr")]
|
|
pub mod render;
|
|
|
|
// Re-export has_full_render for runtime feature detection
|
|
#[cfg(all(feature = "ocr", feature = "full-render"))]
|
|
pub use render::pdfium_path::has_full_render;
|
|
pub mod schema;
|
|
pub mod semaphore;
|
|
pub mod table;
|
|
|
|
// Re-export key types for convenience
|
|
pub use document::{PdfExtractor, PageIter, PageExtraction};
|
|
pub use extract::{extract_pdf, extract_pdf_ndjson, ExtractionResult, PageResult, ExtractionMetadata};
|
|
pub use font::std14::{Std14Metrics, NamedEncoding, get_std14_metrics};
|
|
pub use options::{ExtractionOptions, ReceiptsMode};
|
|
pub use parser::pages::{LazyPageIter, PageDict, DEFAULT_MEDIABOX, count_pages_tree};
|
|
pub use schema::{SpanJson, BlockJson, ExtractionQuality};
|
|
pub use table::{TableDetector, PageContext as TablePageContext, GridCandidate};
|
|
|
|
#[cfg(feature = "ocr")]
|
|
pub use dpi::{Pdf1Filter, FontSizeSpan, select_dpi};
|
|
#[cfg(feature = "ocr")]
|
|
pub use hybrid::{Span, SpanSource, compute_iou, merge_vector_and_ocr_spans, crop_cell_from_page, get_hybrid_cells, compute_cell_crops, CellCrop};
|