Add ProcessingMode enum and process_with_mode function to the Phase 3 content stream processor:

- ProcessingMode::Normal: Extract text with full Unicode resolution
- ProcessingMode::PositionHint: Emit U+FFFD with confidence=0.0, but compute bboxes correctly for use by the 5.5.2 validation filter

PositionHint mode skips the ToUnicode CMap lookup, making it ~10% faster than Normal mode. The text matrix advances identically in both modes.

Unit tests verify:

- Same input PDF, Normal vs PositionHint -> bboxes identical, Unicode differs
- All PositionHint glyphs have unicode=U+FFFD and confidence=0.0
- Text positioning operators (Tm, Td, TD, T*) work correctly

Closes: pdftract-5u7h
74 lines
2.3 KiB
Rust
74 lines
2.3 KiB
Rust
//! pdftract-core — Core PDF parsing and text extraction primitives.
|
|
//!
|
|
//! This crate provides the foundational data structures and parsers for
|
|
//! processing PDF documents, including the lexer, object parser, and
|
|
//! text extraction engines.
|
|
|
|
pub mod attachment;
|
|
pub mod cache;
|
|
pub mod classify;
|
|
pub mod content_stream;
|
|
pub mod diagnostics;
|
|
pub mod document;
|
|
#[cfg(feature = "ocr")]
|
|
pub mod dpi;
|
|
pub mod extract;
|
|
pub mod fingerprint;
|
|
pub mod font;
|
|
pub mod graphics_state;
|
|
#[cfg(feature = "ocr")]
|
|
pub mod hybrid;
|
|
pub mod layout;
|
|
pub mod markdown;
|
|
#[cfg(feature = "ocr")]
|
|
pub mod ocr;
|
|
pub mod options;
|
|
pub mod parser;
|
|
#[cfg(feature = "ocr")]
|
|
pub mod preprocess;
|
|
#[cfg(feature = "profiles")]
|
|
pub mod profiles;
|
|
pub mod receipts;
|
|
#[cfg(feature = "ocr")]
|
|
pub mod render;
|
|
#[cfg(feature = "remote")]
|
|
pub mod url_validation;
|
|
|
|
// Re-export has_full_render for runtime feature detection
// The re-export is compiled only when BOTH the `ocr` and `full-render` Cargo
// features are enabled; without them `has_full_render` is not available at the
// crate root, so callers must gate their use of it on the same features.
|
|
#[cfg(all(feature = "ocr", feature = "full-render"))]
|
|
pub use render::pdfium_path::has_full_render;
|
|
pub mod schema;
|
|
pub mod semaphore;
|
|
pub mod signature;
|
|
pub mod table;
|
|
|
|
// Re-export key types for convenience
|
|
pub use document::{PageExtraction, PageIter, PdfExtractor};
|
|
pub use extract::{
|
|
extract_pdf, extract_pdf_ndjson, ExtractionMetadata, ExtractionResult, PageResult,
|
|
};
|
|
pub use font::std14::{get_std14_metrics, NamedEncoding, Std14Metrics};
|
|
pub use markdown::{block_to_markdown, page_to_markdown, parse_anchors, Anchor};
|
|
pub use options::{ExtractionOptions, ReceiptsMode};
|
|
pub use parser::pages::{count_pages_tree, LazyPageIter, PageDict, DEFAULT_MEDIABOX};
|
|
pub use schema::{BlockJson, CellJson, ExtractionQuality, RowJson, SpanJson, SpanRef, TableJson};
|
|
pub use table::{GridCandidate, PageContext as TablePageContext, TableDetector};
|
|
|
|
// Feature-gated re-exports: every `pub use` in this group carries
// `#[cfg(feature = "ocr")]`, so these names exist at the crate root only when
// the `ocr` Cargo feature is enabled.

// Items re-exported from the `dpi` module.
#[cfg(feature = "ocr")]
|
|
pub use dpi::{select_dpi, FontSizeSpan, Pdf1Filter};
|
|
// Items re-exported from the `hybrid` module.
#[cfg(feature = "ocr")]
|
|
pub use hybrid::{
|
|
compute_cell_crops, compute_iou, crop_cell_from_page, get_hybrid_cells,
|
|
merge_vector_and_ocr_spans, CellCrop, Span, SpanSource,
|
|
};
|
|
// Items re-exported from the `ocr` module.
#[cfg(feature = "ocr")]
|
|
pub use ocr::{
|
|
borrow_or_init, calculate_wer, detect_available_languages, init_count, parse_hocr,
|
|
reset_init_count, run_tesseract, run_tesseract_on_cell, validate_ocr_languages, HocrWord,
|
|
TessOpts,
|
|
};
|
|
// Items re-exported from the `preprocess` module.
#[cfg(feature = "ocr")]
|
|
pub use preprocess::{
|
|
add_border_padding, binarize_otsu, binarize_sauvola, denoise_median, deskew,
|
|
normalize_contrast, preprocess, ImageSource,
|
|
};
|