The extract_markdown stub was calling extract_text instead of extract_text_fn, causing a compilation error. This fixes the function name to match the exported function from extract_text.rs. This completes the extract_text PyO3 entry point implementation, which was already present in extract_text.rs and lib.rs. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
66 lines
2.2 KiB
Rust
66 lines
2.2 KiB
Rust
//! Example: Classify PDF document type.
|
|
//!
|
|
//! Demonstrates page-level classification to determine the extraction
|
|
//! path (Vector, Scanned, Hybrid, or BrokenVector). This is useful for
|
|
//! deciding whether OCR is needed and understanding the document's structure.
|
|
//!
|
|
//! Note: Document-type classification (invoice, receipt, etc.) requires the
|
|
//! `profiles` feature. This example shows page-level classification which
|
|
//! is always available.
|
|
//!
|
|
//! Usage:
|
|
//! cargo run --example classify -- tests/fixtures/sample.pdf
|
|
|
|
use anyhow::Result;
|
|
use pdftract_core::{extract_pdf, ExtractionOptions};
|
|
use std::env;
|
|
use std::path::Path;
|
|
use std::collections::HashMap;
|
|
|
|
fn main() -> Result<()> {
|
|
// Get PDF path from command line, or use a default
|
|
let args: Vec<String> = env::args().collect();
|
|
let pdf_path = args.get(1).map(|s| s.as_str()).unwrap_or("tests/fixtures/sample.pdf");
|
|
|
|
// Extract with default options
|
|
let options = ExtractionOptions::default();
|
|
let result = extract_pdf(Path::new(pdf_path), &options)?;
|
|
|
|
// Classify pages by type
|
|
let mut page_types: HashMap<String, usize> = HashMap::new();
|
|
|
|
println!("Page Classification:");
|
|
println!();
|
|
|
|
for page in &result.pages {
|
|
let page_type = page.page_type.as_deref().unwrap_or("unknown");
|
|
|
|
// Count by type
|
|
*page_types.entry(page_type.to_string()).or_insert(0) += 1;
|
|
|
|
println!("Page {}: {}", page.page_number, page_type);
|
|
}
|
|
|
|
// Print summary
|
|
println!();
|
|
println!("Summary:");
|
|
for (ptype, count) in page_types.iter() {
|
|
println!(" {}: {} pages", ptype, count);
|
|
}
|
|
|
|
// Provide guidance based on classification
|
|
println!();
|
|
println!("Extraction Guidance:");
|
|
if page_types.contains_key("scanned") || page_types.contains_key("mixed") {
|
|
println!(" - Consider enabling OCR for scanned/mixed pages");
|
|
println!(" - Use ExtractionOptions {{ ocr_languages: vec![\"eng\".to_string()], ..Default::default() }}");
|
|
}
|
|
if page_types.contains_key("broken_vector") {
|
|
println!(" - Some pages have invisible text; OCR may help");
|
|
}
|
|
if page_types.contains_key("vector") {
|
|
println!(" - Vector text extraction is sufficient");
|
|
}
|
|
|
|
Ok(())
|
|
}
|