pdftract/crates/pdftract-core/examples/hash.rs
jedarden 225f96c241 fix(pyo3): correct extract_text_fn call in extract_markdown stub
The extract_markdown stub was calling extract_text instead of
extract_text_fn, causing a compilation error. This fixes the
function name to match the exported function from extract_text.rs.

This completes the extract_text PyO3 entry point implementation,
which was already present in extract_text.rs and lib.rs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 20:28:25 -04:00

95 lines
3.5 KiB
Rust

//! Example: Compute PDF structural fingerprint.
//!
//! Demonstrates fingerprint computation for PDF document identification.
//! The fingerprint is a reproducible 256-bit hash that identifies the
//! semantic content independent of metadata churn.
//!
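//! Usage:
//!
//! ```text
//! cargo run --example hash -- tests/fixtures/sample.pdf
//! ```
//!
//! The path argument is optional; when it is omitted,
//! `tests/fixtures/sample.pdf` is used.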
use anyhow::Result;
use pdftract_core::fingerprint::{
compute_fingerprint, ContentStreamData, FingerprintInput, PageFingerprintData,
};
use pdftract_core::parser::catalog::parse_catalog;
use pdftract_core::parser::pages::flatten_page_tree;
use pdftract_core::parser::stream::{FileSource, PdfSource};
use pdftract_core::parser::xref::{load_xref_with_prev_chain, XrefResolver};
use std::env;
use std::path::Path;
/// Input used when no path is supplied on the command line.
const DEFAULT_PDF: &str = "tests/fixtures/sample.pdf";

/// Number of trailing bytes of the file searched for the `startxref` keyword.
const TAIL_SCAN_LEN: u64 = 1024;

/// Entry point: prints the structural fingerprint of a PDF to stdout.
///
/// The PDF path is taken from the first command-line argument and falls back
/// to [`DEFAULT_PDF`] when none is given.
///
/// # Errors
///
/// Returns an error when the file cannot be opened or read, when no usable
/// `startxref` offset is found in the file tail, when the trailer has no
/// `/Root` entry, or when catalog / page-tree parsing reports diagnostics.
fn main() -> Result<()> {
    // `args_os` instead of `args`: the argument is only ever used as a path,
    // and `env::args` panics on arguments that are not valid Unicode.
    let cli_path = env::args_os().nth(1);
    let pdf_path = cli_path
        .as_deref()
        .map(Path::new)
        .unwrap_or_else(|| Path::new(DEFAULT_PDF));

    // Open the PDF.
    let source = FileSource::open(pdf_path)?;

    // Read the tail of the file. Clamp in u64 so the file length is never
    // narrowed to `usize` before the comparison.
    let source_len = source.len()?;
    let tail_len = source_len.min(TAIL_SCAN_LEN);
    let tail_start = source_len - tail_len;
    // Lossless cast: `tail_len` is at most `TAIL_SCAN_LEN`.
    let tail_data = source.read_at(tail_start, tail_len as usize)?;
    let startxref_offset = parse_startxref_offset(&tail_data[..])?;

    // Load the xref data and locate the document catalog.
    let xref_section = load_xref_with_prev_chain(&source, startxref_offset);
    let resolver = XrefResolver::from_section(xref_section.clone());
    let root_ref = xref_section
        .trailer
        .as_ref()
        .and_then(|t| t.get("Root"))
        .and_then(|o| o.as_ref())
        .ok_or_else(|| anyhow::anyhow!("No /Root in trailer"))?;
    let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn PdfSource)).map_err(
        |diags| {
            anyhow::anyhow!(
                "Catalog parse failed: {}",
                diags.first().map(|d| d.message.as_ref()).unwrap_or("unknown")
            )
        },
    )?;

    // Flatten the page tree into a list of pages.
    let pages = flatten_page_tree(&resolver, catalog.pages_ref).map_err(|diags| {
        anyhow::anyhow!(
            "Page tree parse failed: {}",
            diags.first().map(|d| d.message.as_ref()).unwrap_or("unknown")
        )
    })?;

    // Build the fingerprint input. The page count is converted with a check
    // rather than an `as` cast so an oversized count is reported, not wrapped.
    let page_count = u32::try_from(pages.len())
        .map_err(|_| anyhow::anyhow!("Page count {} does not fit in u32", pages.len()))?;
    let fingerprint_pages = pages
        .iter()
        .map(|page| PageFingerprintData {
            content_streams: page
                .contents
                .iter()
                .map(|&r| ContentStreamData::Indirect(r))
                .collect(),
            resources: None,
            media_box: page.media_box,
            crop_box: page.crop_box,
            rotate: page.rotate,
        })
        .collect();
    let fingerprint_input = FingerprintInput {
        page_count,
        pages: fingerprint_pages,
        struct_tree_root_ref: catalog.struct_tree_root_ref,
        is_tagged: catalog.mark_info.is_tagged,
        catalog_flags: Default::default(),
    };

    // Compute and print the fingerprint.
    let fingerprint =
        compute_fingerprint(&fingerprint_input, &resolver, Some(&source as &dyn PdfSource));
    println!("{}", fingerprint);
    Ok(())
}

/// Extracts the byte offset that follows the last `startxref` keyword in `tail`.
///
/// Only the first whitespace-delimited token after the keyword is decoded, so
/// bytes that come later in the tail (for example after `%%EOF`) cannot make
/// the lookup fail.
///
/// # Errors
///
/// Fails when the keyword is absent, when nothing but whitespace follows it,
/// or when the token is not a valid `u64`.
fn parse_startxref_offset(tail: &[u8]) -> Result<u64> {
    const KEYWORD: &[u8] = b"startxref";

    // Search from the end: the last occurrence in the tail is the one used.
    let keyword_pos = tail
        .windows(KEYWORD.len())
        .rposition(|w| w == KEYWORD)
        .ok_or_else(|| anyhow::anyhow!("startxref not found"))?;

    // First non-empty run of non-whitespace bytes after the keyword.
    let token = tail[keyword_pos + KEYWORD.len()..]
        .split(|b| b.is_ascii_whitespace())
        .find(|t| !t.is_empty())
        .ok_or_else(|| anyhow::anyhow!("No offset after startxref"))?;

    std::str::from_utf8(token)
        .map_err(|_| anyhow::anyhow!("Invalid UTF-8 in startxref"))?
        .parse()
        .map_err(|_| anyhow::anyhow!("Invalid startxref offset"))
}