pdftract/test_debug_pdf.rs
jedarden 225f96c241 fix(pyo3): correct extract_text_fn call in extract_markdown stub
The extract_markdown stub was calling extract_text instead of
extract_text_fn, causing a compilation error. This fixes the
function name to match the exported function from extract_text.rs.

This completes the extract_text PyO3 entry point implementation,
which was already present in extract_text.rs and lib.rs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 20:28:25 -04:00

62 lines
2.1 KiB
Rust

use pdftract_core::parser::xref::load_xref_with_prev_chain;
use pdftract_core::parser::stream::{FileSource, PdfSource};
use std::path::Path;
/// Debug driver: opens a fixed PDF fixture, locates its `startxref` offset,
/// loads the cross-reference data from that offset and prints the trailer.
///
/// # Panics
///
/// Panics if the fixture cannot be opened or if no `startxref` offset can be
/// found in it.
fn main() {
    let fixture =
        Path::new("crates/pdftract-core/tests/document_model/fixtures/ocg_default_off.pdf");

    // Both steps are fatal for a debug run, so fail loudly with a message.
    let source = FileSource::open(fixture).expect("Failed to open PDF file");
    let startxref_offset = find_startxref(&source).expect("Failed to find startxref offset");
    println!("startxref offset: {}", startxref_offset);

    // Load the xref starting at the offset we just located and dump its trailer.
    let xref = load_xref_with_prev_chain(&source, startxref_offset);
    println!("Xref trailer: {:?}", xref.trailer);

    match &xref.trailer {
        None => println!("No trailer found!"),
        Some(trailer) => {
            println!("Trailer keys: {:?}", trailer.keys().collect::<Vec<_>>());
            // The Root entry is what this debug run is checking for.
            match trailer.get("Root") {
                Some(root) => println!("Root: {:?}", root),
                None => println!("No Root key in trailer!"),
            }
        }
    }
}
/// Locates the byte offset that follows the final `startxref` keyword.
///
/// Reads at most the last 1024 bytes of `source`, searches them for the
/// last occurrence of `startxref`, skips the whitespace after the keyword and
/// parses the run of ASCII digits that follows as a `u64`.
///
/// # Errors
///
/// Returns an error if the length query or the read on `source` fails, if the
/// keyword is not present in the tail, or if it is not followed by a decimal
/// number that fits in a `u64`.
fn find_startxref(source: &FileSource) -> Result<u64, Box<dyn std::error::Error>> {
    // Read the last 1KB of the file (or the whole file if it is smaller).
    let file_size = source.len()?;
    let read_size = 1024.min(file_size);
    let read_offset = file_size - read_size;
    let tail = source.read_at(read_offset, read_size as usize)?;

    // The tail may contain arbitrary binary bytes (e.g. stream data in a small
    // file), so strict UTF-8 decoding would reject otherwise valid input.
    // Lossy decoding leaves the ASCII keyword and digits untouched.
    let tail_str = String::from_utf8_lossy(&tail);

    // Use the LAST occurrence: the keyword closest to the end of the file is
    // the authoritative one when the tail contains more than one.
    let keyword_pos = tail_str.rfind("startxref").ok_or("startxref not found")?;
    let after_keyword = tail_str[keyword_pos + "startxref".len()..].trim_start();

    // The offset is the leading run of ASCII digits; it may run to the very
    // end of the buffer when nothing follows it.
    let digits_end = after_keyword
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(after_keyword.len());

    after_keyword[..digits_end]
        .parse::<u64>()
        .map_err(|_| "startxref not found".into())
}