pdftract/test_stream_decode.rs
jedarden 225f96c241 fix(pyo3): correct extract_text_fn call in extract_markdown stub
The extract_markdown stub was calling extract_text instead of
extract_text_fn, causing a compilation error. This fixes the
function name to match the exported function from extract_text.rs.

This completes the extract_text PyO3 entry point implementation,
which was already present in extract_text.rs and lib.rs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 20:28:25 -04:00

132 lines
4.1 KiB
Rust

use pdftract_core::parser::lexer::Lexer;
use std::env;
use std::fs::File;
use std::io::Read;
use std::path::Path;
/// Decompresses the payload of a PDF `FlateDecode` stream.
///
/// PDF `FlateDecode` data is stored in the zlib container format (RFC 1950:
/// a 2-byte header, the deflate body, and an Adler-32 trailer), so the input
/// is decoded as zlib first. If that fails, the input is retried as a raw
/// deflate stream (RFC 1951) so that header-less payloads, which this
/// function accepted before, keep working.
///
/// # Errors
///
/// Returns `Err` with a `"Decompression failed: ..."` message carrying the
/// zlib decoder's error when the data can be decoded neither as zlib nor as
/// raw deflate.
fn decode_flate(data: &[u8]) -> Result<Vec<u8>, String> {
    use flate2::read::{DeflateDecoder, ZlibDecoder};
    use std::io::Read;

    let mut decompressed = Vec::new();
    match ZlibDecoder::new(data).read_to_end(&mut decompressed) {
        Ok(_) => Ok(decompressed),
        Err(zlib_err) => {
            // Discard any partial output the failed zlib attempt produced
            // before retrying with the raw-deflate decoder.
            decompressed.clear();
            DeflateDecoder::new(data)
                .read_to_end(&mut decompressed)
                .map_err(|_| format!("Decompression failed: {}", zlib_err))?;
            Ok(decompressed)
        }
    }
}
/// Locates the first stream object in `pdf_data` and inflates its payload.
///
/// The payload starts right after the first `stream` keyword that is followed
/// by an end-of-line marker (`\n` or `\r\n`, both of which the PDF format
/// permits) and ends at the next `endstream` keyword. Occurrences of `stream`
/// that are merely the tail of `endstream` are skipped so they are never
/// mistaken for the start of a stream.
///
/// Returns `None` when no such keyword pair exists or when decompression
/// fails; in the latter case the error is reported on stderr.
///
/// Limitations: the stream dictionary is not consulted, so the payload is
/// always treated as Flate-compressed and `/Length` is ignored — the payload
/// is cut at the first `endstream` byte sequence.
fn find_and_decode_stream(pdf_data: &[u8]) -> Option<Vec<u8>> {
    const STREAM_KEYWORD: &[u8] = b"stream";
    const END_KEYWORD: &[u8] = b"endstream";

    let start = pdf_data
        .windows(STREAM_KEYWORD.len())
        .enumerate()
        .find_map(|(pos, window)| {
            // Reject non-matches and the "stream" inside "endstream".
            if window != STREAM_KEYWORD || pdf_data[..pos].ends_with(b"end") {
                return None;
            }
            let after = pos + STREAM_KEYWORD.len();
            let rest = &pdf_data[after..];
            if rest.starts_with(b"\r\n") {
                Some(after + 2)
            } else if rest.starts_with(b"\n") {
                Some(after + 1)
            } else {
                None
            }
        })?;

    let end = pdf_data[start..]
        .windows(END_KEYWORD.len())
        .position(|window| window == END_KEYWORD)?
        + start;
    let compressed = &pdf_data[start..end];

    // Any end-of-line bytes between the payload and `endstream` are left in
    // `compressed`; they sit after the end of the compressed data.
    match decode_flate(compressed) {
        Ok(decompressed) => Some(decompressed),
        Err(e) => {
            eprintln!("Decompression error: {}", e);
            None
        }
    }
}
/// Re-serializes `bytes` as a sequence of tokens separated by single spaces.
///
/// The input is tokenized with `Lexer`; every token up to (but excluding) the
/// first `Eof` token, or until the lexer yields `None`, is written with
/// `serialize_token`. A single `b' '` is inserted before every token except
/// the first. Empty input yields an empty vector without running the lexer.
fn normalize_content(bytes: &[u8]) -> Vec<u8> {
    let mut normalized = Vec::new();
    if bytes.is_empty() {
        return normalized;
    }

    let mut lexer = Lexer::new(bytes);
    // Number of tokens written so far; decides whether a separator is due.
    let mut emitted = 0usize;
    loop {
        match lexer.next_token() {
            None | Some(pdftract_core::parser::lexer::Token::Eof) => break,
            Some(token) => {
                if emitted > 0 {
                    normalized.push(b' ');
                }
                serialize_token(&mut normalized, &token);
                emitted += 1;
            }
        }
    }
    normalized
}
/// Appends the textual form of `token` to `output`.
///
/// * Booleans, `null`, and the structural keywords are written as their fixed
///   spellings (`true`, `[`, `<<`, `endobj`, `R`, ...).
/// * Integers use their decimal `to_string` form; reals are always written
///   with exactly six digits after the decimal point.
/// * Strings are wrapped in parentheses, with `(`, `)` and `\` inside the
///   payload preceded by a backslash.
/// * Names are written as `/` followed by the name bytes unchanged.
/// * Keywords are written as their raw bytes; `Eof` writes nothing.
fn serialize_token(output: &mut Vec<u8>, token: &pdftract_core::parser::lexer::Token) {
    use pdftract_core::parser::lexer::Token;

    // Tokens with a fixed spelling resolve to a literal that is appended once
    // at the bottom; tokens carrying a payload write it and return early.
    let literal: &[u8] = match token {
        Token::Integer(value) => {
            output.extend_from_slice(value.to_string().as_bytes());
            return;
        }
        Token::Real(value) => {
            output.extend_from_slice(format!("{:.6}", value).as_bytes());
            return;
        }
        Token::String(bytes) => {
            output.push(b'(');
            for &byte in bytes.as_ref() {
                if matches!(byte, b'(' | b')' | b'\\') {
                    output.push(b'\\');
                }
                output.push(byte);
            }
            output.push(b')');
            return;
        }
        Token::Name(bytes) => {
            output.push(b'/');
            output.extend_from_slice(bytes);
            return;
        }
        Token::Keyword(bytes) => {
            output.extend_from_slice(bytes);
            return;
        }
        Token::Bool(true) => b"true",
        Token::Bool(false) => b"false",
        Token::Null => b"null",
        Token::ArrayStart => b"[",
        Token::ArrayEnd => b"]",
        Token::DictStart => b"<<",
        Token::DictEnd => b">>",
        Token::Stream => b"stream",
        Token::EndStream => b"endstream",
        Token::Obj => b"obj",
        Token::EndObj => b"endobj",
        Token::IndirectRef => b"R",
        Token::Eof => b"",
    };
    output.extend_from_slice(literal);
}
/// Command-line entry point: `<program> <pdf-path>`.
///
/// Reads the whole file, decodes the first stream found in it, and prints the
/// decoded bytes followed by their normalized token form (as lossy UTF-8 text
/// and as a byte list) to stdout. Usage and failure messages go to stderr, and
/// the function simply returns in those cases.
fn main() {
    let args: Vec<String> = env::args().collect();
    let pdf_path = match args.get(1) {
        Some(arg) => Path::new(arg),
        None => {
            eprintln!("Usage: {} <pdf-path>", args[0]);
            return;
        }
    };

    let mut pdf_data = Vec::new();
    let read_outcome = File::open(pdf_path).and_then(|mut file| file.read_to_end(&mut pdf_data));
    if let Err(err) = read_outcome {
        eprintln!("Failed to read PDF: {}", err);
        return;
    }

    let decoded = match find_and_decode_stream(&pdf_data) {
        Some(stream_bytes) => stream_bytes,
        None => {
            eprintln!("Failed to find/decode stream");
            return;
        }
    };

    println!("Decoded stream bytes:");
    println!("{:?}", decoded);
    println!();

    let normalized = normalize_content(&decoded);
    println!("Normalized content:");
    println!("{}", String::from_utf8_lossy(&normalized));
    println!("Normalized bytes:");
    println!("{:?}", normalized);
}