pdftract/crates/pdftract-py/src/extract_text.rs
jedarden bb7146cffe fix(pdftract-2uk9z): wrap native module results in typed Python objects
The native PyO3 module returns raw dicts via pythonize, but the Python SDK
API expects typed dataclass objects (Document, Page, Metadata, etc.) to be
consistent with the subprocess fallback and test expectations.

Updated wrapper functions in __init__.py to convert native results:
- extract(): wraps dict in Document.from_dict()
- extract_stream(): wraps yielded page dicts in Page.from_dict()
- get_metadata(): wraps dict in Metadata()
- hash(): wraps string in Fingerprint.from_string()
- classify(): wraps dict in Classification()
- search(): wraps yielded match dicts in Match

The native PyO3 entry points (extract, extract_text, extract_stream) were
already implemented with:
- extract: uses extract_pdf + pythonize for PyDict conversion
- extract_text: uses extract_text for plain String return
- extract_stream: uses extract_pdf_streaming with custom StreamIterator

All kwargs parsing with strict validation (unknown kwargs raise TypeError)
was already in place.

Acceptance criteria:
- pdftract.extract() returns Document object with pages/metadata
- pdftract.extract_text() returns plain text string
- pdftract.extract_stream() yields Page objects
- Unknown kwarg raises TypeError
2026-05-28 21:18:38 -04:00

310 lines
11 KiB
Rust

//! Python extract_text() entry point using PyO3.
//!
//! This module provides the extract_text() function that returns plain text
//! from a PDF, with kwargs parsing into ExtractionOptions, GIL release during
//! extraction, and direct String return (no intermediate dict).
use pyo3::prelude::*;
use pyo3::types::PyDict;
use std::path::Path;
use pdftract_core::{extract_text, ExtractionOptions};
use pdftract_core::options::ReceiptsMode;
/// Keyword-argument names accepted by `parse_kwargs`; any other key is rejected.
///
/// The list is joined with ", " and shown verbatim in the
/// "Unknown keyword argument" error message, so its order is user-visible.
///
/// Some entries (`ocr`, `extract_forms`, `extract_attachments`,
/// `readability_threshold`, `cache_dir`, `formats`) are only type-checked by
/// `parse_kwargs` and do not change the resulting options.
const ALLOWED_KWARGS: &[&str] = &[
"ocr",
"ocr_language",
"include_invisible",
"extract_forms",
"extract_attachments",
"readability_threshold",
"password",
"max_decompress_gb",
"full_render",
"receipts",
"cache_dir",
"pages",
"formats",
];
/// Build an `ExtractionOptions` from the keyword arguments supplied by Python.
///
/// Starts from `ExtractionOptions::default()` and overrides only the fields
/// for which a kwarg is present. Passing `None` returns the defaults untouched.
///
/// Validation is strict: every key is checked against `ALLOWED_KWARGS` before
/// any value is read, so a misspelled option fails fast instead of being
/// silently dropped.
///
/// Several kwargs are accepted purely for API compatibility: their value is
/// type-checked and then discarded (`ocr`, `extract_forms`,
/// `extract_attachments`, `readability_threshold`, `cache_dir`, `formats`).
///
/// # Errors
///
/// * `TypeError` - a key is not in `ALLOWED_KWARGS`, or `ocr_language` is
///   neither a list of strings nor a string.
/// * `ValueError` - `receipts` is rejected by `ReceiptsMode::from_str`.
/// * Any conversion error PyO3 raises when a value does not match the Rust
///   type it is extracted into.
fn parse_kwargs(kwargs: Option<&PyDict>) -> PyResult<ExtractionOptions> {
    use pyo3::exceptions::{PyTypeError, PyValueError};

    /// Multiplier turning the user-facing GiB limit into bytes.
    const BYTES_PER_GIB: u64 = 1024 * 1024 * 1024;

    let mut options = ExtractionOptions::default();
    let Some(dict) = kwargs else {
        return Ok(options);
    };

    // Reject unknown names up front, before touching any value.
    for key in dict.keys() {
        let name = key.extract::<String>()?;
        let is_known = ALLOWED_KWARGS.iter().any(|allowed| *allowed == name.as_str());
        if !is_known {
            return Err(PyTypeError::new_err(format!(
                "Unknown keyword argument '{}'. Allowed: {}",
                name,
                ALLOWED_KWARGS.join(", ")
            )));
        }
    }

    // ocr: type-checked only. OCR is governed by the 'ocr' feature flag in
    // pdftract-core, so the value itself has no effect.
    if let Some(value) = dict.get_item("ocr")? {
        value.extract::<bool>()?;
    }

    // ocr_language: either list[str], or one comma-separated string.
    if let Some(value) = dict.get_item("ocr_language")? {
        options.ocr_language = match value.extract::<Vec<String>>() {
            Ok(languages) => languages,
            Err(_) => match value.extract::<String>() {
                // Trim each piece and drop the empty ones ("eng, ,deu" -> eng, deu).
                Ok(joined) => joined
                    .split(',')
                    .map(str::trim)
                    .filter(|piece| !piece.is_empty())
                    .map(String::from)
                    .collect(),
                Err(_) => {
                    return Err(PyTypeError::new_err(
                        "ocr_language must be a list of strings or a comma-separated string",
                    ));
                }
            },
        };
    }

    // include_invisible -> output.include_invisible
    if let Some(value) = dict.get_item("include_invisible")? {
        options.output.include_invisible = value.extract()?;
    }

    // password -> wrapped in a SecretString
    if let Some(value) = dict.get_item("password")? {
        let plain = value.extract::<String>()?;
        options.password = Some(secrecy::SecretString::new(plain.into()));
    }

    // max_decompress_gb -> byte budget; saturates rather than overflowing.
    if let Some(value) = dict.get_item("max_decompress_gb")? {
        options.max_decompress_bytes = value.extract::<u64>()?.saturating_mul(BYTES_PER_GIB);
    }

    // pages -> stored as the raw string
    if let Some(value) = dict.get_item("pages")? {
        options.pages = Some(value.extract()?);
    }

    // extract_forms: type-checked only; forms are always extracted.
    if let Some(value) = dict.get_item("extract_forms")? {
        value.extract::<bool>()?;
    }

    // extract_attachments: type-checked only; attachments are always extracted.
    if let Some(value) = dict.get_item("extract_attachments")? {
        value.extract::<bool>()?;
    }

    // readability_threshold: type-checked only; not yet implemented in pdftract-core.
    if let Some(value) = dict.get_item("readability_threshold")? {
        value.extract::<f64>()?;
    }

    // full_render -> full_render
    if let Some(value) = dict.get_item("full_render")? {
        options.full_render = value.extract()?;
    }

    // receipts -> ReceiptsMode, parse failures surface as ValueError.
    if let Some(value) = dict.get_item("receipts")? {
        let mode_name = value.extract::<String>()?;
        options.receipts = ReceiptsMode::from_str(&mode_name).map_err(PyValueError::new_err)?;
    }

    // cache_dir: type-checked only; not yet implemented in pdftract-core.
    if let Some(value) = dict.get_item("cache_dir")? {
        value.extract::<String>()?;
    }

    // formats: type-checked only; output format selection is not yet implemented.
    if let Some(value) = dict.get_item("formats")? {
        value.extract::<Vec<String>>()?;
    }

    Ok(options)
}
/// Extract plain text from a PDF, returning a String.
///
/// This is the fast path for RAG ingest pipelines that just want the text body.
/// It returns a bare String, avoiding the cost of serializing the full Document
/// to JSON and re-parsing in Python.
///
/// This function is wrapped by `#[pyfunction]` in lib.rs; do not add the attribute here.
///
/// # Arguments
///
/// * `py` - Python GIL token
/// * `path` - Path to the PDF file (local file or HTTPS URL)
/// * `kwargs` - Optional extraction options (see ALLOWED_KWARGS)
///
/// # Returns
///
/// A Python string containing the extracted text. Span texts are concatenated
/// in reading order, each followed by a newline (matching `pdftract extract --text`).
///
/// # Examples
///
/// ```python
/// import pdftract
///
/// # Basic text extraction
/// text = pdftract.extract_text("document.pdf")
/// print(f"Extracted {len(text)} characters")
///
/// # With page range
/// text = pdftract.extract_text("doc.pdf", pages="1-5")
///
/// # With invisible text included
/// text = pdftract.extract_text("doc.pdf", include_invisible=True)
///
/// # With password for encrypted PDF
/// text = pdftract.extract_text("encrypted.pdf", password="secret123")
/// ```
///
/// # Errors
///
/// - `PdftractError` - Base class for all PDF processing errors
/// - `EncryptionError` - PDF is encrypted and password is wrong or missing
/// - `CorruptPdfError` - PDF file is malformed or invalid
/// - `SourceUnreachableError` - Remote PDF could not be fetched
/// - `TlsError` - TLS handshake failed for remote PDF
///
/// # Thread Safety
///
/// The GIL is released during the blocking extraction operation, allowing
/// other Python threads to run concurrently.
pub fn extract_text_fn(py: Python<'_>, path: &str, kwargs: Option<&PyDict>) -> PyResult<String> {
// Parse kwargs into ExtractionOptions with strict validation
let opts = parse_kwargs(kwargs)?;
// Resolve path (local file or URL)
let pdf_path = Path::new(path);
// Run extraction with GIL released so other Python threads can run
let text = py
.allow_threads(|| extract_text(pdf_path, &opts))
.map_err(|e| {
// Map anyhow::Error to appropriate Python exception
let msg = e.to_string();
let err_str = msg.to_lowercase();
if err_str.contains("encrypted") || err_str.contains("password") {
PyErr::new::<crate::EncryptionError, _>(msg)
} else if err_str.contains("corrupt") || err_str.contains("invalid") {
PyErr::new::<crate::CorruptPdfError, _>(msg)
} else if err_str.contains("tls") || err_str.contains("certificate") || err_str.contains("ssl") {
PyErr::new::<crate::TlsError, _>(msg)
} else if err_str.contains("network") || err_str.contains("interrupted") {
PyErr::new::<crate::RemoteFetchInterruptedError, _>(msg)
} else if err_str.contains("unreachable") || err_str.contains("not found") {
PyErr::new::<crate::SourceUnreachableError, _>(msg)
} else {
PyErr::new::<crate::PdftractError, _>(msg)
}
})?;
Ok(text)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Acquire the GIL, let `fill` populate a fresh dict, and run
    /// `parse_kwargs` on it.
    fn parse_with(fill: impl FnOnce(&PyDict)) -> PyResult<ExtractionOptions> {
        Python::with_gil(|py| {
            let dict = PyDict::new(py);
            fill(dict);
            parse_kwargs(Some(dict))
        })
    }

    // An empty dict leaves the defaults in place.
    #[test]
    fn test_parse_kwargs_empty() {
        let parsed = parse_with(|_| {}).unwrap();
        assert!(parsed.pages.is_none());
        assert!(!parsed.output.include_invisible);
    }

    // Names outside the allowlist are rejected.
    #[test]
    fn test_parse_kwargs_unknown_kwarg() {
        let outcome = parse_with(|d| d.set_item("bogus_kwarg", 42).unwrap());
        assert!(outcome.is_err());
    }

    #[test]
    fn test_parse_kwargs_include_invisible() {
        let parsed = parse_with(|d| d.set_item("include_invisible", true).unwrap()).unwrap();
        assert!(parsed.output.include_invisible);
    }

    #[test]
    fn test_parse_kwargs_password() {
        let parsed = parse_with(|d| d.set_item("password", "test123").unwrap()).unwrap();
        assert!(parsed.password.is_some());
    }

    // The GiB value is converted to bytes.
    #[test]
    fn test_parse_kwargs_max_decompress_gb() {
        let parsed = parse_with(|d| d.set_item("max_decompress_gb", 2).unwrap()).unwrap();
        let expected_bytes: u64 = 2 * 1024 * 1024 * 1024;
        assert_eq!(parsed.max_decompress_bytes, expected_bytes);
    }

    // The page spec is stored verbatim.
    #[test]
    fn test_parse_kwargs_pages() {
        let spec = "1-5,7,12-15";
        let parsed = parse_with(|d| d.set_item("pages", spec).unwrap()).unwrap();
        assert_eq!(parsed.pages.as_deref(), Some(spec));
    }

    #[test]
    fn test_parse_kwargs_receipts() {
        let parsed = parse_with(|d| d.set_item("receipts", "lite").unwrap()).unwrap();
        assert_eq!(parsed.receipts, ReceiptsMode::Lite);
    }

    #[test]
    fn test_parse_kwargs_full_render() {
        let parsed = parse_with(|d| d.set_item("full_render", true).unwrap()).unwrap();
        assert!(parsed.full_render);
    }
}