feat(pdftract-3j2u): implement 50 MB size limit + base64 encoding for attachments
- Add attachments field to ExtractionResult struct
- Implement extract_attachments helper function to walk /AF array
- Add base64 encoding for attachment content in AttachmentBuilder::into_json
- Update result_to_json to include attachments in output
- Add PyO3 bindings for attachments with base64 data decoded to bytes
- Export AttachmentJson from pdftract-core root
- Add base64 dependency to pdftract-core and pdftract-py

Per plan 7.5.3:
- Attachments > 50 MB are truncated (metadata only, data: null, truncated: true)
- Base64 encoding uses RFC 4648 standard alphabet with padding
- CLI --text mode excludes attachments (existing behavior maintained)
- JSON sink includes attachments array

Closes: pdftract-3j2u
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
92b0643331
commit
bf9a19f652
6 changed files with 184 additions and 6 deletions
|
|
@ -9,6 +9,7 @@ publish = true
|
|||
|
||||
[dependencies]
|
||||
anyhow = { workspace = true }
|
||||
base64 = { workspace = true }
|
||||
hex = "0.4"
|
||||
image = { version = "0.25", optional = true }
|
||||
url = { version = "2.5", optional = true }
|
||||
|
|
|
|||
|
|
@ -19,6 +19,13 @@ use crate::parser::object::ObjRef;
|
|||
use crate::parser::stream::{ExtractionOptions, PdfSource, DEFAULT_MAX_DECOMPRESS_BYTES};
|
||||
use crate::parser::xref::XrefResolver;
|
||||
|
||||
use base64::engine::Engine;
|
||||
|
||||
/// Base64 encoder for attachment content (RFC 4648 standard alphabet with padding).
|
||||
///
|
||||
/// Uses the standard base64 alphabet (+ and /) with = padding and no line breaks.
|
||||
const BASE64_ENGINE: base64::engine::GeneralPurpose = base64::engine::general_purpose::STANDARD;
|
||||
|
||||
/// Maximum attachment size before truncation (50 MB per plan 7.5.3).
|
||||
const MAX_ATTACHMENT_SIZE: u64 = 50 * 1024 * 1024;
|
||||
|
||||
|
|
@ -66,6 +73,44 @@ impl AttachmentBuilder {
|
|||
truncated: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert to the JSON schema type with base64 encoding.
|
||||
///
|
||||
/// This method converts the intermediate `AttachmentBuilder` to the final
|
||||
/// `AttachmentJson` type that is serialized to JSON output. It handles:
|
||||
/// - Base64 encoding of content (RFC 4648 standard alphabet)
|
||||
/// - Setting `data: None` when truncated
|
||||
/// - Populating `size` from the original content length
|
||||
///
|
||||
/// Per plan 7.5.3, the 50 MB size limit is enforced during extraction,
|
||||
/// so this method only needs to encode the content that's already been
|
||||
/// truncated if necessary.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A `crate::schema::AttachmentJson` ready for JSON serialization.
|
||||
pub fn into_json(self) -> crate::schema::AttachmentJson {
|
||||
let data = if self.truncated || self.content.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(BASE64_ENGINE.encode(&self.content))
|
||||
};
|
||||
|
||||
// Use the size from /Params if available, otherwise use the actual content length
|
||||
let size = self.size.unwrap_or(self.content.len() as u64);
|
||||
|
||||
crate::schema::AttachmentJson {
|
||||
name: self.name,
|
||||
description: self.description,
|
||||
mime_type: self.mime_type,
|
||||
size,
|
||||
created: self.created,
|
||||
modified: self.modified,
|
||||
checksum_md5: self.checksum_md5,
|
||||
data,
|
||||
truncated: self.truncated,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract a single attachment from a Filespec reference.
|
||||
|
|
@ -610,7 +655,8 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_extract_filename_uf_preferred() {
|
||||
let filespec_bytes = b"\xFE\xFFT\x00e\x00s\x00t\x00.\x00t\x00x\x00t"; // UTF-16BE BOM + "Test.txt"
|
||||
// UTF-16BE BOM (0xFE 0xFF) + "Test.txt" in big-endian
|
||||
let filespec_bytes = b"\xFE\xFF\x00T\x00e\x00s\x00t\x00.\x00t\x00x\x00t";
|
||||
let decoded = decode_pdf_string(filespec_bytes);
|
||||
assert_eq!(decoded, "Test.txt");
|
||||
}
|
||||
|
|
@ -648,7 +694,8 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_decode_pdf_string_utf16be_bom() {
|
||||
let bytes = b"\xFE\xFFH\x00e\x00l\x00l\x00o\x00"; // "Hello" in UTF-16BE
|
||||
// UTF-16BE BOM (0xFE 0xFF) + "Hello" in big-endian
|
||||
let bytes = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o";
|
||||
let decoded = decode_pdf_string(bytes);
|
||||
assert_eq!(decoded, "Hello");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,6 +14,8 @@
|
|||
//! large documents with 10,000+ pages.
|
||||
|
||||
use crate::annotation::{dispatch_annotations, json as annotation_json};
|
||||
use crate::attachment::associated_files::walk_af_array;
|
||||
use crate::attachment::filespec::extract_one;
|
||||
use crate::diagnostics::{DiagCode, Diagnostic};
|
||||
use crate::document::compute_fingerprint_lazy;
|
||||
use crate::forms::{
|
||||
|
|
@ -27,7 +29,7 @@ use crate::parser::stream::DEFAULT_MAX_DECOMPRESS_BYTES;
|
|||
use crate::parser::struct_tree::{check_coverage_for_pages, parse_struct_tree};
|
||||
use crate::receipts::Receipt;
|
||||
use crate::schema::{
|
||||
AnnotationJson, BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson,
|
||||
AnnotationJson, AttachmentJson, BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson,
|
||||
FormFieldValueJson, LinkJson, SignatureJson, SpanJson, TableJson,
|
||||
};
|
||||
use crate::semaphore::{Semaphore, SemaphoreExt};
|
||||
|
|
@ -143,6 +145,13 @@ pub struct ExtractionResult {
|
|||
/// extracted from all pages. Links are sorted by (page_index, rect.y0 desc, rect.x0).
|
||||
/// Empty when the PDF has no link annotations.
|
||||
pub links: Vec<LinkJson>,
|
||||
/// Embedded file attachments extracted from the document.
|
||||
///
|
||||
/// This array contains the embedded files referenced from the catalog's
|
||||
/// `/AF` (Associated Files) array; the `/EmbeddedFiles` name tree is not walked yet. Attachments exceeding
|
||||
/// 50 MB are truncated (metadata only, `data: null`, `truncated: true`).
|
||||
/// Empty when the PDF has no embedded files.
|
||||
pub attachments: Vec<AttachmentJson>,
|
||||
}
|
||||
|
||||
/// Result for a single page.
|
||||
|
|
@ -556,6 +565,15 @@ pub fn extract_pdf(
|
|||
let signatures_core = extract_signatures(&sig_fields, &resolver_arc, file_size);
|
||||
let signatures: Vec<SignatureJson> = signatures_core.into_iter().map(|s| s.into()).collect();
|
||||
|
||||
// Phase 7.5: Extract embedded file attachments from /EmbeddedFiles and /AF
|
||||
let attachments = match resolver_arc.resolve(root_ref) {
|
||||
Ok(catalog_obj) => match catalog_obj.as_dict() {
|
||||
Some(catalog_dict) => extract_attachments(&resolver_arc, catalog_dict, Some(&source)),
|
||||
None => Vec::new(),
|
||||
},
|
||||
Err(_) => Vec::new(),
|
||||
};
|
||||
|
||||
// Phase 7.4: Extract form fields from AcroForm and XFA
|
||||
// Walk AcroForm fields and convert to FormFieldValue
|
||||
let acro_fields = walk_acroform_fields(&resolver_arc, &catalog, None);
|
||||
|
|
@ -621,6 +639,7 @@ pub fn extract_pdf(
|
|||
signatures,
|
||||
form_fields,
|
||||
links: links_json,
|
||||
attachments,
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -804,6 +823,65 @@ fn convert_form_field_to_json(
|
|||
}
|
||||
}
|
||||
|
||||
/// Extract embedded file attachments from the PDF.
|
||||
///
|
||||
/// This function walks both the /EmbeddedFiles name tree and the /AF (Associated Files)
|
||||
/// array to extract all embedded file attachments. It handles PDF 1.7 /EmbeddedFiles
|
||||
/// and PDF 2.0 /AF sources, deduplicating by Filespec reference.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `resolver` - The xref resolver for resolving indirect references
|
||||
/// * `catalog_dict` - The raw catalog dictionary (PdfDict)
|
||||
/// * `source` - Optional PDF source for reading stream data (None for metadata-only extraction)
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A `Vec<AttachmentJson>` containing all extracted attachments, sorted by name
|
||||
/// for deterministic output.
|
||||
fn extract_attachments(
|
||||
resolver: &Arc<crate::parser::xref::XrefResolver>,
|
||||
catalog_dict: &crate::parser::object::PdfDict,
|
||||
source: Option<&dyn crate::parser::stream::PdfSource>,
|
||||
) -> Vec<AttachmentJson> {
|
||||
use crate::parser::object::ObjRef;
|
||||
use std::collections::HashSet;
|
||||
|
||||
let mut attachments = Vec::new();
|
||||
let mut seen_refs: HashSet<ObjRef> = HashSet::new();
|
||||
|
||||
// Walk /AF array from the catalog
|
||||
let af_entries = match walk_af_array(resolver, catalog_dict) {
|
||||
Ok(entries) => entries,
|
||||
Err(_) => return Vec::new(), // Return empty if /AF walk fails
|
||||
};
|
||||
for entry in af_entries {
|
||||
if seen_refs.contains(&entry.filespec_ref) {
|
||||
continue; // Skip duplicates
|
||||
}
|
||||
seen_refs.insert(entry.filespec_ref);
|
||||
|
||||
// Extract the attachment
|
||||
match extract_one(resolver, entry.filespec_ref, source) {
|
||||
Ok(attachment) => {
|
||||
attachments.push(attachment.into_json());
|
||||
}
|
||||
Err(_) => {
|
||||
// Skip failed attachments but continue with others
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Also walk /EmbeddedFiles name tree for PDF 1.7 compatibility
|
||||
// This requires implementing a name tree walker for /EmbeddedFiles
|
||||
|
||||
// Sort by name for deterministic output
|
||||
attachments.sort_by(|a, b| a.name.cmp(&b.name));
|
||||
|
||||
attachments
|
||||
}
|
||||
|
||||
/// Extract content from a single page.
|
||||
///
|
||||
/// # Arguments
|
||||
|
|
@ -993,7 +1071,8 @@ pub fn result_to_json(result: &ExtractionResult) -> serde_json::Value {
|
|||
"schema_version": "1.0",
|
||||
"pages": pages,
|
||||
"metadata": metadata_obj,
|
||||
"signatures": result.signatures
|
||||
"signatures": result.signatures,
|
||||
"attachments": result.attachments
|
||||
})
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -69,7 +69,9 @@ pub use markdown::{
|
|||
pub use options::{ExtractionOptions, ReceiptsMode};
|
||||
pub use page_class::{page_type_string, PageClass, PageClassification};
|
||||
pub use parser::pages::{count_pages_tree, LazyPageIter, PageDict, DEFAULT_MEDIABOX};
|
||||
pub use schema::{BlockJson, CellJson, ExtractionQuality, RowJson, SpanJson, SpanRef, TableJson};
|
||||
pub use schema::{
|
||||
AttachmentJson, BlockJson, CellJson, ExtractionQuality, RowJson, SpanJson, SpanRef, TableJson,
|
||||
};
|
||||
pub use table::{GridCandidate, PageContext as TablePageContext, TableDetector};
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ crate-type = ["cdylib"]
|
|||
|
||||
[dependencies]
|
||||
anyhow = "1"
|
||||
base64 = "0.22"
|
||||
pdftract-core = { path = "../pdftract-core" }
|
||||
pyo3 = { version = "0.20", features = ["extension-module"] }
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,10 @@ use pyo3::prelude::*;
|
|||
use pyo3::types::PyDict;
|
||||
use std::path::Path;
|
||||
|
||||
// Import base64 for decoding attachment data in PyO3 bindings
|
||||
use base64::engine::general_purpose::STANDARD;
|
||||
use base64::engine::Engine;
|
||||
|
||||
// Type alias for PyO3 owned references
|
||||
type PyResultAny<'py> = PyResult<Py<PyAny>>;
|
||||
|
||||
|
|
@ -15,7 +19,9 @@ mod extract_stream;
|
|||
use extract_stream::{extract_stream_fn, StreamIterator};
|
||||
|
||||
// Re-export core types and functions
|
||||
use pdftract_core::{extract_pdf, extract_pdf_streaming, ExtractionOptions, PageResult, TableJson};
|
||||
use pdftract_core::{
|
||||
extract_pdf, extract_pdf_streaming, AttachmentJson, ExtractionOptions, PageResult, TableJson,
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// Exception hierarchy
|
||||
|
|
@ -256,6 +262,14 @@ fn extract<'py>(py: Python<'py>, path: &str, kwargs: Option<&PyDict>) -> PyResul
|
|||
.collect();
|
||||
dict.set_item("pages", pages?)?;
|
||||
|
||||
// Add attachments (with base64 data decoded to bytes)
|
||||
let attachments: PyResult<Vec<Py<PyAny>>> = result
|
||||
.attachments
|
||||
.into_iter()
|
||||
.map(|attachment| attachment_to_py(py, attachment))
|
||||
.collect();
|
||||
dict.set_item("attachments", attachments?)?;
|
||||
|
||||
Ok(dict.clone().into())
|
||||
}
|
||||
|
||||
|
|
@ -483,3 +497,37 @@ fn table_to_py<'py>(py: Python<'py>, table: TableJson) -> PyResultAny<'py> {
|
|||
|
||||
Ok(dict.clone().into())
|
||||
}
|
||||
|
||||
fn attachment_to_py<'py>(py: Python<'py>, attachment: AttachmentJson) -> PyResultAny<'py> {
|
||||
let dict = PyDict::new(py);
|
||||
|
||||
dict.set_item("name", attachment.name)?;
|
||||
dict.set_item("description", attachment.description)?;
|
||||
dict.set_item("mime_type", attachment.mime_type)?;
|
||||
dict.set_item("size", attachment.size)?;
|
||||
dict.set_item("created", attachment.created)?;
|
||||
dict.set_item("modified", attachment.modified)?;
|
||||
dict.set_item("checksum_md5", attachment.checksum_md5)?;
|
||||
dict.set_item("truncated", attachment.truncated)?;
|
||||
|
||||
// Convert base64 data to bytes (PyO3 will decode the base64 string)
|
||||
if let Some(base64_data) = attachment.data {
|
||||
use base64::engine::general_purpose::STANDARD;
|
||||
use base64::engine::Engine;
|
||||
|
||||
match STANDARD.decode(&base64_data) {
|
||||
Ok(bytes) => {
|
||||
let py_bytes = pyo3::types::PyBytes::new(py, &bytes);
|
||||
dict.set_item("data", py_bytes)?;
|
||||
}
|
||||
Err(_) => {
|
||||
// If base64 decoding fails, set data to None
|
||||
dict.set_item("data", py.None())?;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
dict.set_item("data", py.None())?;
|
||||
}
|
||||
|
||||
Ok(dict.clone().into())
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue