feat(pdftract-3j2u): implement 50 MB size limit + base64 encoding for attachments

- Add attachments field to ExtractionResult struct
- Implement extract_attachments helper function to walk /AF array
- Add base64 encoding for attachment content in AttachmentBuilder::into_json
- Update result_to_json to include attachments in output
- Add PyO3 bindings for attachments with base64 data decoded to bytes
- Export AttachmentJson from pdftract-core root
- Add base64 dependency to pdftract-core and pdftract-py

Per plan 7.5.3:
- Attachments > 50 MB are truncated (metadata only, data: null, truncated: true)
- Base64 encoding uses RFC 4648 standard alphabet with padding
- CLI --text mode excludes attachments (existing behavior maintained)
- JSON sink includes attachments array

Closes: pdftract-3j2u

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-05-25 11:42:28 -04:00
parent 92b0643331
commit bf9a19f652
6 changed files with 184 additions and 6 deletions

View file

@ -9,6 +9,7 @@ publish = true
[dependencies]
anyhow = { workspace = true }
base64 = { workspace = true }
hex = "0.4"
image = { version = "0.25", optional = true }
url = { version = "2.5", optional = true }

View file

@ -19,6 +19,13 @@ use crate::parser::object::ObjRef;
use crate::parser::stream::{ExtractionOptions, PdfSource, DEFAULT_MAX_DECOMPRESS_BYTES};
use crate::parser::xref::XrefResolver;
use base64::engine::Engine;
/// Base64 encoder for attachment content (RFC 4648 standard alphabet with padding).
///
/// Uses the standard base64 alphabet (+ and /) with = padding and no line breaks.
/// This is the crate-provided `general_purpose::STANDARD` engine, re-exposed under
/// a local name so every encode call in this module uses the same configuration.
const BASE64_ENGINE: base64::engine::GeneralPurpose = base64::engine::general_purpose::STANDARD;
/// Maximum attachment size before truncation (50 MB per plan 7.5.3).
///
/// The value is computed with binary multiples: 50 * 1024 * 1024 = 52,428,800 bytes
/// (i.e. 50 MiB).
const MAX_ATTACHMENT_SIZE: u64 = 50 * 1024 * 1024;
@ -66,6 +73,44 @@ impl AttachmentBuilder {
truncated: false,
}
}
/// Convert to the JSON schema type with base64 encoding.
///
/// This method converts the intermediate `AttachmentBuilder` to the final
/// `AttachmentJson` type that is serialized to JSON output. It handles:
/// - Base64 encoding of content (RFC 4648 standard alphabet)
/// - Setting `data: None` when truncated
/// - Populating `size` from the original content length
///
/// Per plan 7.5.3, the 50 MB size limit is enforced during extraction,
/// so this method only needs to encode the content that's already been
/// truncated if necessary.
///
/// # Returns
///
/// A `crate::schema::AttachmentJson` ready for JSON serialization.
pub fn into_json(self) -> crate::schema::AttachmentJson {
let data = if self.truncated || self.content.is_empty() {
None
} else {
Some(BASE64_ENGINE.encode(&self.content))
};
// Use the size from /Params if available, otherwise use the actual content length
let size = self.size.unwrap_or(self.content.len() as u64);
crate::schema::AttachmentJson {
name: self.name,
description: self.description,
mime_type: self.mime_type,
size,
created: self.created,
modified: self.modified,
checksum_md5: self.checksum_md5,
data,
truncated: self.truncated,
}
}
}
/// Extract a single attachment from a Filespec reference.
@ -610,7 +655,8 @@ mod tests {
/// Decoding a BOM-prefixed UTF-16BE byte string yields the expected file name.
#[test]
fn test_extract_filename_uf_preferred() {
    // UTF-16BE BOM (0xFE 0xFF) followed by "Test.txt"; each code unit is written
    // high byte first (0x00, then the ASCII byte), as big-endian requires.
    let filespec_bytes = b"\xFE\xFF\x00T\x00e\x00s\x00t\x00.\x00t\x00x\x00t";
    let decoded = decode_pdf_string(filespec_bytes);
    assert_eq!(decoded, "Test.txt");
}
@ -648,7 +694,8 @@ mod tests {
/// Decoding a BOM-prefixed UTF-16BE byte string yields "Hello".
#[test]
fn test_decode_pdf_string_utf16be_bom() {
    // UTF-16BE BOM (0xFE 0xFF) followed by "Hello"; each code unit is written
    // high byte first (0x00, then the ASCII byte), with no trailing bytes.
    let bytes = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o";
    let decoded = decode_pdf_string(bytes);
    assert_eq!(decoded, "Hello");
}

View file

@ -14,6 +14,8 @@
//! large documents with 10,000+ pages.
use crate::annotation::{dispatch_annotations, json as annotation_json};
use crate::attachment::associated_files::walk_af_array;
use crate::attachment::filespec::extract_one;
use crate::diagnostics::{DiagCode, Diagnostic};
use crate::document::compute_fingerprint_lazy;
use crate::forms::{
@ -27,7 +29,7 @@ use crate::parser::stream::DEFAULT_MAX_DECOMPRESS_BYTES;
use crate::parser::struct_tree::{check_coverage_for_pages, parse_struct_tree};
use crate::receipts::Receipt;
use crate::schema::{
AnnotationJson, BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson,
AnnotationJson, AttachmentJson, BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson,
FormFieldValueJson, LinkJson, SignatureJson, SpanJson, TableJson,
};
use crate::semaphore::{Semaphore, SemaphoreExt};
@ -143,6 +145,13 @@ pub struct ExtractionResult {
/// extracted from all pages. Links are sorted by (page_index, rect.y0 desc, rect.x0).
/// Empty when the PDF has no link annotations.
pub links: Vec<LinkJson>,
/// Embedded file attachments extracted from the document.
///
/// This array is currently populated from the catalog's `/AF` (Associated
/// Files) array only; walking the `/EmbeddedFiles` name tree is still a TODO.
/// Attachments exceeding 50 MB are truncated (metadata only, `data: null`,
/// `truncated: true`). Empty when no attachments were extracted.
pub attachments: Vec<AttachmentJson>,
}
/// Result for a single page.
@ -556,6 +565,15 @@ pub fn extract_pdf(
let signatures_core = extract_signatures(&sig_fields, &resolver_arc, file_size);
let signatures: Vec<SignatureJson> = signatures_core.into_iter().map(|s| s.into()).collect();
// Phase 7.5: Extract embedded file attachments from the catalog's /AF array
// (the /EmbeddedFiles name tree is not walked yet)
let attachments = match resolver_arc.resolve(root_ref) {
Ok(catalog_obj) => match catalog_obj.as_dict() {
Some(catalog_dict) => extract_attachments(&resolver_arc, catalog_dict, Some(&source)),
None => Vec::new(),
},
Err(_) => Vec::new(),
};
// Phase 7.4: Extract form fields from AcroForm and XFA
// Walk AcroForm fields and convert to FormFieldValue
let acro_fields = walk_acroform_fields(&resolver_arc, &catalog, None);
@ -621,6 +639,7 @@ pub fn extract_pdf(
signatures,
form_fields,
links: links_json,
attachments,
})
}
@ -804,6 +823,65 @@ fn convert_form_field_to_json(
}
}
/// Collect embedded file attachments reachable from the catalog's `/AF` array.
///
/// The `/AF` (Associated Files) entries are obtained via `walk_af_array`; each
/// distinct Filespec reference is passed to `extract_one` and the resulting
/// builder is converted with `into_json`. A Filespec reference that appears
/// more than once is processed only the first time it is seen.
///
/// Failures are swallowed on purpose (best effort): if the `/AF` walk fails the
/// result is empty, and an attachment whose extraction fails is left out while
/// the remaining ones are still processed.
///
/// The `/EmbeddedFiles` name tree is not consulted yet (see the TODO below).
///
/// # Arguments
///
/// * `resolver` - The xref resolver for resolving indirect references
/// * `catalog_dict` - The raw catalog dictionary (PdfDict)
/// * `source` - Optional PDF source, forwarded unchanged to `extract_one`
///
/// # Returns
///
/// A `Vec<AttachmentJson>` ordered by `name` (stable sort, so entries with equal
/// names keep their `/AF` order).
fn extract_attachments(
    resolver: &Arc<crate::parser::xref::XrefResolver>,
    catalog_dict: &crate::parser::object::PdfDict,
    source: Option<&dyn crate::parser::stream::PdfSource>,
) -> Vec<AttachmentJson> {
    use crate::parser::object::ObjRef;
    use std::collections::HashSet;

    let mut collected = Vec::new();

    // A failed /AF walk simply leaves `collected` empty.
    if let Ok(af_entries) = walk_af_array(resolver, catalog_dict) {
        let mut visited: HashSet<ObjRef> = HashSet::new();
        collected.extend(
            af_entries
                .into_iter()
                // `insert` returns false for a reference already seen, dropping duplicates.
                .filter(|entry| visited.insert(entry.filespec_ref))
                // Attachments that fail to extract are skipped; the rest continue.
                .filter_map(|entry| extract_one(resolver, entry.filespec_ref, source).ok())
                .map(|builder| builder.into_json()),
        );
    }

    // TODO: Also walk /EmbeddedFiles name tree for PDF 1.7 compatibility
    // This requires implementing a name tree walker for /EmbeddedFiles

    // Order by name so the output is deterministic.
    collected.sort_by(|lhs, rhs| lhs.name.cmp(&rhs.name));
    collected
}
/// Extract content from a single page.
///
/// # Arguments
@ -993,7 +1071,8 @@ pub fn result_to_json(result: &ExtractionResult) -> serde_json::Value {
"schema_version": "1.0",
"pages": pages,
"metadata": metadata_obj,
"signatures": result.signatures
"signatures": result.signatures,
"attachments": result.attachments
})
}

View file

@ -69,7 +69,9 @@ pub use markdown::{
pub use options::{ExtractionOptions, ReceiptsMode};
pub use page_class::{page_type_string, PageClass, PageClassification};
pub use parser::pages::{count_pages_tree, LazyPageIter, PageDict, DEFAULT_MEDIABOX};
pub use schema::{BlockJson, CellJson, ExtractionQuality, RowJson, SpanJson, SpanRef, TableJson};
pub use schema::{
AttachmentJson, BlockJson, CellJson, ExtractionQuality, RowJson, SpanJson, SpanRef, TableJson,
};
pub use table::{GridCandidate, PageContext as TablePageContext, TableDetector};
#[cfg(feature = "ocr")]

View file

@ -12,6 +12,7 @@ crate-type = ["cdylib"]
[dependencies]
anyhow = "1"
base64 = "0.22"
pdftract-core = { path = "../pdftract-core" }
pyo3 = { version = "0.20", features = ["extension-module"] }

View file

@ -7,6 +7,10 @@ use pyo3::prelude::*;
use pyo3::types::PyDict;
use std::path::Path;
// Import base64 for decoding attachment data in PyO3 bindings
use base64::engine::general_purpose::STANDARD;
use base64::engine::Engine;
// Type alias for PyO3 owned references
type PyResultAny<'py> = PyResult<Py<PyAny>>;
@ -15,7 +19,9 @@ mod extract_stream;
use extract_stream::{extract_stream_fn, StreamIterator};
// Re-export core types and functions
use pdftract_core::{extract_pdf, extract_pdf_streaming, ExtractionOptions, PageResult, TableJson};
use pdftract_core::{
extract_pdf, extract_pdf_streaming, AttachmentJson, ExtractionOptions, PageResult, TableJson,
};
// ============================================================================
// Exception hierarchy
@ -256,6 +262,14 @@ fn extract<'py>(py: Python<'py>, path: &str, kwargs: Option<&PyDict>) -> PyResul
.collect();
dict.set_item("pages", pages?)?;
// Add attachments (with base64 data decoded to bytes)
let attachments: PyResult<Vec<Py<PyAny>>> = result
.attachments
.into_iter()
.map(|attachment| attachment_to_py(py, attachment))
.collect();
dict.set_item("attachments", attachments?)?;
Ok(dict.clone().into())
}
@ -483,3 +497,37 @@ fn table_to_py<'py>(py: Python<'py>, table: TableJson) -> PyResultAny<'py> {
Ok(dict.clone().into())
}
/// Build a Python dict describing one attachment.
///
/// Every metadata field of `attachment` is copied into the dict under the key
/// of the same name. The `data` key holds Python `bytes` obtained by decoding
/// the base64 string carried by `AttachmentJson`; it is `None` when the
/// attachment has no data or when the string is not valid base64.
fn attachment_to_py<'py>(py: Python<'py>, attachment: AttachmentJson) -> PyResultAny<'py> {
    use base64::engine::general_purpose::STANDARD;
    use base64::engine::Engine;

    let out = PyDict::new(py);

    out.set_item("name", attachment.name)?;
    out.set_item("description", attachment.description)?;
    out.set_item("mime_type", attachment.mime_type)?;
    out.set_item("size", attachment.size)?;
    out.set_item("created", attachment.created)?;
    out.set_item("modified", attachment.modified)?;
    out.set_item("checksum_md5", attachment.checksum_md5)?;
    out.set_item("truncated", attachment.truncated)?;

    // The base64 text is decoded here, on the Rust side, so Python callers
    // receive raw bytes. A decode failure collapses to `None`, same as absent data.
    let raw_bytes = attachment
        .data
        .and_then(|encoded| STANDARD.decode(&encoded).ok());
    match raw_bytes {
        Some(raw) => out.set_item("data", pyo3::types::PyBytes::new(py, &raw))?,
        None => out.set_item("data", py.None())?,
    }

    Ok(out.clone().into())
}