Collects in-progress work across forms (Ch/Tx field handling, value_text edge cases), layout corrections, stream parser fixes, conformance test expansion, security audit test (TH-08), stream-decoder bomb fixture, debug examples reorganization under examples/debug/, sdk module scaffold, xtask CLI enhancements, and provenance entries for new fixtures. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
361 lines
11 KiB
Rust
361 lines
11 KiB
Rust
//! Document model integration tests.
|
|
//!
|
|
//! This test module loads curated PDF fixtures and verifies that the document
|
|
//! model correctly extracts and resolves all document-level information.
|
|
|
|
use pdftract_core::detection::{detect_javascript, detect_xfa};
|
|
use pdftract_core::document::parse_pdf_file;
|
|
use pdftract_core::parser::catalog::Catalog;
|
|
use pdftract_core::parser::pages::PageDict;
|
|
use pdftract_core::parser::xref::XrefResolver;
|
|
use serde::{Deserialize, Serialize};
|
|
use std::path::Path;
|
|
|
|
/// Golden file structure for document model verification.
|
|
///
|
|
/// This captures all the document-level information that should be
|
|
/// extracted and resolved by the document model integration.
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct DocumentModelGolden {
|
|
/// Number of pages in the document
|
|
page_count: usize,
|
|
/// Encryption information (if applicable)
|
|
encryption: Option<EncryptionInfo>,
|
|
/// Optional content groups visibility (if present)
|
|
ocg_visibility: Option<OcgVisibility>,
|
|
/// Outline/bookmarks structure (if present)
|
|
outlines: Option<OutlineNode>,
|
|
/// JavaScript detection result
|
|
contains_javascript: bool,
|
|
/// XFA form detection result
|
|
contains_xfa: bool,
|
|
/// Page labels (if present)
|
|
page_labels: Option<Vec<String>>,
|
|
/// PDF/A conformance (if present in XMP metadata)
|
|
pdfa_conformance: Option<String>,
|
|
/// Diagnostics emitted during parsing
|
|
diagnostics: Vec<DiagnosticInfo>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct EncryptionInfo {
|
|
is_encrypted: bool,
|
|
handler: Option<String>,
|
|
status: String,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct OcgVisibility {
|
|
default_state: String,
|
|
groups: Vec<String>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct OutlineNode {
|
|
title: String,
|
|
dest_page: Option<usize>,
|
|
children: Vec<OutlineNode>,
|
|
is_expanded: bool,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct DiagnosticInfo {
|
|
code: String,
|
|
message: String,
|
|
}
|
|
|
|
/// Load a fixture PDF and extract its document model.
|
|
fn load_fixture(fixture_path: &Path) -> Result<DocumentModelGolden, Box<dyn std::error::Error>> {
|
|
// Parse the PDF
|
|
let (_fingerprint, catalog, pages, resolver) = parse_pdf_file(fixture_path)?;
|
|
|
|
// Check encryption status
|
|
let encryption_info = check_encryption(&resolver);
|
|
|
|
// Extract OCG visibility
|
|
let ocg_visibility = extract_ocg_visibility(&catalog);
|
|
|
|
// Extract outlines (pass pages for destination resolution)
|
|
let outlines = extract_outlines_with_pages(&catalog, &resolver, &pages);
|
|
|
|
// Detect JavaScript and XFA
|
|
let acroform = catalog.acroform_ref
|
|
.and_then(|r| resolver.resolve(r).ok())
|
|
.and_then(|o| o.as_dict().cloned());
|
|
let contains_javascript = detect_javascript(&catalog, &pages, &acroform, &resolver);
|
|
let contains_xfa = detect_xfa(&acroform);
|
|
|
|
// Extract page labels
|
|
let page_labels = extract_page_labels(&catalog, pages.len());
|
|
|
|
// Extract PDF/A conformance
|
|
let pdfa_conformance = extract_pdfa_conformance(&catalog, &resolver);
|
|
|
|
// Collect diagnostics
|
|
let diagnostics = collect_diagnostics(&catalog);
|
|
|
|
Ok(DocumentModelGolden {
|
|
page_count: pages.len(),
|
|
encryption: encryption_info,
|
|
ocg_visibility,
|
|
outlines,
|
|
contains_javascript,
|
|
contains_xfa,
|
|
page_labels,
|
|
pdfa_conformance,
|
|
diagnostics,
|
|
})
|
|
}
|
|
|
|
/// Extract outline/bookmarks structure with pages for destination resolution.
|
|
fn extract_outlines_with_pages(
|
|
catalog: &Catalog,
|
|
resolver: &XrefResolver,
|
|
pages: &[pdftract_core::parser::pages::PageDict],
|
|
) -> Option<OutlineNode> {
|
|
let outlines_ref = catalog.outlines_ref?;
|
|
let (outlines, _diagnostics) = pdftract_core::parser::outline::parse_outlines(
|
|
resolver,
|
|
Some(outlines_ref),
|
|
pages,
|
|
);
|
|
|
|
if outlines.is_empty() {
|
|
return None;
|
|
}
|
|
|
|
// Convert the first outline to our test structure
|
|
// For now, just return the first outline at the root level
|
|
Some(convert_outline_to_test_node(&outlines[0]))
|
|
}
|
|
|
|
/// Convert an Outline to our test's OutlineNode structure.
|
|
fn convert_outline_to_test_node(outline: &pdftract_core::parser::outline::Outline) -> OutlineNode {
|
|
OutlineNode {
|
|
title: outline.title.clone(),
|
|
dest_page: outline.dest_page.map(|p| p as usize),
|
|
children: outline.children.iter().map(convert_outline_to_test_node).collect(),
|
|
is_expanded: outline.count > 0,
|
|
}
|
|
}
|
|
|
|
/// Check if the document is encrypted.
|
|
///
|
|
/// This function attempts to detect encryption by parsing the trailer's
|
|
/// /Encrypt dictionary. Returns None for unencrypted documents.
|
|
fn check_encryption(resolver: &XrefResolver) -> Option<EncryptionInfo> {
|
|
// Access the trailer from the resolver
|
|
let trailer = &resolver.xref_section.trailer?;
|
|
|
|
// Use the encryption detection module
|
|
let mut diagnostics = Vec::new();
|
|
let info = pdftract_core::encryption::detection::detect_encryption(
|
|
trailer,
|
|
resolver,
|
|
&mut diagnostics,
|
|
);
|
|
|
|
// Map encryption::detection::EncryptionInfo to our test's EncryptionInfo
|
|
info.map(|enc| EncryptionInfo {
|
|
is_encrypted: true,
|
|
handler: Some(format!("V={} R={}", enc.version, enc.revision)),
|
|
status: format!("{}-bit", enc.key_length),
|
|
})
|
|
}
|
|
|
|
/// Extract OCG visibility information.
|
|
fn extract_ocg_visibility(catalog: &Catalog) -> Option<OcgVisibility> {
|
|
let oc_props = catalog.oc_properties.as_ref()?;
|
|
let default_state = match oc_props.default_state {
|
|
pdftract_core::parser::ocg::BaseState::On => "ON".to_string(),
|
|
pdftract_core::parser::ocg::BaseState::Off => "OFF".to_string(),
|
|
pdftract_core::parser::ocg::BaseState::Unchanged => "UNCHANGED".to_string(),
|
|
};
|
|
|
|
let groups: Vec<String> = oc_props.optional_content
|
|
.iter()
|
|
.map(|ocg| ocg.name.clone().unwrap_or_else(|| "Unnamed".to_string()))
|
|
.collect();
|
|
|
|
Some(OcgVisibility {
|
|
default_state,
|
|
groups,
|
|
})
|
|
}
|
|
|
|
/// Extract outline/bookmarks structure.
|
|
fn extract_outlines(catalog: &Catalog, resolver: &XrefResolver) -> Option<OutlineNode> {
|
|
let outlines_ref = catalog.outlines_ref?;
|
|
// Note: parse_outlines needs the pages array, but we only have the resolver here.
|
|
// For now, return None - this would require refactoring load_fixture to pass pages.
|
|
None
|
|
}
|
|
|
|
/// Extract page labels for all pages.
|
|
fn extract_page_labels(catalog: &Catalog, page_count: usize) -> Option<Vec<String>> {
|
|
let labels_tree = catalog.page_labels.as_ref()?;
|
|
let mut labels = Vec::new();
|
|
for i in 0..page_count as i64 {
|
|
let label = labels_tree.get_label(i)?;
|
|
let start = labels_tree.get_label_with_start(i)?.1;
|
|
labels.push(label.format_absolute(i, start));
|
|
}
|
|
Some(labels)
|
|
}
|
|
|
|
/// Extract PDF/A conformance from XMP metadata.
|
|
fn extract_pdfa_conformance(catalog: &Catalog, resolver: &XrefResolver) -> Option<String> {
|
|
let metadata_ref = catalog.metadata_ref?;
|
|
let metadata_obj = resolver.resolve(metadata_ref).ok()?;
|
|
let metadata_dict = metadata_obj.as_dict()?;
|
|
let stream = metadata_dict.get("")?.as_stream()?;
|
|
let metadata_bytes = stream.decoded_data.ok()?;
|
|
let metadata_str = std::string::String::from_utf8(metadata_bytes).ok()?;
|
|
|
|
// Simple check for PDF/A identifiers
|
|
if metadata_str.contains("pdfaid:part") && metadata_str.contains("pdfaid:conformance") {
|
|
// Extract part and conformance
|
|
let part = metadata_str
|
|
.split("pdfaid:part")
|
|
.nth(1)?
|
|
.split('>')
|
|
.nth(1)?
|
|
.split('<')
|
|
.next()?;
|
|
let conformance = metadata_str
|
|
.split("pdfaid:conformance")
|
|
.nth(1)?
|
|
.split('>')
|
|
.nth(1)?
|
|
.split('<')
|
|
.next()?;
|
|
Some(format!("PDF/A-{}{}", part.trim(), conformance.trim()))
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Collect diagnostics emitted during parsing.
|
|
fn collect_diagnostics(catalog: &Catalog) -> Vec<DiagnosticInfo> {
|
|
catalog
|
|
.diagnostics
|
|
.iter()
|
|
.map(|d| DiagnosticInfo {
|
|
code: d.code.to_string(),
|
|
message: d.message.clone(),
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
#[cfg(test)]
mod integration_tests {
    use super::*;
    use std::fs;

    /// Compare the document model of `<fixture_name>.pdf` with its golden file.
    ///
    /// Both files live under `tests/document_model/fixtures`. When
    /// `<fixture_name>.expected.json` does not exist yet, it is written from the
    /// freshly extracted model and the test returns without asserting.
    fn run_fixture_test(fixture_name: &str) {
        let fixture_stem = Path::new("tests/document_model/fixtures").join(fixture_name);
        let fixture_path = fixture_stem.with_extension("pdf");
        let expected_path = fixture_stem.with_extension("expected.json");

        let actual = match load_fixture(&fixture_path) {
            Ok(model) => model,
            Err(e) => panic!("Failed to load fixture {:?}: {}", fixture_path, e),
        };

        // First run for this fixture: record the golden and skip the comparison.
        if !expected_path.exists() {
            let golden_json = serde_json::to_string_pretty(&actual).unwrap();
            fs::write(&expected_path, golden_json).unwrap();
            eprintln!("Created golden file: {:?}", expected_path);
            return;
        }

        let golden_text = fs::read_to_string(&expected_path).unwrap();
        let expected: DocumentModelGolden = match serde_json::from_str(&golden_text) {
            Ok(golden) => golden,
            Err(e) => panic!("Failed to parse golden file {:?}: {}", expected_path, e),
        };

        assert_eq!(
            actual, expected,
            "Fixture {} does not match golden file",
            fixture_name
        );
    }

    /// Generate one `#[test]` per `test_name => "fixture_name"` pair.
    macro_rules! fixture_tests {
        ($($test_fn:ident => $fixture:expr),+ $(,)?) => {
            $(
                #[test]
                fn $test_fn() {
                    run_fixture_test($fixture);
                }
            )+
        };
    }

    fixture_tests! {
        test_encrypted_rc4 => "encrypted_rc4_test",
        test_encrypted_aes128 => "encrypted_aes128_test",
        test_encrypted_aes256 => "encrypted_aes256_test",
        test_encrypted_empty_password => "encrypted_empty_password",
        test_tagged_3_level_outline => "tagged_3_level_outline",
        test_ocg_default_off => "ocg_default_off",
        test_multi_revision_3 => "multi_revision_3",
        test_inheritance_grandparent_mediabox => "inheritance_grandparent_mediabox",
        test_missing_mediabox => "missing_mediabox",
        test_partial_resource_override => "partial_resource_override",
        test_js_in_openaction => "js_in_openaction",
        test_xfa_form => "xfa_form",
        test_pdfa_1b_conformance => "pdfa_1b_conformance",
        test_page_labels_roman_arabic => "page_labels_roman_arabic",
        test_encrypted_unknown_handler => "encrypted_unknown_handler",
    }
}
|