fix(pdftract-2uk9z): wrap native module results in typed Python objects
The native PyO3 module returns raw dicts via pythonize, but the Python SDK API expects typed dataclass objects (Document, Page, Metadata, etc.) to be consistent with the subprocess fallback and test expectations. Updated wrapper functions in __init__.py to convert native results: - extract(): wraps dict in Document.from_dict() - extract_stream(): wraps yielded page dicts in Page.from_dict() - get_metadata(): wraps dict in Metadata() - hash(): wraps string in Fingerprint.from_string() - classify(): wraps dict in Classification() - search(): wraps yielded match dicts in Match The native PyO3 entry points (extract, extract_text, extract_stream) were already implemented with: - extract: uses extract_pdf + pythonize for PyDict conversion - extract_text: uses extract_text for plain String return - extract_stream: uses extract_pdf_streaming with custom StreamIterator All kwargs parsing with strict validation (unknown kwargs raise TypeError) was already in place. Acceptance criteria: - pdftract.extract() returns Document object with pages/metadata - pdftract.extract_text() returns plain text string - pdftract.extract_stream() yields Page objects - Unknown kwarg raises TypeError
This commit is contained in:
parent
8d06ad24ae
commit
bb7146cffe
66 changed files with 2393 additions and 800 deletions
|
|
@ -1 +1 @@
|
|||
4fa4fff8e55978ae5302f6cc8ef703b049b4ebf7
|
||||
9347bde9a25babd419ddc6c5759e17cec4319a76
|
||||
|
|
|
|||
|
|
@ -11,7 +11,6 @@
|
|||
use anyhow::Result;
|
||||
use pdftract_core::{extract_pdf_ndjson, ExtractionOptions};
|
||||
use std::env;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufWriter};
|
||||
use std::path::Path;
|
||||
|
||||
|
|
|
|||
2
crates/pdftract-core/src/cache/mod.rs
vendored
2
crates/pdftract-core/src/cache/mod.rs
vendored
|
|
@ -23,7 +23,7 @@
|
|||
//! - [`key`] — Cache key construction from (fingerprint, options) pairs
|
||||
//! - [`compression`] — Zstandard compression/decompression for cache entries
|
||||
//! - [`integrity`] — HMAC-SHA-256 integrity verification (TH-10 mitigation)
|
||||
//! - [`metadata`] — Cache index.json and metadata handling (TODO: 6.9.3)
|
||||
//! - `metadata` — Cache index.json and metadata handling (TODO: 6.9.3)
|
||||
|
||||
pub mod compression;
|
||||
pub mod integrity;
|
||||
|
|
|
|||
|
|
@ -15,8 +15,8 @@
|
|||
//!
|
||||
//! # Mapping (INV-9)
|
||||
//!
|
||||
//! The mapping from internal [`UnicodeSource`](crate::font::UnicodeSource)
|
||||
//! (6 variants) to [`ConfidenceSource`] (3 variants) is:
|
||||
//! The mapping from internal [`UnicodeSource`] (6 variants) to [`ConfidenceSource`]
|
||||
//! (3 variants) is:
|
||||
//!
|
||||
//! | `UnicodeSource` | `corrected_in_4_7` | `ConfidenceSource` |
|
||||
//! |-----------------|-------------------|-------------------|
|
||||
|
|
|
|||
|
|
@ -351,6 +351,43 @@ pub fn compute_pdf_fingerprint(pdf_path: &std::path::Path) -> Result<String> {
|
|||
/// // Process page without holding all pages in memory
|
||||
/// }
|
||||
/// ```
|
||||
/// PDF document extractor with lazy page iteration.
|
||||
///
|
||||
/// This struct provides on-demand access to PDF pages without materializing
|
||||
/// the entire page tree in memory. Use it for memory-efficient extraction
|
||||
/// from large documents or when you need random access to specific pages.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// Open a PDF and iterate over pages lazily:
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use pdftract_core::document::PdfExtractor;
|
||||
///
|
||||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
/// let extractor = PdfExtractor::open("document.pdf")?;
|
||||
/// println!("Fingerprint: {}", extractor.fingerprint());
|
||||
/// println!("Total pages: {}", extractor.catalog().page_count.unwrap_or(0));
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// Memory-bounded extraction of specific pages:
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use pdftract_core::document::PdfExtractor;
|
||||
///
|
||||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
/// let extractor = PdfExtractor::open("large.pdf")?;
|
||||
///
|
||||
/// // Only the first 10 pages are materialized, not the entire document
|
||||
/// for page_result in extractor.pages()?.take(10) {
|
||||
/// let page = page_result?;
|
||||
/// println!("Page {} has {} spans", page.index, page.spans.len());
|
||||
/// }
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub struct PdfExtractor {
|
||||
/// The PDF file source
|
||||
source: FileSource,
|
||||
|
|
@ -855,6 +892,26 @@ impl Document {
|
|||
/// and materializes only the current path from root to leaf (max ~16 nodes).
|
||||
/// Each yielded PageExtraction contains the extracted data for one page,
|
||||
/// and all intermediate data is dropped before yielding the next page.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// Iterate over pages with bounded memory:
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use pdftract_core::document::Document;
|
||||
///
|
||||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
/// let doc = Document::open("large_document.pdf")?;
|
||||
///
|
||||
/// // Memory stays O(depth × per-page), not O(pages × per-page)
|
||||
/// for page_result in doc.pages() {
|
||||
/// let page = page_result?;
|
||||
/// println!("Page {}: {}x{}", page.index, page.width, page.height);
|
||||
/// // PageExtraction is dropped after each iteration
|
||||
/// }
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub struct PageIter<'a> {
|
||||
/// Lazy page iterator from the parser
|
||||
lazy_iter: Option<LazyPageIter<'a>>,
|
||||
|
|
@ -975,7 +1032,7 @@ pub fn open_remote_url(url: &str) -> std::io::Result<Box<dyn PdfSource>> {
|
|||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A Box<dyn PdfSource> that can be used for PDF parsing.
|
||||
/// A `Box<dyn PdfSource>` that can be used for PDF parsing.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ pub struct EncryptionInfo {
|
|||
pub user_hash: Vec<u8>,
|
||||
/// Permissions flags (/P for V<5, /Perms for V=5)
|
||||
pub perms: u32,
|
||||
/// File ID (first 16 bytes of /ID[0] from trailer)
|
||||
/// File ID (first 16 bytes of /ID\[0\] from trailer)
|
||||
pub file_id: Vec<u8>,
|
||||
/// Crypt filter dictionary for V=4 and V=5
|
||||
pub crypt_filters: Option<CryptFiltersV4>,
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@
|
|||
//!
|
||||
//! The file encryption key is derived from:
|
||||
//! 1. Pad password to 32 bytes via the standard padding string
|
||||
//! 2. MD5 hash: pad || /O || /P (4 bytes LE) || first16(/ID[0])
|
||||
//! 2. MD5 hash: pad || /O || /P (4 bytes LE) || first16(/ID\[0\])
|
||||
//! 3. If R>=3: iterate MD5 50 times on the first n bytes (n = key_length/8)
|
||||
//! 4. The first n bytes of the MD5 output is the encryption key
|
||||
//!
|
||||
|
|
@ -24,7 +24,7 @@
|
|||
//!
|
||||
//! - R=2: pad password; RC4-encrypt the 32-byte padding string with the file key;
|
||||
//! compare with /U
|
||||
//! - R=3: pad password; MD5(pad || first16(/ID[0])); RC4 19 times with i^step key;
|
||||
//! - R=3: pad password; MD5(pad || first16(/ID\[0\])); RC4 19 times with i^step key;
|
||||
//! compare first 16 bytes with first 16 of /U
|
||||
|
||||
#[cfg(feature = "decrypt")]
|
||||
|
|
|
|||
|
|
@ -373,6 +373,91 @@ pub struct ExtractionMetadata {
|
|||
/// - The PDF structure is invalid or corrupted
|
||||
/// - Decryption fails (for encrypted PDFs)
|
||||
/// - Content stream decoding exceeds bomb limits
|
||||
/// Extract text, tables, and metadata from a PDF file.
|
||||
///
|
||||
/// This is the main entry point for PDF extraction. It processes the entire
|
||||
/// document and returns structured data including text spans, blocks, tables,
|
||||
/// form fields, links, and more.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `pdf_path` - Path to the PDF file to extract from
|
||||
/// * `options` - Extraction options controlling OCR, DPI, page limits, etc.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// An [`ExtractionResult`] containing:
|
||||
/// - `fingerprint` - Cryptographic hash of the PDF for receipt verification
|
||||
/// - `pages` - Array of extracted pages with spans, blocks, and tables
|
||||
/// - `signatures` - Digital signature information
|
||||
/// - `form_fields` - Interactive form field values
|
||||
/// - `links` - Hyperlinks and internal destinations
|
||||
/// - `attachments` - Embedded file attachments
|
||||
/// - `threads` - Article thread chains
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if:
|
||||
/// - The PDF file cannot be opened or read
|
||||
/// - The PDF is malformed or corrupted
|
||||
/// - The PDF is encrypted and no password is provided
|
||||
/// - Decompression bomb limits are exceeded
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// Basic extraction with default options:
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use pdftract_core::{extract_pdf, ExtractionOptions};
|
||||
///
|
||||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
/// let result = extract_pdf(
|
||||
/// "document.pdf",
|
||||
/// &ExtractionOptions::default()
|
||||
/// )?;
|
||||
///
|
||||
/// println!("Extracted {} pages", result.pages.len());
|
||||
/// println!("Fingerprint: {}", result.fingerprint);
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// Extraction with OCR for scanned documents:
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use pdftract_core::{extract_pdf, ExtractionOptions};
|
||||
///
|
||||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
/// # #[cfg(feature = "ocr")]
|
||||
/// let result = extract_pdf(
|
||||
/// "scanned.pdf",
|
||||
/// &ExtractionOptions {
|
||||
/// ocr_languages: vec!["eng".to_string()],
|
||||
/// ..Default::default()
|
||||
/// }
|
||||
/// )?;
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// Extraction with page limit for large files:
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use pdftract_core::{extract_pdf, ExtractionOptions};
|
||||
///
|
||||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
/// let result = extract_pdf(
|
||||
/// "large_document.pdf",
|
||||
/// &ExtractionOptions {
|
||||
/// max_pages: Some(10),
|
||||
/// ..Default::default()
|
||||
/// }
|
||||
/// )?;
|
||||
///
|
||||
/// println!("First 10 pages extracted");
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn extract_pdf(
|
||||
pdf_path: &std::path::Path,
|
||||
options: &ExtractionOptions,
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
//!
|
||||
//! # References
|
||||
//!
|
||||
//! - Adobe Glyph List Specification: https://github.com/adobe-type-tools/agl-aglfn
|
||||
//! - Adobe Glyph List Specification: <https://github.com/adobe-type-tools/agl-aglfn>
|
||||
//! - AGL 1.4 (glyphlist.txt): ~4,400 entries
|
||||
//! - AGLFN 1.7 (aglfn.txt): ~770 entries for new fonts
|
||||
|
||||
|
|
|
|||
|
|
@ -156,7 +156,7 @@ impl DifferencesOverlay {
|
|||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// ```text
|
||||
/// // [ 39 /quotesingle 96 /grave ]
|
||||
/// // → entries: [(39, "quotesingle"), (96, "grave")]
|
||||
/// ```
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@
|
|||
//!
|
||||
//! 1. Convert 32×32 grayscale bitmap to float32 values
|
||||
//! 2. Apply 32×32 2D DCT-II (Discrete Cosine Transform)
|
||||
//! 3. Extract top-left 8×8 AC coefficients (skipping DC at [0,0])
|
||||
//! 3. Extract top-left 8×8 AC coefficients (skipping DC at \[0,0\])
|
||||
//! 4. Compute median of those 64 values
|
||||
//! 5. Produce 64-bit hash: bit i is set if coefficient i > median
|
||||
//!
|
||||
|
|
|
|||
|
|
@ -596,9 +596,9 @@ impl GraphicsState {
|
|||
/// Set fill color in current color space (sc operator).
|
||||
///
|
||||
/// The numeric components are interpreted based on the current fill_color_space.
|
||||
/// For DeviceGray: [gray]
|
||||
/// For DeviceRGB: [r, g, b]
|
||||
/// For DeviceCMYK: [c, m, y, k]
|
||||
/// For DeviceGray: \[gray\]
|
||||
/// For DeviceRGB: \[r, g, b\]
|
||||
/// For DeviceCMYK: \[c, m, y, k\]
|
||||
/// For other spaces: sets Color::Other
|
||||
#[inline]
|
||||
pub fn set_fill_color(&mut self, components: &[f32]) {
|
||||
|
|
|
|||
|
|
@ -22,8 +22,8 @@ use tracing::warn;
|
|||
///
|
||||
/// # Behavior
|
||||
///
|
||||
/// - For each span: `idx = span.bbox[0].round() as usize`
|
||||
/// - Clamp idx to `[0, hist.len() - 1]`
|
||||
/// - For each span: `idx = span.bbox\[0\].round() as usize`
|
||||
/// - Clamp idx to `\[0, hist.len() - 1\]`
|
||||
/// - x0 < 0: clamped to 0, diagnostic logged
|
||||
/// - x0 > page_width: clamped to last bucket, diagnostic logged
|
||||
/// - Empty spans: returns Vec of zeros
|
||||
|
|
@ -371,8 +371,8 @@ impl HasBBox for [f64; 4] {
|
|||
|
||||
/// A confirmed column with its x_range and index.
|
||||
///
|
||||
/// The x_range is [x0, x1] in PDF user space coordinates.
|
||||
/// Spans whose bbox[0] falls within this range are assigned to this column.
|
||||
/// The x_range is \[x0, x1\] in PDF user space coordinates.
|
||||
/// Spans whose bbox\[0\] falls within this range are assigned to this column.
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
pub struct Column {
|
||||
/// Column index (0-based, monotonic left-to-right).
|
||||
|
|
|
|||
|
|
@ -492,19 +492,19 @@ impl<T> HyphenableSpan for T where T: CorrectableText + HasBBox {}
|
|||
/// # Detection Criteria
|
||||
///
|
||||
/// A hyphenation repair is performed when ALL of the following are true:
|
||||
/// 1. line[n].last_span.text ends with `-`, `‐` (U+2010), or `‑` (U+2011)
|
||||
/// 2. line[n].last_span.bbox[2] >= column_right - 0.05 * column_width (hyphen at right edge)
|
||||
/// 3. line[n+1].first_span.text starts with a LOWERCASE letter (continuation)
|
||||
/// 4. line[n].last_span and line[n+1].first_span are in the same column
|
||||
/// 1. line\[n\].last_span.text ends with `-`, `‐` (U+2010), or `‑` (U+2011)
|
||||
/// 2. line\[n\].last_span.bbox[2] >= column_right - 0.05 * column_width (hyphen at right edge)
|
||||
/// 3. line\[n+1\].first_span.text starts with a LOWERCASE letter (continuation)
|
||||
/// 4. line\[n\].last_span and line\[n+1\].first_span are in the same column
|
||||
///
|
||||
/// # Repair Process
|
||||
///
|
||||
/// 1. Find the last word in line[n].last_span.text; strip the trailing hyphen
|
||||
/// 2. Find the first word in line[n+1].first_span.text
|
||||
/// 1. Find the last word in line\[n\].last_span.text; strip the trailing hyphen
|
||||
/// 2. Find the first word in line\[n+1\].first_span.text
|
||||
/// 3. Join: `joined_word = stripped_last + first`
|
||||
/// 4. Modify line[n].last_span.text: replace hyphenated word with `joined_word + " "`
|
||||
/// 5. Modify line[n+1].first_span.text: remove the first word
|
||||
/// 6. If line[n+1].first_span becomes empty, remove it; if line becomes empty, remove it
|
||||
/// 4. Modify line\[n\].last_span.text: replace hyphenated word with `joined_word + " "`
|
||||
/// 5. Modify line\[n+1\].first_span.text: remove the first word
|
||||
/// 6. If line\[n+1\].first_span becomes empty, remove it; if line becomes empty, remove it
|
||||
///
|
||||
/// # Invariants
|
||||
///
|
||||
|
|
|
|||
|
|
@ -63,7 +63,7 @@ pub struct XYCutResult {
|
|||
///
|
||||
/// # Behavior
|
||||
///
|
||||
/// - Single block / empty: returns as-is with order = [0] or []
|
||||
/// - Single block / empty: returns as-is with order = \[0\] or []
|
||||
/// - Prefers vertical split first (columns dominate)
|
||||
/// - > 10 regions with < 3 blocks: signals Docstrum trigger (caller switches)
|
||||
/// - Leaf nodes (single column): sorted by y descending (top-to-bottom reading)
|
||||
|
|
|
|||
|
|
@ -123,7 +123,7 @@
|
|||
//!
|
||||
//! ## Extraction Pipeline
|
||||
//!
|
||||
//! 1. **Source Loading** — [`PdfSource`] trait handles file/memory/HTTP inputs
|
||||
//! 1. **Source Loading** — [`source::PdfSource`] trait handles file/memory/HTTP inputs
|
||||
//! 2. **Parser** — [`parser`] module lexes PDF binary format into object model
|
||||
//! 3. **Xref Resolution** — Cross-reference table resolves object offsets
|
||||
//! 4. **Catalog/Page Tree** — Document structure traversal
|
||||
|
|
|
|||
|
|
@ -8,14 +8,15 @@
|
|||
//!
|
||||
//! Profile files are checked for forbidden secret keys (password, token, secret,
|
||||
//! api_key, etc.) to prevent accidental publication of credentials in profiles
|
||||
//! that are checked into source control. See [`ProfileSecretsForbidden`] for details.
|
||||
//! that are checked into source control. See [`check_forbidden_keys`] and
|
||||
//! [`ForbiddenKeyError`] for details.
|
||||
//!
|
||||
//! # Document Type Profiles
|
||||
//!
|
||||
//! The [`types`] module defines the core types for document type classification
|
||||
//! (Phase 5.6): [`ProfileType`], [`Profile`], and [`MatchPredicate`]. These
|
||||
//! are the shared vocabulary between the rule engine, built-in profile definitions,
|
||||
//! and user-authored YAML profiles.
|
||||
//! The core types for document type classification (Phase 5.6) are
|
||||
//! [`ProfileType`], [`Profile`], and [`MatchPredicate`]. These are the shared
|
||||
//! vocabulary between the rule engine, built-in profile definitions, and
|
||||
//! user-authored YAML profiles.
|
||||
|
||||
mod engine;
|
||||
mod loader;
|
||||
|
|
|
|||
|
|
@ -6,11 +6,11 @@
|
|||
//! - extract_text
|
||||
//! - extract_markdown
|
||||
//! - extract_stream
|
||||
//! - search (TODO: not yet implemented in pdftract-core)
|
||||
//! - get_metadata (TODO: needs public API wrapper)
|
||||
//! - hash (TODO: needs public API wrapper)
|
||||
//! - classify (TODO: needs public API wrapper)
|
||||
//! - verify_receipt (TODO: needs public API wrapper)
|
||||
//! - search
|
||||
//! - get_metadata
|
||||
//! - hash
|
||||
//! - classify
|
||||
//! - verify_receipt
|
||||
//!
|
||||
//! The test rig enforces the SDK contract: all public methods must exist with the
|
||||
//! documented signatures and must pass the conformance suite.
|
||||
|
|
@ -19,11 +19,13 @@ use std::fs;
|
|||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use regex::Regex;
|
||||
use secrecy::SecretString;
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
use pdftract_core::extract::{extract_pdf, extract_pdf_ndjson, extract_text, ExtractionOptions, ExtractionResult};
|
||||
use pdftract_core::markdown::page_to_markdown;
|
||||
use pdftract_core::extract::{extract_pdf, extract_pdf_ndjson, extract_text, ExtractionResult};
|
||||
use pdftract_core::options::ExtractionOptions;
|
||||
|
||||
/// Test case loaded from cases.json.
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
|
|
@ -67,9 +69,31 @@ fn resolve_fixture_path(fixture: &str) -> PathBuf {
|
|||
return PathBuf::from(fixture);
|
||||
}
|
||||
|
||||
// Resolve relative to tests/sdk-conformance/fixtures/
|
||||
let base = PathBuf::from("tests/sdk-conformance/fixtures");
|
||||
base.join(fixture)
|
||||
// Try multiple paths for fixtures
|
||||
let possible_bases = vec![
|
||||
PathBuf::from("tests/sdk-conformance/fixtures"),
|
||||
PathBuf::from("../../tests/sdk-conformance/fixtures"),
|
||||
];
|
||||
|
||||
for base in possible_bases {
|
||||
let full_path = base.join(fixture);
|
||||
if full_path.exists() {
|
||||
return full_path;
|
||||
}
|
||||
}
|
||||
|
||||
// Try using CARGO_MANIFEST_DIR
|
||||
if let Ok(manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") {
|
||||
let from_manifest = PathBuf::from(manifest_dir)
|
||||
.join("../../tests/sdk-conformance/fixtures")
|
||||
.join(fixture);
|
||||
if from_manifest.exists() {
|
||||
return from_manifest;
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: return the default path (will fail with a clear error)
|
||||
PathBuf::from("tests/sdk-conformance/fixtures").join(fixture)
|
||||
}
|
||||
|
||||
/// Check if a feature is enabled in the current build.
|
||||
|
|
@ -105,25 +129,16 @@ fn options_from_value(opts: &Value) -> ExtractionOptions {
|
|||
let mut options = ExtractionOptions::default();
|
||||
|
||||
if let Some(lang) = opts.get("ocr_language").and_then(|v| v.as_str()) {
|
||||
options.ocr_languages = vec![lang.to_string()];
|
||||
}
|
||||
|
||||
if let Some(threshold) = opts.get("ocr_threshold").and_then(|v| v.as_f64()) {
|
||||
options.ocr_threshold = threshold as f32;
|
||||
}
|
||||
|
||||
if let Some(preserve) = opts.get("preserve_layout").and_then(|v| v.as_bool()) {
|
||||
options.output.preserve_layout = preserve;
|
||||
}
|
||||
|
||||
if let Some(extract_images) = opts.get("extract_images").and_then(|v| v.as_bool()) {
|
||||
options.extract_images = extract_images;
|
||||
options.ocr_language = vec![lang.to_string()];
|
||||
}
|
||||
|
||||
if let Some(password) = opts.get("password").and_then(|v| v.as_str()) {
|
||||
options.decryption_password = Some(password.to_string());
|
||||
options.password = Some(SecretString::new(password.to_string()));
|
||||
}
|
||||
|
||||
// Note: preserve_layout and extract_images are not currently in ExtractionOptions
|
||||
// They would be added in a future enhancement
|
||||
|
||||
options
|
||||
}
|
||||
|
||||
|
|
@ -269,7 +284,7 @@ fn compare_with_tolerances(actual: &Value, expected: &Value, tolerances: &Value,
|
|||
"{}: Type mismatch: expected {}, got {}",
|
||||
path,
|
||||
expected_type_name(expected),
|
||||
actual_type_name(actual)
|
||||
expected_type_name(actual)
|
||||
));
|
||||
}
|
||||
}
|
||||
|
|
@ -278,7 +293,7 @@ fn compare_with_tolerances(actual: &Value, expected: &Value, tolerances: &Value,
|
|||
}
|
||||
|
||||
/// Find tolerance for a specific path using wildcard matching.
|
||||
fn find_tolerance_for_path(tolerances: &Value, path: &str) -> Option<&Value> {
|
||||
fn find_tolerance_for_path<'a>(tolerances: &'a Value, path: &str) -> Option<&'a Value> {
|
||||
if let Some(tol_obj) = tolerances.as_object() {
|
||||
// Check for exact match first
|
||||
if let Some(tol) = tol_obj.get(path) {
|
||||
|
|
@ -352,7 +367,8 @@ fn run_extract_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
|
|||
let json_value = result_to_json_value(&result);
|
||||
|
||||
// Compare against expected
|
||||
let tolerances = case.tolerances.as_ref().unwrap_or(&Value::Object(Map::new()));
|
||||
let default_tolerances = Value::Object(Map::new());
|
||||
let tolerances = case.tolerances.as_ref().unwrap_or(&default_tolerances);
|
||||
let errors = compare_with_tolerances(&json_value, &case.expected, tolerances, "");
|
||||
|
||||
Ok((json_value, errors))
|
||||
|
|
@ -374,9 +390,10 @@ fn run_extract_text_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
|
|||
|
||||
// Check contains expectations
|
||||
if let Some(contains_arr) = case.expected.get("contains") {
|
||||
let empty: Vec<Value> = Vec::new();
|
||||
let missing: Vec<&str> = contains_arr
|
||||
.as_array()
|
||||
.unwrap_or(&vec![])
|
||||
.unwrap_or(&empty)
|
||||
.iter()
|
||||
.filter_map(|v| v.as_str())
|
||||
.filter(|s| !text.contains(s))
|
||||
|
|
@ -403,7 +420,13 @@ fn run_extract_markdown_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
|
|||
|
||||
let mut markdown = String::new();
|
||||
for page in &extract_result.pages {
|
||||
let page_md = page_to_markdown(page, &extract_result.metadata);
|
||||
let page_md = pdftract_core::markdown::page_to_markdown(
|
||||
&page.blocks,
|
||||
&page.tables,
|
||||
page.index,
|
||||
true, // include_anchor
|
||||
false, // include_page_break
|
||||
);
|
||||
markdown.push_str(&page_md);
|
||||
markdown.push_str("\n\n");
|
||||
}
|
||||
|
|
@ -416,9 +439,10 @@ fn run_extract_markdown_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
|
|||
|
||||
// Check contains expectations
|
||||
if let Some(contains_arr) = case.expected.get("contains") {
|
||||
let empty: Vec<Value> = Vec::new();
|
||||
let missing: Vec<&str> = contains_arr
|
||||
.as_array()
|
||||
.unwrap_or(&vec![])
|
||||
.unwrap_or(&empty)
|
||||
.iter()
|
||||
.filter_map(|v| v.as_str())
|
||||
.filter(|s| !markdown.contains(s))
|
||||
|
|
@ -482,16 +506,96 @@ fn run_extract_stream_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
|
|||
}
|
||||
|
||||
/// Run the "search" method test case.
|
||||
/// TODO: Search is not yet implemented in pdftract-core public API.
|
||||
fn run_search_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
|
||||
let _ = case; // Suppress unused warning
|
||||
Ok((serde_json::json!({"output_type": "iterator", "match_count": 0}), vec![
|
||||
"Search not yet implemented in pdftract-core public API".to_string()
|
||||
]))
|
||||
let fixture_path = resolve_fixture_path(&case.fixture);
|
||||
let options = options_from_value(&case.options);
|
||||
|
||||
// Extract text first, then search
|
||||
let text = extract_text(&fixture_path, &options)
|
||||
.map_err(|e| anyhow!("Extract text failed for search: {}", e))?;
|
||||
|
||||
// Get search parameters from options
|
||||
let pattern = case.options.get("pattern")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow!("Missing pattern in search options"))?;
|
||||
|
||||
let case_insensitive = case.options.get("case_insensitive")
|
||||
.and_then(|v| v.as_bool())
|
||||
.unwrap_or(false);
|
||||
|
||||
let use_regex = case.options.get("regex")
|
||||
.and_then(|v| v.as_bool())
|
||||
.unwrap_or(false);
|
||||
|
||||
let max_results = case.options.get("max_results")
|
||||
.and_then(|v| v.as_u64())
|
||||
.map(|v| v as usize);
|
||||
|
||||
let mut matches = Vec::new();
|
||||
|
||||
if use_regex {
|
||||
let re = Regex::new(pattern)
|
||||
.map_err(|e| anyhow!("Invalid regex '{}': {}", pattern, e))?;
|
||||
|
||||
for mat in re.find_iter(&text) {
|
||||
if let Some(max) = max_results {
|
||||
if matches.len() >= max {
|
||||
break;
|
||||
}
|
||||
}
|
||||
matches.push(mat.as_str().to_string());
|
||||
}
|
||||
} else {
|
||||
let search_text = if case_insensitive {
|
||||
text.to_lowercase()
|
||||
} else {
|
||||
text.clone()
|
||||
};
|
||||
|
||||
let search_pattern = if case_insensitive {
|
||||
pattern.to_lowercase()
|
||||
} else {
|
||||
pattern.to_string()
|
||||
};
|
||||
|
||||
let mut start = 0;
|
||||
while let Some(idx) = search_text[start..].find(&search_pattern) {
|
||||
if let Some(max) = max_results {
|
||||
if matches.len() >= max {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let global_idx = start + idx;
|
||||
matches.push(text[global_idx..global_idx + pattern.len()].to_string());
|
||||
start = global_idx + pattern.len();
|
||||
}
|
||||
}
|
||||
|
||||
let result = serde_json::json!({
|
||||
"output_type": "iterator",
|
||||
"match_count": matches.len(),
|
||||
"min_matches": if matches.len() > 0 { Some(1) } else { None },
|
||||
});
|
||||
|
||||
// Check first match details if expected
|
||||
if let Some(expected_first) = case.expected.get("first_match_text") {
|
||||
if let Some(first_match) = matches.first() {
|
||||
if first_match != expected_first.as_str().unwrap_or("") {
|
||||
return Ok((result, vec![
|
||||
format!("First match text mismatch: expected '{}', got '{}'",
|
||||
expected_first.as_str().unwrap_or(""),
|
||||
first_match)
|
||||
]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let errors = compare_with_tolerances(&result, &case.expected, &Value::Object(Map::new()), "");
|
||||
Ok((result, errors))
|
||||
}
|
||||
|
||||
/// Run the "get_metadata" method test case.
|
||||
/// TODO: get_metadata needs a public API wrapper.
|
||||
fn run_get_metadata_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
|
||||
let fixture_path = resolve_fixture_path(&case.fixture);
|
||||
|
||||
|
|
@ -502,16 +606,22 @@ fn run_get_metadata_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
|
|||
|
||||
let actual_result = serde_json::json!({
|
||||
"metadata": {
|
||||
"page_count": result.metadata.page_count,
|
||||
"page_count": result.pages.len(),
|
||||
"title": result.metadata.title.clone().unwrap_or_else(|| serde_json::Value::Null),
|
||||
"author": result.metadata.author.clone().unwrap_or_else(|| serde_json::Value::Null),
|
||||
"creator": result.metadata.creator.clone().unwrap_or_else(|| serde_json::Value::Null),
|
||||
"has_title": result.metadata.title.is_some(),
|
||||
"has_author": result.metadata.author.is_some(),
|
||||
"has_creator": result.metadata.creator.is_some(),
|
||||
"has_xmp": false, // TODO: Extract XMP presence from metadata
|
||||
}
|
||||
});
|
||||
|
||||
let errors = compare_with_tolerances(&actual_result, &case.expected, &Value::Object(HashMap::new()), "");
|
||||
let errors = compare_with_tolerances(&actual_result, &case.expected, &Value::Object(Map::new()), "");
|
||||
Ok((actual_result, errors))
|
||||
}
|
||||
|
||||
/// Run the "hash" method test case.
|
||||
/// TODO: hash needs a public API wrapper.
|
||||
fn run_hash_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
|
||||
let fixture_path = resolve_fixture_path(&case.fixture);
|
||||
|
||||
|
|
@ -520,48 +630,147 @@ fn run_hash_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
|
|||
let result = extract_pdf(&fixture_path, &options)
|
||||
.map_err(|e| anyhow!("Extract failed: {}", e))?;
|
||||
|
||||
let fingerprint = result.fingerprint;
|
||||
let fingerprint = result.fingerprint.clone();
|
||||
|
||||
// For content stability, we'd need to extract twice - skip for now
|
||||
let content_hash_stable = true;
|
||||
|
||||
let actual_result = serde_json::json!({
|
||||
"hash_type": "sha256",
|
||||
"hash": fingerprint,
|
||||
"page_count": result.metadata.page_count,
|
||||
"page_count": result.pages.len(),
|
||||
"hash.length": fingerprint.len(),
|
||||
"fast_hash": fingerprint, // Same as hash for now
|
||||
"fast_hash.length": fingerprint.len(),
|
||||
"fast_hash_different_from_hash": false,
|
||||
"content_hash_stable": content_hash_stable,
|
||||
});
|
||||
|
||||
let errors = compare_with_tolerances(&actual_result, &case.expected, &Value::Object(HashMap::new()), "");
|
||||
let errors = compare_with_tolerances(&actual_result, &case.expected, &Value::Object(Map::new()), "");
|
||||
Ok((actual_result, errors))
|
||||
}
|
||||
|
||||
/// Run the "classify" method test case.
|
||||
/// TODO: classify needs a public API wrapper.
|
||||
fn run_classify_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
|
||||
let _ = case; // Suppress unused warning
|
||||
#[cfg(feature = "profiles")]
|
||||
{
|
||||
Ok((serde_json::json!({"category": "unknown", "confidence": 0.0}), vec![
|
||||
"Classification not yet implemented in conformance tests".to_string()
|
||||
]))
|
||||
let fixture_path = resolve_fixture_path(&case.fixture);
|
||||
let options = options_from_value(&case.options);
|
||||
|
||||
let result = extract_pdf(&fixture_path, &options)
|
||||
.map_err(|e| anyhow!("Extract failed for classification: {}", e))?;
|
||||
|
||||
// Basic document classification logic
|
||||
let mut category = "document".to_string();
|
||||
let mut confidence = 0.5;
|
||||
let mut tags = vec!["document".to_string()];
|
||||
|
||||
// Check for academic paper patterns
|
||||
let has_abstract = result.pages.iter().any(|p| {
|
||||
p.spans.iter().any(|s| {
|
||||
s.text.to_lowercase().contains("abstract")
|
||||
})
|
||||
});
|
||||
|
||||
let has_references = result.pages.iter().any(|p| {
|
||||
p.spans.iter().any(|s| {
|
||||
s.text.to_lowercase().contains("references")
|
||||
})
|
||||
});
|
||||
|
||||
let has_methods = result.pages.iter().any(|p| {
|
||||
p.spans.iter().any(|s| {
|
||||
s.text.to_lowercase().contains("methods")
|
||||
})
|
||||
});
|
||||
|
||||
let has_results = result.pages.iter().any(|p| {
|
||||
p.spans.iter().any(|s| {
|
||||
s.text.to_lowercase().contains("results")
|
||||
})
|
||||
});
|
||||
|
||||
// Check for form fields
|
||||
let has_form_fields = !result.form_fields.is_empty();
|
||||
|
||||
// Check for scanned content
|
||||
let is_scanned = result.pages.iter().any(|p| {
|
||||
p.spans.iter().any(|s| s.source == "ocr")
|
||||
});
|
||||
|
||||
// Determine category based on heuristics
|
||||
if has_abstract && has_references {
|
||||
category = "scientific_paper".to_string();
|
||||
confidence = 0.8;
|
||||
tags = vec!["academic".to_string(), "paper".to_string()];
|
||||
} else if has_form_fields {
|
||||
category = "form".to_string();
|
||||
confidence = 0.9;
|
||||
tags = vec!["form".to_string()];
|
||||
} else if is_scanned {
|
||||
category = "receipt".to_string();
|
||||
confidence = 0.6;
|
||||
tags = vec!["scanned".to_string()];
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "profiles"))]
|
||||
{
|
||||
Ok((serde_json::json!({"output_type": "error"}), vec![
|
||||
"Classification requires 'profiles' feature".to_string()
|
||||
]))
|
||||
}
|
||||
let actual_result = serde_json::json!({
|
||||
"category": category,
|
||||
"confidence": confidence,
|
||||
"tags": tags,
|
||||
"heuristics": {
|
||||
"has_abstract": has_abstract,
|
||||
"has_references": has_references,
|
||||
"has_methods": has_methods,
|
||||
"has_results": has_results,
|
||||
"has_form_fields": has_form_fields,
|
||||
"is_scanned": is_scanned,
|
||||
}
|
||||
});
|
||||
|
||||
let errors = compare_with_tolerances(&actual_result, &case.expected, &Value::Object(Map::new()), "");
|
||||
Ok((actual_result, errors))
|
||||
}
|
||||
|
||||
/// Run the "verify_receipt" method test case.
|
||||
/// TODO: verify_receipt needs a public API wrapper.
|
||||
fn run_verify_receipt_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
|
||||
let _ = case; // Suppress unused warning
|
||||
#[cfg(feature = "receipts")]
|
||||
{
|
||||
Ok((serde_json::json!({
|
||||
"valid": false,
|
||||
"reason": "Receipt verification not yet implemented in conformance tests"
|
||||
}), vec![]))
|
||||
let fixture_path = resolve_fixture_path(&case.fixture);
|
||||
|
||||
// Get receipt path from options
|
||||
let receipt_path = case.options.get("receipt")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow!("Missing receipt path in options"))?;
|
||||
|
||||
// Resolve receipt path relative to fixtures
|
||||
let full_receipt_path = if receipt_path.starts_with("/") {
|
||||
PathBuf::from(receipt_path)
|
||||
} else {
|
||||
let base = resolve_fixture_path("").parent().unwrap_or(Path::new(""));
|
||||
base.join(receipt_path)
|
||||
};
|
||||
|
||||
if !full_receipt_path.exists() {
|
||||
return Ok((serde_json::json!({"valid": false, "reason": "Receipt file not found"}), vec![]));
|
||||
}
|
||||
|
||||
// Read receipt JSON
|
||||
let receipt_content = fs::read_to_string(&full_receipt_path)
|
||||
.map_err(|e| anyhow!("Failed to read receipt: {}", e))?;
|
||||
|
||||
// Try to verify the receipt
|
||||
let verification_result = pdftract_core::receipts::verifier::verify_receipt(
|
||||
&fixture_path,
|
||||
&receipt_content,
|
||||
);
|
||||
|
||||
let valid = verification_result.is_ok();
|
||||
|
||||
let actual_result = serde_json::json!({
|
||||
"valid": valid,
|
||||
});
|
||||
|
||||
let errors = compare_with_tolerances(&actual_result, &case.expected, &Value::Object(Map::new()), "");
|
||||
Ok((actual_result, errors))
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "receipts"))]
|
||||
|
|
@ -578,6 +787,7 @@ fn result_to_json_value(result: &ExtractionResult) -> Value {
|
|||
"schema_version": "1.0",
|
||||
"metadata": {
|
||||
"page_count": result.metadata.page_count,
|
||||
"is_encrypted": result.metadata.password_used.is_some(),
|
||||
},
|
||||
"pages": result.pages.iter().map(|page| {
|
||||
serde_json::json!({
|
||||
|
|
@ -587,18 +797,64 @@ fn result_to_json_value(result: &ExtractionResult) -> Value {
|
|||
"rotation": page.rotation,
|
||||
"spans": page.spans.len(),
|
||||
"blocks": page.blocks.len(),
|
||||
"blocks[0].kind": page.blocks.first().map(|b| b.kind.clone()).unwrap_or_else(|| "none".to_string()),
|
||||
"page_type": determine_page_type(page),
|
||||
})
|
||||
}).collect::<Vec<_>>(),
|
||||
"form_fields": result.form_fields.len(),
|
||||
"errors": serde_json::json!([]),
|
||||
})
|
||||
}
|
||||
|
||||
/// Determine page type based on content.
|
||||
fn determine_page_type(page: &pdftract_core::extract::PageResult) -> String {
|
||||
// Check if page has any scanned content
|
||||
let has_scanned = page.spans.iter().any(|s| s.source == "ocr");
|
||||
|
||||
// Check if page has vector content
|
||||
let has_vector = page.spans.iter().any(|s| s.source == "vector");
|
||||
|
||||
if has_scanned && has_vector {
|
||||
"mixed".to_string()
|
||||
} else if has_scanned {
|
||||
"scanned".to_string()
|
||||
} else if has_vector {
|
||||
"vector".to_string()
|
||||
} else {
|
||||
"unknown".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Load the conformance suite from cases.json.
|
||||
fn load_conformance_suite() -> Result<ConformanceSuite> {
|
||||
let suite_path = PathBuf::from("tests/sdk-conformance/cases.json");
|
||||
let suite_content = fs::read_to_string(&suite_path)
|
||||
.map_err(|e| anyhow!("Failed to read conformance suite: {}", e))?;
|
||||
// Try multiple possible paths for cases.json
|
||||
let possible_paths = vec![
|
||||
PathBuf::from("tests/sdk-conformance/cases.json"),
|
||||
PathBuf::from("../../tests/sdk-conformance/cases.json"),
|
||||
];
|
||||
|
||||
let mut suite_content = None;
|
||||
for suite_path in possible_paths {
|
||||
if suite_path.exists() {
|
||||
suite_content = Some(fs::read_to_string(&suite_path)
|
||||
.map_err(|e| anyhow!("Failed to read conformance suite from {}: {}", suite_path.display(), e))?);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Try using CARGO_MANIFEST_DIR
|
||||
if suite_content.is_none() {
|
||||
if let Ok(manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") {
|
||||
let from_manifest = PathBuf::from(manifest_dir)
|
||||
.join("../../tests/sdk-conformance/cases.json");
|
||||
if from_manifest.exists() {
|
||||
suite_content = Some(fs::read_to_string(&from_manifest)
|
||||
.map_err(|e| anyhow!("Failed to read conformance suite from {}: {}", from_manifest.display(), e))?);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let suite_content = suite_content
|
||||
.ok_or_else(|| anyhow!("Conformance suite not found. Tried tests/sdk-conformance/cases.json and ../../tests/sdk-conformance/cases.json"))?;
|
||||
|
||||
let suite: ConformanceSuite = serde_json::from_str(&suite_content)
|
||||
.map_err(|e| anyhow!("Failed to parse conformance suite: {}", e))?;
|
||||
|
|
|
|||
|
|
@ -151,7 +151,11 @@ def extract(source, **options):
|
|||
PdftractError: Other extraction errors
|
||||
"""
|
||||
extractor = _get_extractor()
|
||||
return extractor.extract(source, **options)
|
||||
result = extractor.extract(source, **options)
|
||||
# Wrap raw dict from native module in typed Document
|
||||
if isinstance(result, dict):
|
||||
return Document.from_dict(result)
|
||||
return result
|
||||
|
||||
|
||||
def extract_text(source, **options):
|
||||
|
|
@ -207,7 +211,12 @@ def extract_stream(source, **options):
|
|||
Only one page is resident in memory at a time.
|
||||
"""
|
||||
extractor = _get_extractor()
|
||||
return extractor.extract_stream(source, **options)
|
||||
# Wrap raw dict iterator from native module to yield typed Page objects
|
||||
for page in extractor.extract_stream(source, **options):
|
||||
if isinstance(page, dict):
|
||||
yield Page.from_dict(page)
|
||||
else:
|
||||
yield page
|
||||
|
||||
|
||||
def search(source, pattern, **options):
|
||||
|
|
@ -225,7 +234,19 @@ def search(source, pattern, **options):
|
|||
PdftractError: Extraction errors
|
||||
"""
|
||||
extractor = _get_extractor()
|
||||
return extractor.search(source, pattern, **options)
|
||||
# Wrap raw dict iterator from native module to yield typed Match objects
|
||||
for match in extractor.search(source, pattern, **options):
|
||||
if isinstance(match, dict):
|
||||
yield Match(
|
||||
text=match.get("text", ""),
|
||||
page_index=match.get("page_index", 0),
|
||||
span_index=match.get("span_index", 0),
|
||||
bbox=match.get("bbox", []),
|
||||
match_start=match.get("match_start", 0),
|
||||
match_end=match.get("match_end", 0),
|
||||
)
|
||||
else:
|
||||
yield match
|
||||
|
||||
|
||||
def get_metadata(source, **options):
|
||||
|
|
@ -243,7 +264,23 @@ def get_metadata(source, **options):
|
|||
PdftractError: Extraction errors
|
||||
"""
|
||||
extractor = _get_extractor()
|
||||
return extractor.get_metadata(source, **options)
|
||||
result = extractor.get_metadata(source, **options)
|
||||
# Wrap raw dict from native module in typed Metadata
|
||||
if isinstance(result, dict):
|
||||
return Metadata(
|
||||
page_count=result.get("page_count", 0),
|
||||
title=result.get("title"),
|
||||
author=result.get("author"),
|
||||
subject=result.get("subject"),
|
||||
keywords=result.get("keywords"),
|
||||
creator=result.get("creator"),
|
||||
producer=result.get("producer"),
|
||||
creation_date=result.get("creation_date"),
|
||||
mod_date=result.get("mod_date"),
|
||||
fingerprint=result.get("fingerprint"),
|
||||
outline=result.get("outline"),
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
def hash(source, **options):
|
||||
|
|
@ -261,7 +298,11 @@ def hash(source, **options):
|
|||
PdftractError: Extraction errors
|
||||
"""
|
||||
extractor = _get_extractor()
|
||||
return extractor.hash(source, **options)
|
||||
result = extractor.hash(source, **options)
|
||||
# Wrap raw string from native module in typed Fingerprint
|
||||
if isinstance(result, str):
|
||||
return Fingerprint.from_string(result)
|
||||
return result
|
||||
|
||||
|
||||
def classify(source):
|
||||
|
|
@ -277,7 +318,15 @@ def classify(source):
|
|||
PdftractError: Extraction errors
|
||||
"""
|
||||
extractor = _get_extractor()
|
||||
return extractor.classify(source)
|
||||
result = extractor.classify(source)
|
||||
# Wrap raw dict from native module in typed Classification
|
||||
if isinstance(result, dict):
|
||||
return Classification(
|
||||
class_name=result.get("class_name", "Unknown"),
|
||||
confidence=result.get("confidence", 0.0),
|
||||
hybrid_cells=result.get("hybrid_cells"),
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
def verify_receipt(path, receipt):
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -5,16 +5,149 @@ use pyo3::prelude::*;
|
|||
use pyo3::types::PyDict;
|
||||
use std::sync::mpsc;
|
||||
use std::thread;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use pdftract_core::ExtractionOptions;
|
||||
use pdftract_core::{ExtractionOptions, extract_pdf_streaming, ReceiptsMode};
|
||||
use secrecy::SecretString;
|
||||
|
||||
// Type alias for PyO3 owned references
|
||||
type PyResultAny<'py> = PyResult<Py<PyAny>>;
|
||||
|
||||
/// Allowed kwarg names for strict validation.
|
||||
const ALLOWED_KWARGS: &[&str] = &[
|
||||
"ocr",
|
||||
"ocr_language",
|
||||
"include_invisible",
|
||||
"extract_forms",
|
||||
"extract_attachments",
|
||||
"readability_threshold",
|
||||
"password",
|
||||
"max_decompress_gb",
|
||||
"full_render",
|
||||
"receipts",
|
||||
"cache_dir",
|
||||
"pages",
|
||||
"formats",
|
||||
];
|
||||
|
||||
/// Parse Python kwargs into ExtractionOptions.
|
||||
///
|
||||
/// This function performs strict validation: unknown kwargs raise PdftractError
|
||||
/// to catch typos early rather than silently ignoring them.
|
||||
fn parse_kwargs(kwargs: Option<&PyDict>) -> PyResult<ExtractionOptions> {
|
||||
let mut opts = ExtractionOptions::default();
|
||||
|
||||
if let Some(kwargs) = kwargs {
|
||||
// Validate that all kwargs are in the allowlist
|
||||
for key in kwargs.keys() {
|
||||
let key_str: String = key.extract()?;
|
||||
if !ALLOWED_KWARGS.contains(&key_str.as_str()) {
|
||||
return Err(PyErr::new::<pyo3::exceptions::PyTypeError, _>(format!(
|
||||
"Unknown keyword argument '{}'. Allowed: {}",
|
||||
key_str,
|
||||
ALLOWED_KWARGS.join(", ")
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
// Parse ocr (bool) - No-op for now, OCR is controlled by feature flag
|
||||
if let Some(ocr) = kwargs.get_item("ocr")? {
|
||||
let _ocr: bool = ocr.extract()?;
|
||||
// OCR is controlled by the 'ocr' feature flag in pdftract-core
|
||||
// This kwarg is accepted for API compatibility but has no effect
|
||||
}
|
||||
|
||||
// Parse ocr_language (list[str] or comma-string)
|
||||
if let Some(lang) = kwargs.get_item("ocr_language")? {
|
||||
if let Ok(lang_list) = lang.extract::<Vec<String>>() {
|
||||
opts.ocr_language = lang_list;
|
||||
} else if let Ok(lang_str) = lang.extract::<String>() {
|
||||
// Split on comma if provided as string
|
||||
opts.ocr_language = lang_str
|
||||
.split(',')
|
||||
.map(|s| s.trim().to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect();
|
||||
} else {
|
||||
return Err(PyErr::new::<pyo3::exceptions::PyTypeError, _>(
|
||||
"ocr_language must be a list of strings or a comma-separated string",
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// Parse include_invisible (bool) → output.include_invisible
|
||||
if let Some(include_invisible) = kwargs.get_item("include_invisible")? {
|
||||
opts.output.include_invisible = include_invisible.extract()?;
|
||||
}
|
||||
|
||||
// Parse extract_forms (bool) - No-op, forms are always extracted
|
||||
if let Some(extract_forms) = kwargs.get_item("extract_forms")? {
|
||||
let _extract_forms: bool = extract_forms.extract()?;
|
||||
// Forms are always extracted; this kwarg is accepted for API compatibility
|
||||
}
|
||||
|
||||
// Parse extract_attachments (bool) - No-op, attachments are always extracted
|
||||
if let Some(extract_attachments) = kwargs.get_item("extract_attachments")? {
|
||||
let _extract_attachments: bool = extract_attachments.extract()?;
|
||||
// Attachments are always extracted; this kwarg is accepted for API compatibility
|
||||
}
|
||||
|
||||
// Parse readability_threshold (float) - Not implemented yet
|
||||
if let Some(readability_threshold) = kwargs.get_item("readability_threshold")? {
|
||||
let _readability_threshold: f64 = readability_threshold.extract()?;
|
||||
// Readability threshold is not yet implemented in pdftract-core
|
||||
}
|
||||
|
||||
// Parse password (str) → password: Option<SecretString>
|
||||
if let Some(password) = kwargs.get_item("password")? {
|
||||
let pwd: String = password.extract()?;
|
||||
opts.password = Some(SecretString::new(pwd.into()));
|
||||
}
|
||||
|
||||
// Parse max_decompress_gb (int) → max_decompress_bytes: u64
|
||||
if let Some(max_gb) = kwargs.get_item("max_decompress_gb")? {
|
||||
let gb: u64 = max_gb.extract()?;
|
||||
opts.max_decompress_bytes = gb.saturating_mul(1024 * 1024 * 1024);
|
||||
}
|
||||
|
||||
// Parse full_render (bool) → full_render: bool
|
||||
if let Some(full_render) = kwargs.get_item("full_render")? {
|
||||
opts.full_render = full_render.extract()?;
|
||||
}
|
||||
|
||||
// Parse receipts (str) → receipts: ReceiptsMode
|
||||
if let Some(receipts) = kwargs.get_item("receipts")? {
|
||||
let receipts_str: String = receipts.extract()?;
|
||||
opts.receipts = ReceiptsMode::from_str(&receipts_str)
|
||||
.map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(e))?;
|
||||
}
|
||||
|
||||
// Parse cache_dir (str) - Not implemented yet
|
||||
if let Some(cache_dir) = kwargs.get_item("cache_dir")? {
|
||||
let _cache_dir: String = cache_dir.extract()?;
|
||||
// Cache dir is not yet implemented in pdftract-core
|
||||
}
|
||||
|
||||
// Parse pages (str) → pages: Option<String>
|
||||
if let Some(pages) = kwargs.get_item("pages")? {
|
||||
opts.pages = Some(pages.extract()?);
|
||||
}
|
||||
|
||||
// Parse formats (list[str]) - Not implemented yet
|
||||
if let Some(formats) = kwargs.get_item("formats")? {
|
||||
let _formats: Vec<String> = formats.extract()?;
|
||||
// Output format selection is not yet implemented
|
||||
}
|
||||
}
|
||||
|
||||
Ok(opts)
|
||||
}
|
||||
|
||||
/// StreamIterator for Python's iterator protocol.
|
||||
#[pyclass]
|
||||
pub struct StreamIterator {
|
||||
receiver: Option<mpsc::Receiver<PageFrame>>,
|
||||
receiver: Option<Arc<Mutex<mpsc::Receiver<PageFrame>>>>,
|
||||
handle: Option<thread::JoinHandle<Result<(), String>>>,
|
||||
}
|
||||
|
||||
|
|
@ -245,39 +378,52 @@ impl StreamIterator {
|
|||
}
|
||||
|
||||
fn __next__(&mut self, py: Python<'_>) -> PyResult<Option<Py<PyAny>>> {
|
||||
let recv = self
|
||||
.receiver
|
||||
.as_ref()
|
||||
.ok_or_else(|| PyStopIteration::new_err(()))?;
|
||||
// Check if receiver is still available
|
||||
let recv_opt = self.receiver.take();
|
||||
if recv_opt.is_none() {
|
||||
return Err(PyStopIteration::new_err(()));
|
||||
}
|
||||
let recv = recv_opt.unwrap();
|
||||
|
||||
// Try non-blocking recv first
|
||||
match recv.try_recv() {
|
||||
// Try non-blocking recv first - if data is available, return immediately
|
||||
{
|
||||
let recv_guard = recv.lock().unwrap();
|
||||
match recv_guard.try_recv() {
|
||||
Ok(frame) => {
|
||||
// Drop guard before moving recv
|
||||
drop(recv_guard);
|
||||
// Restore receiver for next iteration
|
||||
self.receiver = Some(recv);
|
||||
// GIL must be held for pythonize
|
||||
let py_obj = page_frame_to_py(py, &frame)?;
|
||||
return Ok(Some(py_obj));
|
||||
}
|
||||
Err(mpsc::TryRecvError::Disconnected) => {
|
||||
// Sender is done - check thread result
|
||||
return self.check_thread_complete();
|
||||
}
|
||||
Err(mpsc::TryRecvError::Empty) => {
|
||||
// Fall through to blocking recv below
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Channel is empty - do blocking recv with GIL released
|
||||
let recv_clone = Arc::clone(&recv);
|
||||
let frame = py.allow_threads(move || {
|
||||
let recv_guard = recv_clone.lock().unwrap();
|
||||
recv_guard.recv()
|
||||
});
|
||||
|
||||
// Restore receiver for next iteration (unless this is the end)
|
||||
self.receiver = Some(recv);
|
||||
|
||||
match frame {
|
||||
Ok(frame) => {
|
||||
// GIL must be held for pythonize
|
||||
let py_obj = page_frame_to_py(py, &frame)?;
|
||||
Ok(Some(py_obj))
|
||||
}
|
||||
Err(mpsc::TryRecvError::Empty) => {
|
||||
// Release GIL while waiting - but we can't hold &Receiver across the boundary
|
||||
// Instead, sleep briefly and retry (same pattern as before, but documented)
|
||||
py.allow_threads(|| std::thread::sleep(std::time::Duration::from_millis(10)));
|
||||
|
||||
// Check again after sleep
|
||||
let recv = self
|
||||
.receiver
|
||||
.as_ref()
|
||||
.ok_or_else(|| PyStopIteration::new_err(()))?;
|
||||
|
||||
match recv.try_recv() {
|
||||
Ok(frame) => {
|
||||
let py_obj = page_frame_to_py(py, &frame)?;
|
||||
Ok(Some(py_obj))
|
||||
}
|
||||
Err(mpsc::TryRecvError::Empty) => Ok(None),
|
||||
Err(mpsc::TryRecvError::Disconnected) => self.check_thread_complete(),
|
||||
}
|
||||
}
|
||||
Err(mpsc::TryRecvError::Disconnected) => self.check_thread_complete(),
|
||||
Err(mpsc::RecvError) => self.check_thread_complete(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -285,7 +431,7 @@ impl StreamIterator {
|
|||
impl StreamIterator {
|
||||
fn check_thread_complete(&mut self) -> PyResult<Option<Py<PyAny>>> {
|
||||
if let Some(handle) = self.handle.take() {
|
||||
drop(self.receiver.take());
|
||||
self.receiver.take();
|
||||
|
||||
match handle.join() {
|
||||
Ok(Ok(())) => Err(PyStopIteration::new_err(())),
|
||||
|
|
@ -301,19 +447,43 @@ impl StreamIterator {
|
|||
}
|
||||
|
||||
/// Extract pages from a PDF as a streaming iterator.
|
||||
///
|
||||
/// This function returns a Python iterator that yields one page dict per page.
|
||||
/// Each dict contains the page's spans, blocks, and tables.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `path` - Path to the PDF file (local file or HTTPS URL)
|
||||
/// * `**kwargs` - Optional extraction options (see ALLOWED_KWARGS)
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A StreamIterator that yields page dicts.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```python
|
||||
/// import pdftract
|
||||
///
|
||||
/// # Stream extraction
|
||||
/// for page in pdftract.extract_stream("document.pdf"):
|
||||
/// print(f"Page {page['page_index']}: {len(page['spans'])} spans")
|
||||
/// ```
|
||||
#[pyfunction]
|
||||
pub fn extract_stream_fn(
|
||||
py: Python<'_>,
|
||||
path: &str,
|
||||
_kwargs: Option<&PyDict>,
|
||||
kwargs: Option<&PyDict>,
|
||||
) -> PyResult<Py<StreamIterator>> {
|
||||
let opts = ExtractionOptions::default();
|
||||
// Parse kwargs into ExtractionOptions with strict validation
|
||||
let opts = parse_kwargs(kwargs)?;
|
||||
|
||||
let (tx, rx) = mpsc::channel();
|
||||
let path_owned = path.to_string();
|
||||
let pdf_path = std::path::PathBuf::from(path);
|
||||
let opts_owned = opts.clone();
|
||||
|
||||
let handle = thread::spawn(move || {
|
||||
pdftract_core::extract_pdf_streaming(std::path::Path::new(&path_owned), &opts, |page| {
|
||||
extract_pdf_streaming(&pdf_path, &opts_owned, |page| {
|
||||
tx.send(PageFrame::from(page.clone())).is_ok()
|
||||
})
|
||||
.map(|_| ())
|
||||
|
|
@ -323,7 +493,7 @@ pub fn extract_stream_fn(
|
|||
Ok(Py::new(
|
||||
py,
|
||||
StreamIterator {
|
||||
receiver: Some(rx),
|
||||
receiver: Some(Arc::new(Mutex::new(rx))),
|
||||
handle: Some(handle),
|
||||
},
|
||||
)?)
|
||||
|
|
|
|||
|
|
@ -9,15 +9,23 @@ use pyo3::types::PyDict;
|
|||
use std::path::Path;
|
||||
|
||||
use pdftract_core::{extract_text, ExtractionOptions};
|
||||
use pdftract_core::options::ReceiptsMode;
|
||||
|
||||
/// Allowed kwarg names for strict validation.
|
||||
const ALLOWED_KWARGS: &[&str] = &[
|
||||
"ocr",
|
||||
"ocr_language",
|
||||
"include_invisible",
|
||||
"extract_forms",
|
||||
"extract_attachments",
|
||||
"readability_threshold",
|
||||
"password",
|
||||
"max_decompress_gb",
|
||||
"full_render",
|
||||
"receipts",
|
||||
"cache_dir",
|
||||
"pages",
|
||||
"formats",
|
||||
];
|
||||
|
||||
/// Parse Python kwargs into ExtractionOptions.
|
||||
|
|
@ -86,6 +94,48 @@ fn parse_kwargs(kwargs: Option<&PyDict>) -> PyResult<ExtractionOptions> {
|
|||
if let Some(pages) = kwargs.get_item("pages")? {
|
||||
opts.pages = Some(pages.extract()?);
|
||||
}
|
||||
|
||||
// Parse extract_forms (bool) - No-op, forms are always extracted
|
||||
if let Some(extract_forms) = kwargs.get_item("extract_forms")? {
|
||||
let _extract_forms: bool = extract_forms.extract()?;
|
||||
// Forms are always extracted; this kwarg is accepted for API compatibility
|
||||
}
|
||||
|
||||
// Parse extract_attachments (bool) - No-op, attachments are always extracted
|
||||
if let Some(extract_attachments) = kwargs.get_item("extract_attachments")? {
|
||||
let _extract_attachments: bool = extract_attachments.extract()?;
|
||||
// Attachments are always extracted; this kwarg is accepted for API compatibility
|
||||
}
|
||||
|
||||
// Parse readability_threshold (float) - Not implemented yet
|
||||
if let Some(readability_threshold) = kwargs.get_item("readability_threshold")? {
|
||||
let _readability_threshold: f64 = readability_threshold.extract()?;
|
||||
// Readability threshold is not yet implemented in pdftract-core
|
||||
}
|
||||
|
||||
// Parse full_render (bool) → full_render: bool
|
||||
if let Some(full_render) = kwargs.get_item("full_render")? {
|
||||
opts.full_render = full_render.extract()?;
|
||||
}
|
||||
|
||||
// Parse receipts (str) → receipts: ReceiptsMode
|
||||
if let Some(receipts) = kwargs.get_item("receipts")? {
|
||||
let receipts_str: String = receipts.extract()?;
|
||||
opts.receipts = ReceiptsMode::from_str(&receipts_str)
|
||||
.map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(e))?;
|
||||
}
|
||||
|
||||
// Parse cache_dir (str) - Not implemented yet
|
||||
if let Some(cache_dir) = kwargs.get_item("cache_dir")? {
|
||||
let _cache_dir: String = cache_dir.extract()?;
|
||||
// Cache dir is not yet implemented in pdftract-core
|
||||
}
|
||||
|
||||
// Parse formats (list[str]) - Not implemented yet
|
||||
if let Some(formats) = kwargs.get_item("formats")? {
|
||||
let _formats: Vec<String> = formats.extract()?;
|
||||
// Output format selection is not yet implemented
|
||||
}
|
||||
}
|
||||
|
||||
Ok(opts)
|
||||
|
|
@ -237,4 +287,24 @@ mod tests {
|
|||
assert_eq!(opts.pages, Some("1-5,7,12-15".to_string()));
|
||||
});
|
||||
}
|
||||
|
||||
/// The string "lite" passed as `receipts` must parse to `ReceiptsMode::Lite`.
#[test]
fn test_parse_kwargs_receipts() {
    Python::with_gil(|py| {
        let py_kwargs = PyDict::new(py);
        py_kwargs.set_item("receipts", "lite").unwrap();

        let parsed = parse_kwargs(Some(py_kwargs)).unwrap();

        assert_eq!(parsed.receipts, ReceiptsMode::Lite);
    });
}
|
||||
|
||||
/// `full_render=True` must be carried through to `ExtractionOptions::full_render`.
#[test]
fn test_parse_kwargs_full_render() {
    Python::with_gil(|py| {
        let kwargs = PyDict::new(py);
        kwargs.set_item("full_render", true).unwrap();

        let opts = parse_kwargs(Some(kwargs)).unwrap();

        // Assert the boolean directly rather than comparing it with `true`.
        assert!(opts.full_render, "full_render=True should be propagated");
    });
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -404,7 +404,7 @@ fn attachment_to_py<'py>(py: Python<'py>, attachment: AttachmentJson) -> PyResul
|
|||
// ============================================================================
|
||||
|
||||
#[pymodule]
|
||||
fn pdftract(py: Python, m: &PyModule) -> PyResult<()> {
|
||||
fn _native(py: Python, m: &PyModule) -> PyResult<()> {
|
||||
// Add exception classes with proper Python inheritance
|
||||
m.add("PdftractError", py.get_type::<PdftractError>())?;
|
||||
m.add("EncryptionError", py.get_type::<EncryptionError>())?;
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ xref
|
|||
0000000302 00000 n
|
||||
0000000377 00000 n
|
||||
trailer<</Size 6/Root 1 0 R>>
|
||||
startxref 445
|
||||
startxref 360
|
||||
%%EOF
|
||||
EOF
|
||||
echo "Created base PDF: $BASE_PDF"
|
||||
|
|
|
|||
175
test_audit_integration.rs
Normal file
175
test_audit_integration.rs
Normal file
|
|
@ -0,0 +1,175 @@
|
|||
//! Integration test for audit logging.
|
||||
//!
|
||||
//! This test verifies that:
|
||||
//! 1. The --audit-log flag is accepted by serve, mcp, and inspect subcommands
|
||||
//! 2. The audit log writer creates valid NDJSON output
|
||||
//! 3. Log-policy enforcement redacts sensitive values
|
||||
//! 4. Stdio MCP mode omits client_ip field
|
||||
|
||||
use pdftract_core::audit::{AuditLogWriter, AuditRecord};
|
||||
use std::io::BufRead;
|
||||
use std::path::PathBuf;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Writes one fully populated record and checks that the log file contains
/// exactly one line of JSON carrying every field that was set.
#[test]
fn test_audit_log_creates_valid_ndjson() {
    let temp_dir = TempDir::new().unwrap();
    let audit_path = temp_dir.path().join("audit.ndjson");

    let writer = AuditLogWriter::open(&audit_path).unwrap();

    // Write a sample audit record
    let record = AuditRecord::new("extract", Some("pdftract-v1:abcd1234".to_string()), 1234, 200)
        .with_client_ip("10.0.0.1")
        .with_diagnostics(vec!["XREF_REPAIRED".to_string()]);
    writer.write_record(&record).unwrap();

    // Read back and verify
    let file = std::fs::File::open(&audit_path).unwrap();
    let lines: Vec<String> = std::io::BufReader::new(file)
        .lines()
        .map(|line| line.unwrap())
        .collect();
    assert_eq!(lines.len(), 1, "Should have exactly one line");

    let parsed: serde_json::Value = serde_json::from_str(&lines[0]).unwrap();
    assert_eq!(parsed["tool"], "extract");
    assert_eq!(parsed["fingerprint"], "pdftract-v1:abcd1234");
    assert_eq!(parsed["duration_ms"], 1234);
    assert_eq!(parsed["status"], 200);
    assert_eq!(parsed["client_ip"], "10.0.0.1");
    assert_eq!(parsed["diagnostics"].as_array().unwrap().len(), 1);
    assert_eq!(parsed["diagnostics"][0], "XREF_REPAIRED");

    // The timestamp must be a non-empty string; its format is not checked here.
    let ts = parsed["ts"].as_str().expect("ts field must be a string");
    assert!(!ts.is_empty(), "ts field must not be empty");
}
|
||||
|
||||
/// A record built without `with_client_ip` (the stdio MCP case) must serialize
/// with no `client_ip` key at all.
#[test]
fn test_audit_log_omit_client_ip_for_stdio() {
    let temp_dir = TempDir::new().unwrap();
    let audit_path = temp_dir.path().join("audit.ndjson");

    let writer = AuditLogWriter::open(&audit_path).unwrap();

    // Write a record without client_ip (stdio mode)
    let record = AuditRecord::new("mcp.extract", None, 500, 500);
    writer.write_record(&record).unwrap();

    // Read back and verify
    let file = std::fs::File::open(&audit_path).unwrap();
    let lines: Vec<String> = std::io::BufReader::new(file)
        .lines()
        .map(|line| line.unwrap())
        .collect();

    // Check the count first so a missing record fails with a clear message
    // instead of an index-out-of-bounds panic on `lines[0]`.
    assert_eq!(lines.len(), 1, "Should have exactly one line");

    let parsed: serde_json::Value = serde_json::from_str(&lines[0]).unwrap();

    // client_ip field should be absent for stdio mode
    assert!(
        parsed.get("client_ip").is_none(),
        "client_ip should be absent for stdio mode"
    );
}
|
||||
|
||||
/// Five consecutive writes through one writer must produce five lines.
#[test]
fn test_audit_log_appends_multiple_records() {
    let scratch = TempDir::new().unwrap();
    let log_path = scratch.path().join("audit.ndjson");
    let log = AuditLogWriter::open(&log_path).unwrap();

    // Each record gets a distinct fingerprint and duration derived from its index.
    for seq in 0..5 {
        let fingerprint = format!("pdftract-v1:{:x}", seq);
        let entry = AuditRecord::new("extract", Some(fingerprint), seq * 100, 200);
        log.write_record(&entry).unwrap();
    }

    // Count the lines that made it to disk.
    let handle = std::fs::File::open(&log_path).unwrap();
    let line_count = std::io::BufReader::new(handle)
        .lines()
        .map(|line| line.unwrap())
        .count();

    assert_eq!(line_count, 5, "Should have 5 lines");
}
|
||||
|
||||
/// `log_policy::redact_audit_log_line` must mask passwords, bearer tokens and
/// cookies while leaving an ordinary audit line untouched.
#[test]
fn test_audit_log_policy_enforcement_redacts_secrets() {
    use pdftract_core::log_policy;

    // (input line, secret substring that must not survive redaction)
    let sensitive_cases = [
        ("user:john password:secret123 action:extract", "secret123"),
        ("Authorization: Bearer abc123xyz456", "abc123xyz456"),
        ("Cookie: session_id=secret_value", "secret_value"),
    ];

    for (input, secret) in sensitive_cases.iter().copied() {
        let masked = log_policy::redact_audit_log_line(input);
        assert!(masked.contains("[REDACTED]"));
        assert!(!masked.contains(secret));
    }

    // A line without secrets must pass through with its content intact.
    let harmless = r#"{"tool":"extract","fingerprint":"pdftract-v1:abcd"}"#;
    let passthrough = log_policy::redact_audit_log_line(harmless);
    assert!(passthrough.contains("extract"));
    assert!(passthrough.contains("pdftract-v1:abcd"));
    assert!(!passthrough.contains("[REDACTED]"));
}
|
||||
|
||||
/// A fully populated `AuditRecord` must serialize with every field of the
/// expected JSON shape: four strings, two numbers and one array.
#[test]
fn test_audit_record_matches_plan_spec() {
    let diagnostics = vec!["XREF_REPAIRED".to_string()];
    let record = AuditRecord::new("extract", Some("pdftract-v1:abcd1234".to_string()), 1234, 200)
        .with_client_ip("10.0.0.1")
        .with_diagnostics(diagnostics);

    // Round-trip through the textual form, exactly as the log writer would emit it.
    let encoded = serde_json::to_string(&record).unwrap();
    let decoded: serde_json::Value = serde_json::from_str(&encoded).unwrap();

    let string_fields = [
        ("ts", "ts field must be present (ISO-8601 timestamp)"),
        ("client_ip", "client_ip field must be present"),
        ("tool", "tool field must be present"),
        ("fingerprint", "fingerprint field must be present"),
    ];
    for (key, message) in string_fields.iter().copied() {
        assert!(decoded[key].is_string(), "{}", message);
    }

    let numeric_fields = [
        ("duration_ms", "duration_ms field must be present"),
        ("status", "status field must be present (u16 HTTP-style)"),
    ];
    for (key, message) in numeric_fields.iter().copied() {
        assert!(decoded[key].is_number(), "{}", message);
    }

    assert!(
        decoded["diagnostics"].is_array(),
        "diagnostics field must be present (Vec<String>)"
    );
}
|
||||
|
||||
/// A record must be readable from disk right after `write_record` returns,
/// while the writer is still open, and must be newline-terminated.
#[test]
fn test_audit_log_writer_crash_safety() {
    let scratch = TempDir::new().unwrap();
    let log_path = scratch.path().join("audit.ndjson");
    let log = AuditLogWriter::open(&log_path).unwrap();

    let entry = AuditRecord::new("extract", Some("pdftract-v1:abcd".to_string()), 100, 200);
    log.write_record(&entry).unwrap();

    // Read the file straight away, without dropping or closing the writer first.
    let on_disk = std::fs::read_to_string(&log_path).unwrap();
    assert!(on_disk.contains("extract"), "Record should be flushed immediately");
    assert!(on_disk.ends_with('\n'), "Record should end with newline");
}
|
||||
|
||||
#[test]
fn test_audit_record_serialization_is_single_line() {
    // NDJSON needs one record per line, so the serialized form must not
    // contain any line-break characters.
    let diagnostics = vec![String::from("XREF_REPAIRED"), String::from("STREAM_BOMB")];
    let entry = AuditRecord::new("extract", Some(String::from("pdftract-v1:abcd")), 1234, 200)
        .with_diagnostics(diagnostics);

    let line = serde_json::to_string(&entry).unwrap();

    assert!(!line.contains('\n'), "Audit record should be single-line JSON");
    assert!(!line.contains('\r'), "Audit record should not contain carriage returns");

    // The line must still parse as JSON.
    serde_json::from_str::<serde_json::Value>(&line).unwrap();
}
|
||||
31
tests/document_model/fixtures/_temp_enc_rc4.pdf
Normal file
31
tests/document_model/fixtures/_temp_enc_rc4.pdf
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
%PDF-1.4
|
||||
1 0 obj
|
||||
<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<</Length 60>>stream
|
||||
BT /F1 12 Tf 100 700 Td (Test content for encrypted PDF) Tj ET
|
||||
endstream
|
||||
endobj
|
||||
3 0 obj
|
||||
<</Type/Page/MediaBox[0 0 612 792]/Contents 2 0 R/Resources<</Font<</F1 1 0 R>>>>/Parent 4 0 R>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<</Type/Pages/Count 1/Kids[3 0 R]>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<</Type/Catalog/Pages 4 0 R>>
|
||||
endobj
|
||||
xref
|
||||
0 6
|
||||
0000000000 65535 f
|
||||
0000000009 00000 n
|
||||
0000000062 00000 n
|
||||
0000000165 00000 n
|
||||
0000000294 00000 n
|
||||
0000000387 00000 n
|
||||
trailer
|
||||
<</Size 6/Root 5 0 R>>
|
||||
startxref
|
||||
0476
|
||||
%%EOF
|
||||
Binary file not shown.
Binary file not shown.
|
|
@ -6,639 +6,11 @@
|
|||
//! - All encrypted fixtures use user password "test" (NOT secret - these are test fixtures)
|
||||
//! - Owner password is empty string for all encrypted fixtures
|
||||
|
||||
use lopdf::{Dictionary, Object, Stream, Document, StringFormat};
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
use std::process::Command;
|
||||
|
||||
fn create_minimal_page(content: &str) -> (Dictionary, Object) {
|
||||
let mut page_dict = Dictionary::new();
|
||||
page_dict.set(b"Type", "Page");
|
||||
page_dict.set(b"MediaBox", Object::Array(vec![
|
||||
Object::Real(0.0), Object::Real(0.0),
|
||||
Object::Real(612.0), Object::Real(792.0)
|
||||
]));
|
||||
|
||||
let mut font_dict = Dictionary::new();
|
||||
font_dict.set(b"Type", "Font");
|
||||
font_dict.set(b"Subtype", "Type1");
|
||||
font_dict.set(b"BaseFont", "Helvetica");
|
||||
|
||||
let mut resources = Dictionary::new();
|
||||
let mut fonts = Dictionary::new();
|
||||
fonts.set(b"F1", Object::Dictionary(font_dict));
|
||||
resources.set(b"Font", Object::Dictionary(fonts));
|
||||
page_dict.set(b"Resources", Object::Dictionary(resources));
|
||||
|
||||
let content_bytes = format!("BT\n/F1 12 Tf\n100 700 Td\n({}) Tj\nET\n", content);
|
||||
let mut stream_dict = Dictionary::new();
|
||||
stream_dict.set(b"Length", Object::Integer(content_bytes.len() as i64));
|
||||
let content_stream = Stream::new(stream_dict, content_bytes.as_bytes().to_vec());
|
||||
|
||||
(page_dict, Object::Stream(content_stream))
|
||||
}
|
||||
|
||||
fn create_simple_base_pdf() -> Document {
|
||||
let mut doc = Document::with_version("1.4");
|
||||
|
||||
let (page1_dict, content1) = create_minimal_page("Page 1");
|
||||
let (page2_dict, content2) = create_minimal_page("Page 2");
|
||||
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(2 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![
|
||||
Object::Reference((1, 0).into()),
|
||||
Object::Reference((2, 0).into())
|
||||
]));
|
||||
|
||||
let mut page1_dict = page1_dict;
|
||||
page1_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page1_dict.set(b"Contents", Object::Reference((3, 0).into()));
|
||||
|
||||
let mut page2_dict = page2_dict;
|
||||
page2_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page2_dict.set(b"Contents", Object::Reference((4, 0).into()));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((1, 0).into(), Object::Dictionary(page1_dict));
|
||||
doc.objects.insert((2, 0).into(), Object::Dictionary(page2_dict));
|
||||
doc.objects.insert((3, 0).into(), content1);
|
||||
doc.objects.insert((4, 0).into(), content2);
|
||||
doc.objects.insert((5, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((5, 0)));
|
||||
|
||||
let id = b"test-pdf-id-12345\0\0\0\0\0\0\0\0\0\0\0\0";
|
||||
doc.trailer.set(b"ID", Object::Array(vec![
|
||||
Object::String(id.to_vec(), StringFormat::Literal),
|
||||
Object::String(id.to_vec(), StringFormat::Literal),
|
||||
]));
|
||||
|
||||
doc
|
||||
}
|
||||
|
||||
fn save_pdf(doc: &mut Document, filename: &str) {
|
||||
let mut buffer = Vec::new();
|
||||
doc.save_to(&mut buffer).unwrap();
|
||||
let mut file = File::create(filename).unwrap();
|
||||
file.write_all(&buffer).unwrap();
|
||||
}
|
||||
|
||||
/// Encrypts `input` into `output` with qpdf, user password `test` and an
/// empty owner password.
///
/// `r_value` is the security-handler revision wanted in the result:
/// `"2"` = RC4-40, `"3"` = RC4-128, `"4"` = AES-128, `"6"` = AES-256.
/// qpdf's positional `--encrypt` syntax takes a key length, not a revision,
/// so the revision is translated here.
///
/// Failures are reported on stderr and never panic. If qpdf cannot be
/// started at all, `input` is copied to `output` unencrypted as a placeholder.
fn encrypt_pdf(input: &str, output: &str, r_value: &str) {
    let mut args: Vec<&str> = Vec::new();

    // Translate the revision into qpdf's key length plus any flag needed to
    // reach that revision.
    let (key_length, encrypt_flag) = match r_value {
        // RC4 is rejected by current qpdf unless weak crypto is allowed.
        "2" => {
            args.push("--allow-weak-crypto");
            ("40", None)
        }
        "3" => {
            args.push("--allow-weak-crypto");
            ("128", None)
        }
        // 128-bit keys default to RC4; AES must be requested explicitly.
        "4" => ("128", Some("--use-aes=y")),
        // A 256-bit key with a user password but an empty owner password is
        // refused unless explicitly allowed.
        "6" => ("256", Some("--allow-insecure")),
        _ => {
            eprintln!("qpdf failed: unsupported encryption revision R={}", r_value);
            eprintln!("Copy {} manually and encrypt with qpdf", input);
            return;
        }
    };

    args.extend_from_slice(&["--encrypt", "test", "", key_length]);
    args.extend(encrypt_flag);
    args.extend_from_slice(&["--", input, output]);

    match Command::new("qpdf").args(&args).output() {
        Ok(run) if run.status.success() => {
            println!("Created {} (encrypted with R={}, password: 'test')", output, r_value);
        }
        Ok(run) => {
            eprintln!("qpdf failed: {}", String::from_utf8_lossy(&run.stderr));
            eprintln!("Copy {} manually and encrypt with qpdf", input);
        }
        Err(e) => {
            eprintln!("qpdf not found: {}. Copy {} manually and encrypt", e, input);
            // Copy the unencrypted version as fallback.
            let _ = std::fs::copy(input, output);
        }
    }
}
|
||||
|
||||
fn create_encrypted_rc4_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/_temp_rc4.pdf");
|
||||
encrypt_pdf("tests/document_model/fixtures/_temp_rc4.pdf",
|
||||
"tests/document_model/fixtures/encrypted_rc4_test.pdf", "2");
|
||||
let _ = std::fs::remove_file("tests/document_model/fixtures/_temp_rc4.pdf");
|
||||
}
|
||||
|
||||
fn create_encrypted_aes128_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/_temp_aes128.pdf");
|
||||
encrypt_pdf("tests/document_model/fixtures/_temp_aes128.pdf",
|
||||
"tests/document_model/fixtures/encrypted_aes128_test.pdf", "4");
|
||||
let _ = std::fs::remove_file("tests/document_model/fixtures/_temp_aes128.pdf");
|
||||
}
|
||||
|
||||
fn create_encrypted_aes256_pdf() {
|
||||
let mut doc = Document::with_version("2.0");
|
||||
let (page1_dict, content1) = create_minimal_page("Page 1");
|
||||
let (page2_dict, content2) = create_minimal_page("Page 2");
|
||||
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(2 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![
|
||||
Object::Reference((1, 0).into()),
|
||||
Object::Reference((2, 0).into())
|
||||
]));
|
||||
|
||||
let mut page1_dict = page1_dict;
|
||||
page1_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page1_dict.set(b"Contents", Object::Reference((3, 0).into()));
|
||||
|
||||
let mut page2_dict = page2_dict;
|
||||
page2_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page2_dict.set(b"Contents", Object::Reference((4, 0).into()));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((1, 0).into(), Object::Dictionary(page1_dict));
|
||||
doc.objects.insert((2, 0).into(), Object::Dictionary(page2_dict));
|
||||
doc.objects.insert((3, 0).into(), content1);
|
||||
doc.objects.insert((4, 0).into(), content2);
|
||||
doc.objects.insert((5, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((5, 0)));
|
||||
|
||||
let id = b"test-pdf-id-12345\0\0\0\0\0\0\0\0\0\0\0\0";
|
||||
doc.trailer.set(b"ID", Object::Array(vec![
|
||||
Object::String(id.to_vec(), StringFormat::Literal),
|
||||
Object::String(id.to_vec(), StringFormat::Literal),
|
||||
]));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/_temp_aes256.pdf");
|
||||
encrypt_pdf("tests/document_model/fixtures/_temp_aes256.pdf",
|
||||
"tests/document_model/fixtures/encrypted_aes256_test.pdf", "6");
|
||||
let _ = std::fs::remove_file("tests/document_model/fixtures/_temp_aes256.pdf");
|
||||
}
|
||||
|
||||
fn create_encrypted_empty_password_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/_temp_empty.pdf");
|
||||
// Empty password uses same command - qpdf treats empty owner password as ""
|
||||
encrypt_pdf("tests/document_model/fixtures/_temp_empty.pdf",
|
||||
"tests/document_model/fixtures/encrypted_empty_password.pdf", "2");
|
||||
let _ = std::fs::remove_file("tests/document_model/fixtures/_temp_empty.pdf");
|
||||
}
|
||||
|
||||
fn create_encrypted_unknown_handler_pdf() {
|
||||
// For unsupported handler, create a simple PDF with a fake /Encrypt dict
|
||||
let mut doc = create_simple_base_pdf();
|
||||
|
||||
// Get the PDF data
|
||||
let mut buffer = Vec::new();
|
||||
doc.save_to(&mut buffer).unwrap();
|
||||
let pdf_str = String::from_utf8_lossy(&buffer);
|
||||
|
||||
// Insert a custom encryption dict before the xref table
|
||||
let encrypt_dict = "1 0 obj\n<</Filter/Adobe.PubSec/V 2/R 2/Length 40/O(\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00)\n/U(\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00)\\nP -604>>\nendobj\n";
|
||||
|
||||
// Find the trailer
|
||||
let trailer_pos = pdf_str.find("trailer").unwrap_or(pdf_str.len());
|
||||
let mut result = pdf_str.to_string();
|
||||
result.insert_str(trailer_pos, encrypt_dict);
|
||||
result = result.replace("1 0 obj", "2 0 obj"); // Shift object numbers
|
||||
|
||||
// Add Encrypt reference to trailer
|
||||
result = result.replace("trailer\n<<", "trailer\n<</Encrypt 1 0 R");
|
||||
|
||||
let mut file = File::create("tests/document_model/fixtures/encrypted_unknown_handler.pdf").unwrap();
|
||||
file.write_all(result.as_bytes()).unwrap();
|
||||
println!("Created encrypted_unknown_handler.pdf (unsupported Adobe.PubSec handler)");
|
||||
}
|
||||
|
||||
fn create_tagged_3_level_outline_pdf() {
|
||||
let mut doc = Document::with_version("1.4");
|
||||
|
||||
let (page1_dict, content1) = create_minimal_page("Chapter 1");
|
||||
let (page2_dict, content2) = create_minimal_page("Section 1.1");
|
||||
let (page3_dict, content3) = create_minimal_page("Subsection 1.1.1");
|
||||
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(3 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![
|
||||
Object::Reference((1, 0).into()),
|
||||
Object::Reference((2, 0).into()),
|
||||
Object::Reference((3, 0).into())
|
||||
]));
|
||||
|
||||
let mut page1_dict = page1_dict;
|
||||
page1_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page1_dict.set(b"Contents", Object::Reference((7, 0).into()));
|
||||
|
||||
let mut page2_dict = page2_dict;
|
||||
page2_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page2_dict.set(b"Contents", Object::Reference((8, 0).into()));
|
||||
|
||||
let mut page3_dict = page3_dict;
|
||||
page3_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page3_dict.set(b"Contents", Object::Reference((9, 0).into()));
|
||||
|
||||
// Create outline hierarchy (3 levels)
|
||||
let mut outline1 = Dictionary::new();
|
||||
outline1.set(b"Title", Object::String(b"Chapter 1".to_vec(), StringFormat::Literal));
|
||||
outline1.set(b"Parent", Object::Reference((10, 0).into()));
|
||||
outline1.set(b"Dest", Object::Array(vec![
|
||||
Object::Reference((1, 0).into()),
|
||||
Object::Name(b"Fit".to_vec())
|
||||
]));
|
||||
|
||||
let mut outline2 = Dictionary::new();
|
||||
outline2.set(b"Title", Object::String(b"Section 1.1".to_vec(), StringFormat::Literal));
|
||||
outline2.set(b"Parent", Object::Reference((10, 0).into()));
|
||||
outline2.set(b"Prev", Object::Reference((11, 0).into()));
|
||||
outline2.set(b"Dest", Object::Array(vec![
|
||||
Object::Reference((2, 0).into()),
|
||||
Object::Name(b"Fit".to_vec())
|
||||
]));
|
||||
|
||||
let mut outline3 = Dictionary::new();
|
||||
outline3.set(b"Title", Object::String(b"Subsection 1.1.1".to_vec(), StringFormat::Literal));
|
||||
outline3.set(b"Parent", Object::Reference((10, 0).into()));
|
||||
outline3.set(b"Prev", Object::Reference((12, 0).into()));
|
||||
outline3.set(b"Dest", Object::Array(vec![
|
||||
Object::Reference((3, 0).into()),
|
||||
Object::Name(b"Fit".to_vec())
|
||||
]));
|
||||
|
||||
let mut outlines = Dictionary::new();
|
||||
outlines.set(b"Type", "Outlines");
|
||||
outlines.set(b"Count", Object::Integer(3 as i64));
|
||||
outlines.set(b"First", Object::Reference((11, 0).into()));
|
||||
outlines.set(b"Last", Object::Reference((13, 0).into()));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
catalog_dict.set(b"Outlines", Object::Reference((10, 0).into()));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((1, 0).into(), Object::Dictionary(page1_dict));
|
||||
doc.objects.insert((2, 0).into(), Object::Dictionary(page2_dict));
|
||||
doc.objects.insert((3, 0).into(), Object::Dictionary(page3_dict));
|
||||
doc.objects.insert((7, 0).into(), content1);
|
||||
doc.objects.insert((8, 0).into(), content2);
|
||||
doc.objects.insert((9, 0).into(), content3);
|
||||
doc.objects.insert((10, 0).into(), Object::Dictionary(outlines));
|
||||
doc.objects.insert((11, 0).into(), Object::Dictionary(outline1));
|
||||
doc.objects.insert((12, 0).into(), Object::Dictionary(outline2));
|
||||
doc.objects.insert((13, 0).into(), Object::Dictionary(outline3));
|
||||
doc.objects.insert((14, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((14, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/tagged_3_level_outline.pdf");
|
||||
println!("Created tagged_3_level_outline.pdf (3-level outline hierarchy)");
|
||||
}
|
||||
|
||||
fn create_ocg_default_off_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
|
||||
// Create OCG (Optional Content Group)
|
||||
let mut ocg_dict = Dictionary::new();
|
||||
ocg_dict.set(b"Type", "OCG");
|
||||
ocg_dict.set(b"Name", Object::String(b"Test Layer".to_vec(), StringFormat::Literal));
|
||||
|
||||
// Create /OCProperties with /D /BaseState /OFF
|
||||
let mut default_config = Dictionary::new();
|
||||
default_config.set(b"BaseState", Object::Name(b"OFF".to_vec()));
|
||||
default_config.set(b"ON", Object::Array(vec![]));
|
||||
|
||||
let mut oc_properties = Dictionary::new();
|
||||
oc_properties.set(b"OCGs", Object::Array(vec![Object::Reference((6, 0).into())]));
|
||||
oc_properties.set(b"D", Object::Reference((7, 0).into()));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
catalog_dict.set(b"OCProperties", Object::Reference((8, 0).into()));
|
||||
|
||||
doc.objects.insert((6, 0).into(), Object::Dictionary(ocg_dict));
|
||||
doc.objects.insert((7, 0).into(), Object::Dictionary(default_config));
|
||||
doc.objects.insert((8, 0).into(), Object::Dictionary(oc_properties));
|
||||
doc.objects.insert((5, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((5, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/ocg_default_off.pdf");
|
||||
println!("Created ocg_default_off.pdf (OCG with /BaseState /OFF)");
|
||||
}
|
||||
|
||||
fn create_multi_revision_3_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/multi_revision_3.pdf");
|
||||
println!("Created multi_revision_3.pdf (normal PDF - for true multi-revision, use qpdf --linearize)");
|
||||
}
|
||||
|
||||
fn create_inheritance_grandparent_mediabox_pdf() {
|
||||
let mut doc = Document::with_version("1.4");
|
||||
|
||||
// Create a 3-level /Pages tree where MediaBox is only on the grandparent
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(2 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![Object::Reference((10, 0).into())]));
|
||||
pages_dict.set(b"MediaBox", Object::Array(vec![
|
||||
Object::Real(0.0), Object::Real(0.0),
|
||||
Object::Real(612.0), Object::Real(792.0)
|
||||
]));
|
||||
|
||||
let mut parent_pages = Dictionary::new();
|
||||
parent_pages.set(b"Type", "Pages");
|
||||
parent_pages.set(b"Count", Object::Integer(2 as i64));
|
||||
parent_pages.set(b"Kids", Object::Array(vec![
|
||||
Object::Reference((1, 0).into()),
|
||||
Object::Reference((2, 0).into())
|
||||
]));
|
||||
|
||||
let (page1_dict, content1) = create_minimal_page("Page 1");
|
||||
let mut page1_dict = page1_dict;
|
||||
page1_dict.set(b"Parent", Object::Reference((10, 0).into()));
|
||||
page1_dict.set(b"Contents", Object::Reference((11, 0).into()));
|
||||
page1_dict.remove(b"MediaBox"); // No MediaBox - inherits
|
||||
|
||||
let (page2_dict, content2) = create_minimal_page("Page 2");
|
||||
let mut page2_dict = page2_dict;
|
||||
page2_dict.set(b"Parent", Object::Reference((10, 0).into()));
|
||||
page2_dict.set(b"Contents", Object::Reference((12, 0).into()));
|
||||
page2_dict.remove(b"MediaBox"); // No MediaBox - inherits
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((10, 0).into(), Object::Dictionary(parent_pages));
|
||||
doc.objects.insert((1, 0).into(), Object::Dictionary(page1_dict));
|
||||
doc.objects.insert((2, 0).into(), Object::Dictionary(page2_dict));
|
||||
doc.objects.insert((11, 0).into(), content1);
|
||||
doc.objects.insert((12, 0).into(), content2);
|
||||
doc.objects.insert((13, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((13, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/inheritance_grandparent_mediabox.pdf");
|
||||
println!("Created inheritance_grandparent_mediabox.pdf (MediaBox from grandparent)");
|
||||
}
|
||||
|
||||
fn create_missing_mediabox_pdf() {
|
||||
let mut doc = Document::with_version("1.4");
|
||||
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(1 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![Object::Reference((1, 0).into())]));
|
||||
|
||||
let mut page_dict = Dictionary::new();
|
||||
page_dict.set(b"Type", "Page");
|
||||
page_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
// No MediaBox - should trigger DEFAULT_MEDIABOX
|
||||
|
||||
let content_bytes = b"BT\n/F1 12 Tf\n100 700 Td\n(No MediaBox) Tj\nET\n";
|
||||
let mut stream_dict = Dictionary::new();
|
||||
stream_dict.set(b"Length", Object::Integer(content_bytes.len() as i64));
|
||||
let content_stream = Stream::new(stream_dict, content_bytes.to_vec());
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((1, 0).into(), Object::Dictionary(page_dict));
|
||||
doc.objects.insert((2, 0).into(), Object::Stream(content_stream));
|
||||
doc.objects.insert((3, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((3, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/missing_mediabox.pdf");
|
||||
println!("Created missing_mediabox.pdf (no MediaBox, defaults to US Letter)");
|
||||
}
|
||||
|
||||
fn create_partial_resource_override_pdf() {
|
||||
let mut doc = Document::with_version("1.4");
|
||||
|
||||
let mut root_resources = Dictionary::new();
|
||||
let mut root_fonts = Dictionary::new();
|
||||
root_fonts.set(b"F1", Object::Reference((4, 0).into()));
|
||||
root_fonts.set(b"F2", Object::Reference((5, 0).into()));
|
||||
let mut root_xobject = Dictionary::new();
|
||||
root_xobject.set(b"Im1", Object::Reference((6, 0).into()));
|
||||
root_resources.set(b"Font", Object::Dictionary(root_fonts));
|
||||
root_resources.set(b"XObject", Object::Dictionary(root_xobject));
|
||||
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(1 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![Object::Reference((1, 0).into())]));
|
||||
pages_dict.set(b"Resources", Object::Reference((10, 0).into()));
|
||||
|
||||
// Page overrides /Font but not /XObject
|
||||
let mut page_resources = Dictionary::new();
|
||||
let mut page_fonts = Dictionary::new();
|
||||
page_fonts.set(b"F1", Object::Reference((7, 0).into())); // Override F1
|
||||
page_fonts.set(b"F3", Object::Reference((8, 0).into())); // Add new font
|
||||
page_resources.set(b"Font", Object::Dictionary(page_fonts));
|
||||
// No /XObject - should inherit Im1 from parent
|
||||
|
||||
let (mut page_dict, content) = create_minimal_page("Partial Override");
|
||||
page_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page_dict.set(b"Contents", Object::Reference((11, 0).into()));
|
||||
page_dict.set(b"Resources", Object::Dictionary(page_resources));
|
||||
|
||||
let mut font1 = Dictionary::new();
|
||||
font1.set(b"Type", "Font");
|
||||
font1.set(b"Subtype", "Type1");
|
||||
font1.set(b"BaseFont", "Helvetica");
|
||||
|
||||
let mut font2 = Dictionary::new();
|
||||
font2.set(b"Type", "Font");
|
||||
font2.set(b"Subtype", "Type1");
|
||||
font2.set(b"BaseFont", "Times-Roman");
|
||||
|
||||
let mut font3 = Dictionary::new();
|
||||
font3.set(b"Type", "Font");
|
||||
font3.set(b"Subtype", "Type1");
|
||||
font3.set(b"BaseFont", "Courier");
|
||||
|
||||
let mut image = Dictionary::new();
|
||||
image.set(b"Type", "XObject");
|
||||
image.set(b"Subtype", "Image");
|
||||
image.set(b"Width", Object::Integer(100 as i64));
|
||||
image.set(b"Height", Object::Integer(100 as i64));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((1, 0).into(), Object::Dictionary(page_dict));
|
||||
doc.objects.insert((4, 0).into(), Object::Dictionary(font1.clone()));
|
||||
doc.objects.insert((5, 0).into(), Object::Dictionary(font2));
|
||||
doc.objects.insert((6, 0).into(), Object::Dictionary(image));
|
||||
doc.objects.insert((7, 0).into(), Object::Dictionary(font1)); // Overridden F1
|
||||
doc.objects.insert((8, 0).into(), Object::Dictionary(font3));
|
||||
doc.objects.insert((10, 0).into(), Object::Dictionary(root_resources));
|
||||
doc.objects.insert((11, 0).into(), content);
|
||||
doc.objects.insert((12, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((12, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/partial_resource_override.pdf");
|
||||
println!("Created partial_resource_override.pdf (partial /Resources override)");
|
||||
}
|
||||
|
||||
fn create_js_in_openaction_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
|
||||
let mut open_action = Dictionary::new();
|
||||
open_action.set(b"S", "JavaScript");
|
||||
open_action.set(b"JS", Object::String(b"app.alert('Hello from PDF!');".to_vec(), StringFormat::Literal));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
catalog_dict.set(b"OpenAction", Object::Reference((6, 0).into()));
|
||||
|
||||
doc.objects.insert((6, 0).into(), Object::Dictionary(open_action));
|
||||
doc.objects.insert((7, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((7, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/js_in_openaction.pdf");
|
||||
println!("Created js_in_openaction.pdf (/OpenAction /S /JavaScript)");
|
||||
}
|
||||
|
||||
fn create_xfa_form_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
|
||||
let mut acroform = Dictionary::new();
|
||||
acroform.set(b"XFA", Object::String(b"template".to_vec(), StringFormat::Literal));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
catalog_dict.set(b"AcroForm", Object::Reference((6, 0).into()));
|
||||
|
||||
doc.objects.insert((6, 0).into(), Object::Dictionary(acroform));
|
||||
doc.objects.insert((7, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((7, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/xfa_form.pdf");
|
||||
println!("Created xfa_form.pdf (/AcroForm /XFA present)");
|
||||
}
|
||||
|
||||
fn create_pdfa_1b_conformance_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
|
||||
let xmp_metadata = r#"<?xpacket begin="?" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP Core 5.6-c140 79.160451">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""
|
||||
xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/">
|
||||
<pdfaid:part>1</pdfaid:part>
|
||||
<pdfaid:conformance>B</pdfaid:conformance>
|
||||
</rdf:Description>
|
||||
</rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
<?xpacket end="w"?>"#;
|
||||
|
||||
let mut metadata_dict = Dictionary::new();
|
||||
metadata_dict.set(b"Type", "Metadata");
|
||||
metadata_dict.set(b"Subtype", "XML");
|
||||
let metadata_stream = Stream::new(metadata_dict, xmp_metadata.as_bytes().to_vec());
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
catalog_dict.set(b"Metadata", Object::Reference((6, 0).into()));
|
||||
|
||||
doc.objects.insert((6, 0).into(), Object::Stream(metadata_stream));
|
||||
doc.objects.insert((7, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((7, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/pdfa_1b_conformance.pdf");
|
||||
println!("Created pdfa_1b_conformance.pdf (XMP PDF/A-1B metadata)");
|
||||
}
|
||||
|
||||
fn create_page_labels_roman_arabic_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
|
||||
// Add page 3 and 4
|
||||
let (page3_dict, content3) = create_minimal_page("Page 3");
|
||||
let (page4_dict, content4) = create_minimal_page("Page 4");
|
||||
let mut page3_dict = page3_dict;
|
||||
page3_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page3_dict.set(b"Contents", Object::Reference((8, 0).into()));
|
||||
let mut page4_dict = page4_dict;
|
||||
page4_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page4_dict.set(b"Contents", Object::Reference((9, 0).into()));
|
||||
|
||||
// Add /PageLabels number tree
|
||||
// Pages 0-3: roman numerals (i, ii, iii, iv)
|
||||
// Pages 4+: arabic (1, 2, 3, ...)
|
||||
let mut page_labels = Dictionary::new();
|
||||
page_labels.set(b"Nums", Object::Array(vec![
|
||||
Object::Integer(0 as i64),
|
||||
Object::Dictionary({
|
||||
let mut d = Dictionary::new();
|
||||
d.set(b"S", "r");
|
||||
d.set(b"St", Object::Integer(1 as i64));
|
||||
d
|
||||
}),
|
||||
Object::Integer(4 as i64),
|
||||
Object::Dictionary({
|
||||
let mut d = Dictionary::new();
|
||||
d.set(b"S", "D");
|
||||
d.set(b"St", Object::Integer(1 as i64));
|
||||
d
|
||||
})
|
||||
]));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
catalog_dict.set(b"PageLabels", Object::Reference((10, 0).into()));
|
||||
|
||||
// Update pages count to 4
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(4 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![
|
||||
Object::Reference((1, 0).into()),
|
||||
Object::Reference((2, 0).into()),
|
||||
Object::Reference((3, 0).into()),
|
||||
Object::Reference((4, 0).into())
|
||||
]));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((3, 0).into(), Object::Dictionary(page3_dict));
|
||||
doc.objects.insert((4, 0).into(), Object::Dictionary(page4_dict));
|
||||
doc.objects.insert((8, 0).into(), content3);
|
||||
doc.objects.insert((9, 0).into(), content4);
|
||||
doc.objects.insert((10, 0).into(), Object::Dictionary(page_labels));
|
||||
doc.objects.insert((11, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((11, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/page_labels_roman_arabic.pdf");
|
||||
println!("Created page_labels_roman_arabic.pdf (roman 0-3, arabic 4+)");
|
||||
}
|
||||
// NOTE: This fixture generator is disabled - lopdf is no longer a dependency.
|
||||
// Use existing fixture files or regenerate with a different tool.
|
||||
|
||||
fn main() {
|
||||
println!("Generating document-model test fixtures...");
|
||||
|
||||
create_encrypted_rc4_pdf();
|
||||
create_encrypted_aes128_pdf();
|
||||
create_encrypted_aes256_pdf();
|
||||
create_encrypted_empty_password_pdf();
|
||||
create_encrypted_unknown_handler_pdf();
|
||||
create_tagged_3_level_outline_pdf();
|
||||
create_ocg_default_off_pdf();
|
||||
create_multi_revision_3_pdf();
|
||||
create_inheritance_grandparent_mediabox_pdf();
|
||||
create_missing_mediabox_pdf();
|
||||
create_partial_resource_override_pdf();
|
||||
create_js_in_openaction_pdf();
|
||||
create_xfa_form_pdf();
|
||||
create_pdfa_1b_conformance_pdf();
|
||||
create_page_labels_roman_arabic_pdf();
|
||||
|
||||
println!("\nAll 15 document-model fixtures generated successfully!");
|
||||
println!("\nNote: Encrypted fixtures require qpdf to be installed.");
|
||||
println!("If qpdf is not available, encrypted fixtures will be unencrypted placeholders.");
|
||||
eprintln!("Fixture generator is disabled - lopdf is no longer a dependency.");
|
||||
eprintln!("Use existing fixture files in tests/document_model/fixtures/");
|
||||
std::process::exit(0);
|
||||
}
|
||||
|
|
|
|||
653
tests/document_model/fixtures/generate_fixtures.rs.disabled
Normal file
653
tests/document_model/fixtures/generate_fixtures.rs.disabled
Normal file
|
|
@ -0,0 +1,653 @@
|
|||
//! Generate document-model test fixtures.
|
||||
//!
|
||||
//! This program creates 15 PDF test fixtures for document model integration tests.
|
||||
//!
|
||||
//! FIXTURE PASSWORDS:
|
||||
//! - All encrypted fixtures use user password "test" (NOT secret - these are test fixtures)
|
||||
//! - Owner password is empty string for all encrypted fixtures
|
||||
|
||||
// NOTE: lopdf is no longer a dependency. This fixture generator is disabled.
|
||||
// Use existing fixture files or regenerate with a different tool.
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
use std::process::Command;
|
||||
|
||||
// Stub types to allow compilation
|
||||
type Dictionary = ();
|
||||
type Object = ();
|
||||
type Stream = ();
|
||||
type Document = ();
|
||||
struct StringFormat;
|
||||
|
||||
fn create_minimal_page(content: &str) -> (Dictionary, Object) {
|
||||
let mut page_dict = Dictionary::new();
|
||||
page_dict.set(b"Type", "Page");
|
||||
page_dict.set(b"MediaBox", Object::Array(vec![
|
||||
Object::Real(0.0), Object::Real(0.0),
|
||||
Object::Real(612.0), Object::Real(792.0)
|
||||
]));
|
||||
|
||||
let mut font_dict = Dictionary::new();
|
||||
font_dict.set(b"Type", "Font");
|
||||
font_dict.set(b"Subtype", "Type1");
|
||||
font_dict.set(b"BaseFont", "Helvetica");
|
||||
|
||||
let mut resources = Dictionary::new();
|
||||
let mut fonts = Dictionary::new();
|
||||
fonts.set(b"F1", Object::Dictionary(font_dict));
|
||||
resources.set(b"Font", Object::Dictionary(fonts));
|
||||
page_dict.set(b"Resources", Object::Dictionary(resources));
|
||||
|
||||
let content_bytes = format!("BT\n/F1 12 Tf\n100 700 Td\n({}) Tj\nET\n", content);
|
||||
let mut stream_dict = Dictionary::new();
|
||||
stream_dict.set(b"Length", Object::Integer(content_bytes.len() as i64));
|
||||
let content_stream = Stream::new(stream_dict, content_bytes.as_bytes().to_vec());
|
||||
|
||||
(page_dict, Object::Stream(content_stream))
|
||||
}
|
||||
|
||||
fn create_simple_base_pdf() -> Document {
|
||||
let mut doc = Document::with_version("1.4");
|
||||
|
||||
let (page1_dict, content1) = create_minimal_page("Page 1");
|
||||
let (page2_dict, content2) = create_minimal_page("Page 2");
|
||||
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(2 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![
|
||||
Object::Reference((1, 0).into()),
|
||||
Object::Reference((2, 0).into())
|
||||
]));
|
||||
|
||||
let mut page1_dict = page1_dict;
|
||||
page1_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page1_dict.set(b"Contents", Object::Reference((3, 0).into()));
|
||||
|
||||
let mut page2_dict = page2_dict;
|
||||
page2_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page2_dict.set(b"Contents", Object::Reference((4, 0).into()));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((1, 0).into(), Object::Dictionary(page1_dict));
|
||||
doc.objects.insert((2, 0).into(), Object::Dictionary(page2_dict));
|
||||
doc.objects.insert((3, 0).into(), content1);
|
||||
doc.objects.insert((4, 0).into(), content2);
|
||||
doc.objects.insert((5, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((5, 0)));
|
||||
|
||||
let id = b"test-pdf-id-12345\0\0\0\0\0\0\0\0\0\0\0\0";
|
||||
doc.trailer.set(b"ID", Object::Array(vec![
|
||||
Object::String(id.to_vec(), StringFormat::Literal),
|
||||
Object::String(id.to_vec(), StringFormat::Literal),
|
||||
]));
|
||||
|
||||
doc
|
||||
}
|
||||
|
||||
fn save_pdf(doc: &mut Document, filename: &str) {
|
||||
let mut buffer = Vec::new();
|
||||
doc.save_to(&mut buffer).unwrap();
|
||||
let mut file = File::create(filename).unwrap();
|
||||
file.write_all(&buffer).unwrap();
|
||||
}
|
||||
|
||||
/// Encrypts `input` into `output` with qpdf, using user password `test` and an
/// empty owner password.
///
/// `r_value` is the PDF security-handler revision the fixture should use:
/// `"2"` (RC4-40), `"3"` (RC4-128), `"4"` (AES-128) or `"6"` (AES-256). qpdf
/// does not take a revision on the command line; its third `--encrypt`
/// argument is the key length (40, 128 or 256), so the revision is translated
/// here. Any other value is forwarded to qpdf unchanged as the key length.
///
/// If qpdf is missing or reports failure, the unencrypted input is copied to
/// `output` as a placeholder so that the fixture file still exists after the
/// caller removes its temporary input.
fn encrypt_pdf(input: &str, output: &str, r_value: &str) {
    let (key_length, use_aes) = match r_value {
        "2" => ("40", false),
        "3" => ("128", false),
        "4" => ("128", true),
        "6" => ("256", false),
        other => (other, false),
    };

    let mut args = vec!["--encrypt", "test", "", key_length];
    if use_aes {
        // 128-bit keys default to RC4; AES has to be requested explicitly.
        args.push("--use-aes=y");
    }
    args.push("--");
    args.push(input);
    args.push(output);
    // NOTE(review): recent qpdf releases may additionally require
    // `--allow-weak-crypto` (RC4) or `--allow-insecure` (256-bit key with an
    // empty owner password); confirm against the installed qpdf version.

    let encrypted = match Command::new("qpdf").args(&args).output() {
        Ok(result) if result.status.success() => {
            println!("Created {} (encrypted with R={}, password: 'test')", output, r_value);
            true
        }
        Ok(result) => {
            eprintln!("qpdf failed: {}", String::from_utf8_lossy(&result.stderr));
            eprintln!("Copy {} manually and encrypt with qpdf", input);
            false
        }
        Err(e) => {
            eprintln!("qpdf not found: {}. Copy {} manually and encrypt", e, input);
            false
        }
    };

    if !encrypted {
        // Fall back to an unencrypted placeholder so the fixture path exists.
        if let Err(e) = std::fs::copy(input, output) {
            eprintln!("Could not copy {} to {}: {}", input, output, e);
        }
    }
}
|
||||
|
||||
fn create_encrypted_rc4_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/_temp_rc4.pdf");
|
||||
encrypt_pdf("tests/document_model/fixtures/_temp_rc4.pdf",
|
||||
"tests/document_model/fixtures/encrypted_rc4_test.pdf", "2");
|
||||
let _ = std::fs::remove_file("tests/document_model/fixtures/_temp_rc4.pdf");
|
||||
}
|
||||
|
||||
fn create_encrypted_aes128_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/_temp_aes128.pdf");
|
||||
encrypt_pdf("tests/document_model/fixtures/_temp_aes128.pdf",
|
||||
"tests/document_model/fixtures/encrypted_aes128_test.pdf", "4");
|
||||
let _ = std::fs::remove_file("tests/document_model/fixtures/_temp_aes128.pdf");
|
||||
}
|
||||
|
||||
fn create_encrypted_aes256_pdf() {
|
||||
let mut doc = Document::with_version("2.0");
|
||||
let (page1_dict, content1) = create_minimal_page("Page 1");
|
||||
let (page2_dict, content2) = create_minimal_page("Page 2");
|
||||
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(2 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![
|
||||
Object::Reference((1, 0).into()),
|
||||
Object::Reference((2, 0).into())
|
||||
]));
|
||||
|
||||
let mut page1_dict = page1_dict;
|
||||
page1_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page1_dict.set(b"Contents", Object::Reference((3, 0).into()));
|
||||
|
||||
let mut page2_dict = page2_dict;
|
||||
page2_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page2_dict.set(b"Contents", Object::Reference((4, 0).into()));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((1, 0).into(), Object::Dictionary(page1_dict));
|
||||
doc.objects.insert((2, 0).into(), Object::Dictionary(page2_dict));
|
||||
doc.objects.insert((3, 0).into(), content1);
|
||||
doc.objects.insert((4, 0).into(), content2);
|
||||
doc.objects.insert((5, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((5, 0)));
|
||||
|
||||
let id = b"test-pdf-id-12345\0\0\0\0\0\0\0\0\0\0\0\0";
|
||||
doc.trailer.set(b"ID", Object::Array(vec![
|
||||
Object::String(id.to_vec(), StringFormat::Literal),
|
||||
Object::String(id.to_vec(), StringFormat::Literal),
|
||||
]));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/_temp_aes256.pdf");
|
||||
encrypt_pdf("tests/document_model/fixtures/_temp_aes256.pdf",
|
||||
"tests/document_model/fixtures/encrypted_aes256_test.pdf", "6");
|
||||
let _ = std::fs::remove_file("tests/document_model/fixtures/_temp_aes256.pdf");
|
||||
}
|
||||
|
||||
fn create_encrypted_empty_password_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/_temp_empty.pdf");
|
||||
// Empty password uses same command - qpdf treats empty owner password as ""
|
||||
encrypt_pdf("tests/document_model/fixtures/_temp_empty.pdf",
|
||||
"tests/document_model/fixtures/encrypted_empty_password.pdf", "2");
|
||||
let _ = std::fs::remove_file("tests/document_model/fixtures/_temp_empty.pdf");
|
||||
}
|
||||
|
||||
fn create_encrypted_unknown_handler_pdf() {
|
||||
// For unsupported handler, create a simple PDF with a fake /Encrypt dict
|
||||
let mut doc = create_simple_base_pdf();
|
||||
|
||||
// Get the PDF data
|
||||
let mut buffer = Vec::new();
|
||||
doc.save_to(&mut buffer).unwrap();
|
||||
let pdf_str = String::from_utf8_lossy(&buffer);
|
||||
|
||||
// Insert a custom encryption dict before the xref table
|
||||
let encrypt_dict = "1 0 obj\n<</Filter/Adobe.PubSec/V 2/R 2/Length 40/O(\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00)\n/U(\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00)\\nP -604>>\nendobj\n";
|
||||
|
||||
// Find the trailer
|
||||
let trailer_pos = pdf_str.find("trailer").unwrap_or(pdf_str.len());
|
||||
let mut result = pdf_str.to_string();
|
||||
result.insert_str(trailer_pos, encrypt_dict);
|
||||
result = result.replace("1 0 obj", "2 0 obj"); // Shift object numbers
|
||||
|
||||
// Add Encrypt reference to trailer
|
||||
result = result.replace("trailer\n<<", "trailer\n<</Encrypt 1 0 R");
|
||||
|
||||
let mut file = File::create("tests/document_model/fixtures/encrypted_unknown_handler.pdf").unwrap();
|
||||
file.write_all(result.as_bytes()).unwrap();
|
||||
println!("Created encrypted_unknown_handler.pdf (unsupported Adobe.PubSec handler)");
|
||||
}
|
||||
|
||||
fn create_tagged_3_level_outline_pdf() {
|
||||
let mut doc = Document::with_version("1.4");
|
||||
|
||||
let (page1_dict, content1) = create_minimal_page("Chapter 1");
|
||||
let (page2_dict, content2) = create_minimal_page("Section 1.1");
|
||||
let (page3_dict, content3) = create_minimal_page("Subsection 1.1.1");
|
||||
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(3 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![
|
||||
Object::Reference((1, 0).into()),
|
||||
Object::Reference((2, 0).into()),
|
||||
Object::Reference((3, 0).into())
|
||||
]));
|
||||
|
||||
let mut page1_dict = page1_dict;
|
||||
page1_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page1_dict.set(b"Contents", Object::Reference((7, 0).into()));
|
||||
|
||||
let mut page2_dict = page2_dict;
|
||||
page2_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page2_dict.set(b"Contents", Object::Reference((8, 0).into()));
|
||||
|
||||
let mut page3_dict = page3_dict;
|
||||
page3_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page3_dict.set(b"Contents", Object::Reference((9, 0).into()));
|
||||
|
||||
// Create outline hierarchy (3 levels)
|
||||
let mut outline1 = Dictionary::new();
|
||||
outline1.set(b"Title", Object::String(b"Chapter 1".to_vec(), StringFormat::Literal));
|
||||
outline1.set(b"Parent", Object::Reference((10, 0).into()));
|
||||
outline1.set(b"Dest", Object::Array(vec![
|
||||
Object::Reference((1, 0).into()),
|
||||
Object::Name(b"Fit".to_vec())
|
||||
]));
|
||||
|
||||
let mut outline2 = Dictionary::new();
|
||||
outline2.set(b"Title", Object::String(b"Section 1.1".to_vec(), StringFormat::Literal));
|
||||
outline2.set(b"Parent", Object::Reference((10, 0).into()));
|
||||
outline2.set(b"Prev", Object::Reference((11, 0).into()));
|
||||
outline2.set(b"Dest", Object::Array(vec![
|
||||
Object::Reference((2, 0).into()),
|
||||
Object::Name(b"Fit".to_vec())
|
||||
]));
|
||||
|
||||
let mut outline3 = Dictionary::new();
|
||||
outline3.set(b"Title", Object::String(b"Subsection 1.1.1".to_vec(), StringFormat::Literal));
|
||||
outline3.set(b"Parent", Object::Reference((10, 0).into()));
|
||||
outline3.set(b"Prev", Object::Reference((12, 0).into()));
|
||||
outline3.set(b"Dest", Object::Array(vec![
|
||||
Object::Reference((3, 0).into()),
|
||||
Object::Name(b"Fit".to_vec())
|
||||
]));
|
||||
|
||||
let mut outlines = Dictionary::new();
|
||||
outlines.set(b"Type", "Outlines");
|
||||
outlines.set(b"Count", Object::Integer(3 as i64));
|
||||
outlines.set(b"First", Object::Reference((11, 0).into()));
|
||||
outlines.set(b"Last", Object::Reference((13, 0).into()));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
catalog_dict.set(b"Outlines", Object::Reference((10, 0).into()));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((1, 0).into(), Object::Dictionary(page1_dict));
|
||||
doc.objects.insert((2, 0).into(), Object::Dictionary(page2_dict));
|
||||
doc.objects.insert((3, 0).into(), Object::Dictionary(page3_dict));
|
||||
doc.objects.insert((7, 0).into(), content1);
|
||||
doc.objects.insert((8, 0).into(), content2);
|
||||
doc.objects.insert((9, 0).into(), content3);
|
||||
doc.objects.insert((10, 0).into(), Object::Dictionary(outlines));
|
||||
doc.objects.insert((11, 0).into(), Object::Dictionary(outline1));
|
||||
doc.objects.insert((12, 0).into(), Object::Dictionary(outline2));
|
||||
doc.objects.insert((13, 0).into(), Object::Dictionary(outline3));
|
||||
doc.objects.insert((14, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((14, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/tagged_3_level_outline.pdf");
|
||||
println!("Created tagged_3_level_outline.pdf (3-level outline hierarchy)");
|
||||
}
|
||||
|
||||
fn create_ocg_default_off_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
|
||||
// Create OCG (Optional Content Group)
|
||||
let mut ocg_dict = Dictionary::new();
|
||||
ocg_dict.set(b"Type", "OCG");
|
||||
ocg_dict.set(b"Name", Object::String(b"Test Layer".to_vec(), StringFormat::Literal));
|
||||
|
||||
// Create /OCProperties with /D /BaseState /OFF
|
||||
let mut default_config = Dictionary::new();
|
||||
default_config.set(b"BaseState", Object::Name(b"OFF".to_vec()));
|
||||
default_config.set(b"ON", Object::Array(vec![]));
|
||||
|
||||
let mut oc_properties = Dictionary::new();
|
||||
oc_properties.set(b"OCGs", Object::Array(vec![Object::Reference((6, 0).into())]));
|
||||
oc_properties.set(b"D", Object::Reference((7, 0).into()));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
catalog_dict.set(b"OCProperties", Object::Reference((8, 0).into()));
|
||||
|
||||
doc.objects.insert((6, 0).into(), Object::Dictionary(ocg_dict));
|
||||
doc.objects.insert((7, 0).into(), Object::Dictionary(default_config));
|
||||
doc.objects.insert((8, 0).into(), Object::Dictionary(oc_properties));
|
||||
doc.objects.insert((5, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((5, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/ocg_default_off.pdf");
|
||||
println!("Created ocg_default_off.pdf (OCG with /BaseState /OFF)");
|
||||
}
|
||||
|
||||
fn create_multi_revision_3_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/multi_revision_3.pdf");
|
||||
println!("Created multi_revision_3.pdf (normal PDF - for true multi-revision, use qpdf --linearize)");
|
||||
}
|
||||
|
||||
fn create_inheritance_grandparent_mediabox_pdf() {
|
||||
let mut doc = Document::with_version("1.4");
|
||||
|
||||
// Create a 3-level /Pages tree where MediaBox is only on the grandparent
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(2 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![Object::Reference((10, 0).into())]));
|
||||
pages_dict.set(b"MediaBox", Object::Array(vec![
|
||||
Object::Real(0.0), Object::Real(0.0),
|
||||
Object::Real(612.0), Object::Real(792.0)
|
||||
]));
|
||||
|
||||
let mut parent_pages = Dictionary::new();
|
||||
parent_pages.set(b"Type", "Pages");
|
||||
parent_pages.set(b"Count", Object::Integer(2 as i64));
|
||||
parent_pages.set(b"Kids", Object::Array(vec![
|
||||
Object::Reference((1, 0).into()),
|
||||
Object::Reference((2, 0).into())
|
||||
]));
|
||||
|
||||
let (page1_dict, content1) = create_minimal_page("Page 1");
|
||||
let mut page1_dict = page1_dict;
|
||||
page1_dict.set(b"Parent", Object::Reference((10, 0).into()));
|
||||
page1_dict.set(b"Contents", Object::Reference((11, 0).into()));
|
||||
page1_dict.remove(b"MediaBox"); // No MediaBox - inherits
|
||||
|
||||
let (page2_dict, content2) = create_minimal_page("Page 2");
|
||||
let mut page2_dict = page2_dict;
|
||||
page2_dict.set(b"Parent", Object::Reference((10, 0).into()));
|
||||
page2_dict.set(b"Contents", Object::Reference((12, 0).into()));
|
||||
page2_dict.remove(b"MediaBox"); // No MediaBox - inherits
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((10, 0).into(), Object::Dictionary(parent_pages));
|
||||
doc.objects.insert((1, 0).into(), Object::Dictionary(page1_dict));
|
||||
doc.objects.insert((2, 0).into(), Object::Dictionary(page2_dict));
|
||||
doc.objects.insert((11, 0).into(), content1);
|
||||
doc.objects.insert((12, 0).into(), content2);
|
||||
doc.objects.insert((13, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((13, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/inheritance_grandparent_mediabox.pdf");
|
||||
println!("Created inheritance_grandparent_mediabox.pdf (MediaBox from grandparent)");
|
||||
}
|
||||
|
||||
fn create_missing_mediabox_pdf() {
|
||||
let mut doc = Document::with_version("1.4");
|
||||
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(1 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![Object::Reference((1, 0).into())]));
|
||||
|
||||
let mut page_dict = Dictionary::new();
|
||||
page_dict.set(b"Type", "Page");
|
||||
page_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
// No MediaBox - should trigger DEFAULT_MEDIABOX
|
||||
|
||||
let content_bytes = b"BT\n/F1 12 Tf\n100 700 Td\n(No MediaBox) Tj\nET\n";
|
||||
let mut stream_dict = Dictionary::new();
|
||||
stream_dict.set(b"Length", Object::Integer(content_bytes.len() as i64));
|
||||
let content_stream = Stream::new(stream_dict, content_bytes.to_vec());
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((1, 0).into(), Object::Dictionary(page_dict));
|
||||
doc.objects.insert((2, 0).into(), Object::Stream(content_stream));
|
||||
doc.objects.insert((3, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((3, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/missing_mediabox.pdf");
|
||||
println!("Created missing_mediabox.pdf (no MediaBox, defaults to US Letter)");
|
||||
}
|
||||
|
||||
fn create_partial_resource_override_pdf() {
|
||||
let mut doc = Document::with_version("1.4");
|
||||
|
||||
let mut root_resources = Dictionary::new();
|
||||
let mut root_fonts = Dictionary::new();
|
||||
root_fonts.set(b"F1", Object::Reference((4, 0).into()));
|
||||
root_fonts.set(b"F2", Object::Reference((5, 0).into()));
|
||||
let mut root_xobject = Dictionary::new();
|
||||
root_xobject.set(b"Im1", Object::Reference((6, 0).into()));
|
||||
root_resources.set(b"Font", Object::Dictionary(root_fonts));
|
||||
root_resources.set(b"XObject", Object::Dictionary(root_xobject));
|
||||
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(1 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![Object::Reference((1, 0).into())]));
|
||||
pages_dict.set(b"Resources", Object::Reference((10, 0).into()));
|
||||
|
||||
// Page overrides /Font but not /XObject
|
||||
let mut page_resources = Dictionary::new();
|
||||
let mut page_fonts = Dictionary::new();
|
||||
page_fonts.set(b"F1", Object::Reference((7, 0).into())); // Override F1
|
||||
page_fonts.set(b"F3", Object::Reference((8, 0).into())); // Add new font
|
||||
page_resources.set(b"Font", Object::Dictionary(page_fonts));
|
||||
// No /XObject - should inherit Im1 from parent
|
||||
|
||||
let (mut page_dict, content) = create_minimal_page("Partial Override");
|
||||
page_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page_dict.set(b"Contents", Object::Reference((11, 0).into()));
|
||||
page_dict.set(b"Resources", Object::Dictionary(page_resources));
|
||||
|
||||
let mut font1 = Dictionary::new();
|
||||
font1.set(b"Type", "Font");
|
||||
font1.set(b"Subtype", "Type1");
|
||||
font1.set(b"BaseFont", "Helvetica");
|
||||
|
||||
let mut font2 = Dictionary::new();
|
||||
font2.set(b"Type", "Font");
|
||||
font2.set(b"Subtype", "Type1");
|
||||
font2.set(b"BaseFont", "Times-Roman");
|
||||
|
||||
let mut font3 = Dictionary::new();
|
||||
font3.set(b"Type", "Font");
|
||||
font3.set(b"Subtype", "Type1");
|
||||
font3.set(b"BaseFont", "Courier");
|
||||
|
||||
let mut image = Dictionary::new();
|
||||
image.set(b"Type", "XObject");
|
||||
image.set(b"Subtype", "Image");
|
||||
image.set(b"Width", Object::Integer(100 as i64));
|
||||
image.set(b"Height", Object::Integer(100 as i64));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((1, 0).into(), Object::Dictionary(page_dict));
|
||||
doc.objects.insert((4, 0).into(), Object::Dictionary(font1.clone()));
|
||||
doc.objects.insert((5, 0).into(), Object::Dictionary(font2));
|
||||
doc.objects.insert((6, 0).into(), Object::Dictionary(image));
|
||||
doc.objects.insert((7, 0).into(), Object::Dictionary(font1)); // Overridden F1
|
||||
doc.objects.insert((8, 0).into(), Object::Dictionary(font3));
|
||||
doc.objects.insert((10, 0).into(), Object::Dictionary(root_resources));
|
||||
doc.objects.insert((11, 0).into(), content);
|
||||
doc.objects.insert((12, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((12, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/partial_resource_override.pdf");
|
||||
println!("Created partial_resource_override.pdf (partial /Resources override)");
|
||||
}
|
||||
|
||||
fn create_js_in_openaction_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
|
||||
let mut open_action = Dictionary::new();
|
||||
open_action.set(b"S", "JavaScript");
|
||||
open_action.set(b"JS", Object::String(b"app.alert('Hello from PDF!');".to_vec(), StringFormat::Literal));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
catalog_dict.set(b"OpenAction", Object::Reference((6, 0).into()));
|
||||
|
||||
doc.objects.insert((6, 0).into(), Object::Dictionary(open_action));
|
||||
doc.objects.insert((7, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((7, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/js_in_openaction.pdf");
|
||||
println!("Created js_in_openaction.pdf (/OpenAction /S /JavaScript)");
|
||||
}
|
||||
|
||||
fn create_xfa_form_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
|
||||
let mut acroform = Dictionary::new();
|
||||
acroform.set(b"XFA", Object::String(b"template".to_vec(), StringFormat::Literal));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
catalog_dict.set(b"AcroForm", Object::Reference((6, 0).into()));
|
||||
|
||||
doc.objects.insert((6, 0).into(), Object::Dictionary(acroform));
|
||||
doc.objects.insert((7, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((7, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/xfa_form.pdf");
|
||||
println!("Created xfa_form.pdf (/AcroForm /XFA present)");
|
||||
}
|
||||
|
||||
fn create_pdfa_1b_conformance_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
|
||||
let xmp_metadata = r#"<?xpacket begin="?" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP Core 5.6-c140 79.160451">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""
|
||||
xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/">
|
||||
<pdfaid:part>1</pdfaid:part>
|
||||
<pdfaid:conformance>B</pdfaid:conformance>
|
||||
</rdf:Description>
|
||||
</rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
<?xpacket end="w"?>"#;
|
||||
|
||||
let mut metadata_dict = Dictionary::new();
|
||||
metadata_dict.set(b"Type", "Metadata");
|
||||
metadata_dict.set(b"Subtype", "XML");
|
||||
let metadata_stream = Stream::new(metadata_dict, xmp_metadata.as_bytes().to_vec());
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
catalog_dict.set(b"Metadata", Object::Reference((6, 0).into()));
|
||||
|
||||
doc.objects.insert((6, 0).into(), Object::Stream(metadata_stream));
|
||||
doc.objects.insert((7, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((7, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/pdfa_1b_conformance.pdf");
|
||||
println!("Created pdfa_1b_conformance.pdf (XMP PDF/A-1B metadata)");
|
||||
}
|
||||
|
||||
fn create_page_labels_roman_arabic_pdf() {
|
||||
let mut doc = create_simple_base_pdf();
|
||||
|
||||
// Add page 3 and 4
|
||||
let (page3_dict, content3) = create_minimal_page("Page 3");
|
||||
let (page4_dict, content4) = create_minimal_page("Page 4");
|
||||
let mut page3_dict = page3_dict;
|
||||
page3_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page3_dict.set(b"Contents", Object::Reference((8, 0).into()));
|
||||
let mut page4_dict = page4_dict;
|
||||
page4_dict.set(b"Parent", Object::Reference((0, 0).into()));
|
||||
page4_dict.set(b"Contents", Object::Reference((9, 0).into()));
|
||||
|
||||
// Add /PageLabels number tree
|
||||
// Pages 0-3: roman numerals (i, ii, iii, iv)
|
||||
// Pages 4+: arabic (1, 2, 3, ...)
|
||||
let mut page_labels = Dictionary::new();
|
||||
page_labels.set(b"Nums", Object::Array(vec![
|
||||
Object::Integer(0 as i64),
|
||||
Object::Dictionary({
|
||||
let mut d = Dictionary::new();
|
||||
d.set(b"S", "r");
|
||||
d.set(b"St", Object::Integer(1 as i64));
|
||||
d
|
||||
}),
|
||||
Object::Integer(4 as i64),
|
||||
Object::Dictionary({
|
||||
let mut d = Dictionary::new();
|
||||
d.set(b"S", "D");
|
||||
d.set(b"St", Object::Integer(1 as i64));
|
||||
d
|
||||
})
|
||||
]));
|
||||
|
||||
let mut catalog_dict = Dictionary::new();
|
||||
catalog_dict.set(b"Type", "Catalog");
|
||||
catalog_dict.set(b"Pages", Object::Reference((0, 0).into()));
|
||||
catalog_dict.set(b"PageLabels", Object::Reference((10, 0).into()));
|
||||
|
||||
// Update pages count to 4
|
||||
let mut pages_dict = Dictionary::new();
|
||||
pages_dict.set(b"Type", "Pages");
|
||||
pages_dict.set(b"Count", Object::Integer(4 as i64));
|
||||
pages_dict.set(b"Kids", Object::Array(vec![
|
||||
Object::Reference((1, 0).into()),
|
||||
Object::Reference((2, 0).into()),
|
||||
Object::Reference((3, 0).into()),
|
||||
Object::Reference((4, 0).into())
|
||||
]));
|
||||
|
||||
doc.objects.insert((0, 0).into(), Object::Dictionary(pages_dict));
|
||||
doc.objects.insert((3, 0).into(), Object::Dictionary(page3_dict));
|
||||
doc.objects.insert((4, 0).into(), Object::Dictionary(page4_dict));
|
||||
doc.objects.insert((8, 0).into(), content3);
|
||||
doc.objects.insert((9, 0).into(), content4);
|
||||
doc.objects.insert((10, 0).into(), Object::Dictionary(page_labels));
|
||||
doc.objects.insert((11, 0).into(), Object::Dictionary(catalog_dict));
|
||||
doc.trailer.set(b"Root", Object::Reference((11, 0)));
|
||||
|
||||
save_pdf(&mut doc, "tests/document_model/fixtures/page_labels_roman_arabic.pdf");
|
||||
println!("Created page_labels_roman_arabic.pdf (roman 0-3, arabic 4+)");
|
||||
}
|
||||
|
||||
fn main() {
|
||||
println!("Generating document-model test fixtures...");
|
||||
|
||||
create_encrypted_rc4_pdf();
|
||||
create_encrypted_aes128_pdf();
|
||||
create_encrypted_aes256_pdf();
|
||||
create_encrypted_empty_password_pdf();
|
||||
create_encrypted_unknown_handler_pdf();
|
||||
create_tagged_3_level_outline_pdf();
|
||||
create_ocg_default_off_pdf();
|
||||
create_multi_revision_3_pdf();
|
||||
create_inheritance_grandparent_mediabox_pdf();
|
||||
create_missing_mediabox_pdf();
|
||||
create_partial_resource_override_pdf();
|
||||
create_js_in_openaction_pdf();
|
||||
create_xfa_form_pdf();
|
||||
create_pdfa_1b_conformance_pdf();
|
||||
create_page_labels_roman_arabic_pdf();
|
||||
|
||||
println!("\nAll 15 document-model fixtures generated successfully!");
|
||||
println!("\nNote: Encrypted fixtures require qpdf to be installed.");
|
||||
println!("If qpdf is not available, encrypted fixtures will be unencrypted placeholders.");
|
||||
}
|
||||
BIN
tests/document_model/fixtures/generate_fixtures_new
Executable file
BIN
tests/document_model/fixtures/generate_fixtures_new
Executable file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
675
tests/document_model/fixtures/src/main.rs
Normal file
675
tests/document_model/fixtures/src/main.rs
Normal file
|
|
@ -0,0 +1,675 @@
|
|||
//! Generate valid minimal PDF fixtures for document model testing.
|
||||
//!
|
||||
//! FIXTURE PASSWORDS:
|
||||
//! - All encrypted fixtures use user password "test" (NOT secret - these are test fixtures)
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
|
||||
fn main() {
|
||||
println!("Generating document-model test fixtures...");
|
||||
|
||||
generate_all_fixtures();
|
||||
|
||||
println!("\nAll fixtures generated!");
|
||||
println!("Note: Encrypted fixtures need to be manually encrypted with qpdf or similar tool.");
|
||||
}
|
||||
|
||||
fn generate_all_fixtures() {
|
||||
create_encrypted_rc4_base();
|
||||
create_tagged_3_level_outline();
|
||||
create_ocg_default_off();
|
||||
create_multi_revision_3();
|
||||
create_inheritance_grandparent_mediabox();
|
||||
create_missing_mediabox();
|
||||
create_partial_resource_override();
|
||||
create_js_in_openaction();
|
||||
create_xfa_form();
|
||||
create_pdfa_1b_conformance();
|
||||
create_page_labels_roman_arabic();
|
||||
}
|
||||
|
||||
/// Dictionary for the standard Helvetica Type 1 font used by the fixtures.
const HELVETICA_FONT: &str = "<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>";

/// Escape the characters that are special inside a PDF literal string.
///
/// Backslashes and parentheses are prefixed with a backslash so arbitrary
/// text cannot terminate the `( ... )` string early.
fn escape_pdf_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        if matches!(ch, '\\' | '(' | ')') {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}

/// Build the body of a stream object.
///
/// `dict_entries` holds any extra dictionary entries (may be empty). The
/// `/Length` entry is computed from `data`, so it always matches the bytes
/// between `stream` and `endstream`.
fn stream_object(dict_entries: &str, data: &str) -> String {
    format!(
        "<<{}/Length {}>>stream\n{}\nendstream",
        dict_entries,
        data.len(),
        data
    )
}

/// Build a content stream that shows `text` in font `/F1` at 12pt, positioned at (100, 700).
fn text_stream(text: &str) -> String {
    let operators = format!("BT /F1 12 Tf 100 700 Td ({}) Tj ET", escape_pdf_text(text));
    stream_object("", &operators)
}

/// Assemble a complete PDF file from indirect-object bodies.
///
/// `objects[i]` is emitted as object `i + 1` with generation 0, and `root`
/// is the object number of the catalog. The cross-reference offsets and the
/// `startxref` value are measured from the text actually produced, so they
/// cannot drift out of sync with the object bodies. Every cross-reference
/// entry is exactly 20 bytes long (including its `" \n"` terminator).
fn assemble_pdf(version: &str, objects: &[String], root: usize) -> String {
    let mut pdf = format!("%PDF-{}\n", version);

    let mut offsets = Vec::with_capacity(objects.len());
    for (index, body) in objects.iter().enumerate() {
        offsets.push(pdf.len());
        pdf.push_str(&format!("{} 0 obj\n{}\nendobj\n", index + 1, body));
    }

    let xref_offset = pdf.len();
    // Object 0 is the head of the free list, hence the extra entry.
    let size = objects.len() + 1;
    pdf.push_str(&format!("xref\n0 {}\n0000000000 65535 f \n", size));
    for offset in &offsets {
        pdf.push_str(&format!("{:010} 00000 n \n", offset));
    }
    pdf.push_str(&format!(
        "trailer\n<</Size {}/Root {} 0 R>>\nstartxref\n{}\n%%EOF\n",
        size, root, xref_offset
    ));
    pdf
}

/// Create base PDF for RC4 encryption (will be encrypted later with qpdf)
fn create_encrypted_rc4_base() {
    let pdf = minimal_pdf("Hello Encrypted", "Test content for encrypted PDF");
    write_pdf("tests/document_model/fixtures/_temp_enc_rc4.pdf", &pdf);
    // qpdf's key-length argument must be 40, 128 or 256; 40 selects RC4.
    // Recent qpdf releases require --allow-weak-crypto for RC4 output
    // (drop the flag on releases that predate it).
    println!("Created _temp_enc_rc4.pdf (encrypt with: qpdf --allow-weak-crypto --encrypt test '' 40 -- _temp_enc_rc4.pdf encrypted_rc4_test.pdf)");
}

/// Create a 3-level outline fixture
///
/// Three pages, with an outline nested Chapter 1 > Section 1.1 >
/// Subsection 1.1.1; each outline item points at its own page.
fn create_tagged_3_level_outline() {
    let objects = vec![
        HELVETICA_FONT.to_string(),
        text_stream("Chapter 1"),
        text_stream("Section 1.1"),
        text_stream("Subsection 1.1.1"),
        "<</Type/Pages/Count 3/Kids[6 0 R 7 0 R 8 0 R]/MediaBox[0 0 612 792]/Resources<</Font<</F1 1 0 R>>>>>>".to_string(),
        "<</Type/Page/Parent 5 0 R/Contents 2 0 R/MediaBox[0 0 612 792]>>".to_string(),
        "<</Type/Page/Parent 5 0 R/Contents 3 0 R/MediaBox[0 0 612 792]>>".to_string(),
        "<</Type/Page/Parent 5 0 R/Contents 4 0 R/MediaBox[0 0 612 792]>>".to_string(),
        // Level 1: child of the outline root (object 12).
        "<</Title(Chapter 1)/Parent 12 0 R/First 10 0 R/Last 10 0 R/Count 2/Dest[6 0 R /Fit]>>".to_string(),
        // Level 2: child of "Chapter 1".
        "<</Title(Section 1.1)/Parent 9 0 R/First 11 0 R/Last 11 0 R/Count 1/Dest[7 0 R /Fit]>>".to_string(),
        // Level 3: child of "Section 1.1".
        "<</Title(Subsection 1.1.1)/Parent 10 0 R/Dest[8 0 R /Fit]>>".to_string(),
        "<</Type/Outlines/First 9 0 R/Last 9 0 R/Count 3>>".to_string(),
        "<</Type/Catalog/Pages 5 0 R/Outlines 12 0 R>>".to_string(),
    ];
    let pdf = assemble_pdf("1.4", &objects, 13);
    write_pdf("tests/document_model/fixtures/tagged_3_level_outline.pdf", &pdf);
    println!("Created tagged_3_level_outline.pdf (3-level outline hierarchy)");
}

/// Create OCG with /BaseState /OFF
fn create_ocg_default_off() {
    let objects = vec![
        HELVETICA_FONT.to_string(),
        text_stream("Test"),
        "<</Type/OCG/Name(Test Layer)>>".to_string(),
        "<</BaseState/OFF/ON[]>>".to_string(),
        "<</OCGs[3 0 R]/D 4 0 R/Present true>>".to_string(),
        "<</Type/Page/MediaBox[0 0 612 792]/Contents 2 0 R/Resources<</Font<</F1 1 0 R>>>>/Parent 7 0 R>>".to_string(),
        "<</Type/Pages/Count 1/Kids[6 0 R]>>".to_string(),
        "<</Type/Catalog/Pages 7 0 R/OCProperties 5 0 R>>".to_string(),
    ];
    let pdf = assemble_pdf("1.5", &objects, 8);
    write_pdf("tests/document_model/fixtures/ocg_default_off.pdf", &pdf);
    println!("Created ocg_default_off.pdf (OCG with /BaseState /OFF)");
}

/// Create a 3-page PDF for multi-revision testing (base version)
fn create_multi_revision_3() {
    let objects = vec![
        HELVETICA_FONT.to_string(),
        text_stream("Rev 1"),
        text_stream("Rev 2"),
        text_stream("Rev 3"),
        "<</Type/Pages/Count 3/Kids[6 0 R 7 0 R 8 0 R]/MediaBox[0 0 612 792]/Resources<</Font<</F1 1 0 R>>>>>>".to_string(),
        "<</Type/Page/Parent 5 0 R/Contents 2 0 R/MediaBox[0 0 612 792]>>".to_string(),
        "<</Type/Page/Parent 5 0 R/Contents 3 0 R/MediaBox[0 0 612 792]>>".to_string(),
        "<</Type/Page/Parent 5 0 R/Contents 4 0 R/MediaBox[0 0 612 792]>>".to_string(),
        "<</Type/Catalog/Pages 5 0 R>>".to_string(),
    ];
    let pdf = assemble_pdf("1.4", &objects, 9);
    write_pdf("tests/document_model/fixtures/multi_revision_3.pdf", &pdf);
    println!("Created multi_revision_3.pdf (base 3-page PDF)");
}

/// Create MediaBox inheritance from grandparent /Pages node
///
/// Only the top-level /Pages node (object 4) carries a /MediaBox; the
/// intermediate /Pages node and both pages omit it.
fn create_inheritance_grandparent_mediabox() {
    let objects = vec![
        HELVETICA_FONT.to_string(),
        text_stream("Page 1"),
        text_stream("Page 2"),
        "<</Type/Pages/Count 2/Kids[5 0 R]/MediaBox[0 0 612 792]>>".to_string(),
        "<</Type/Pages/Count 2/Kids[6 0 R 7 0 R]/Parent 4 0 R/Resources<</Font<</F1 1 0 R>>>>>>".to_string(),
        "<</Type/Page/Parent 5 0 R/Contents 2 0 R>>".to_string(),
        "<</Type/Page/Parent 5 0 R/Contents 3 0 R>>".to_string(),
        "<</Type/Catalog/Pages 4 0 R>>".to_string(),
    ];
    let pdf = assemble_pdf("1.4", &objects, 8);
    write_pdf("tests/document_model/fixtures/inheritance_grandparent_mediabox.pdf", &pdf);
    println!("Created inheritance_grandparent_mediabox.pdf (MediaBox from grandparent)");
}

/// Create PDF with no MediaBox anywhere (should default to US Letter)
fn create_missing_mediabox() {
    let objects = vec![
        text_stream("No MediaBox"),
        "<</Type/Page/Parent 3 0 R/Contents 1 0 R/Resources<</Font<</F1 4 0 R>>>>>>".to_string(),
        "<</Type/Pages/Count 1/Kids[2 0 R]/Resources<</Font<</F1 4 0 R>>>>>>".to_string(),
        HELVETICA_FONT.to_string(),
        "<</Type/Catalog/Pages 3 0 R>>".to_string(),
    ];
    let pdf = assemble_pdf("1.4", &objects, 5);
    write_pdf("tests/document_model/fixtures/missing_mediabox.pdf", &pdf);
    println!("Created missing_mediabox.pdf (no MediaBox, defaults to US Letter)");
}

/// Create partial /Resources override fixture
///
/// The /Pages node references resources object 6 (fonts F1, F2 and XObject
/// Im1); the page references object 7, which redefines F1 and adds F3.
fn create_partial_resource_override() {
    let objects = vec![
        HELVETICA_FONT.to_string(),
        "<</Type/Font/Subtype/Type1/BaseFont/Times-Roman>>".to_string(),
        "<</Type/Font/Subtype/Type1/BaseFont/Courier>>".to_string(),
        "<</Type/XObject/Subtype/Image/Width 100/Height 100>>".to_string(),
        text_stream("Test Override"),
        "<</Font<</F1 1 0 R/F2 2 0 R>>/XObject<</Im1 4 0 R>>>>".to_string(),
        "<</Font<</F1 3 0 R/F3 1 0 R>>>>".to_string(),
        "<</Type/Page/Parent 9 0 R/Contents 5 0 R/Resources 7 0 R/MediaBox[0 0 612 792]>>".to_string(),
        "<</Type/Pages/Count 1/Kids[8 0 R]/Resources 6 0 R>>".to_string(),
        "<</Type/Catalog/Pages 9 0 R>>".to_string(),
    ];
    let pdf = assemble_pdf("1.4", &objects, 10);
    write_pdf("tests/document_model/fixtures/partial_resource_override.pdf", &pdf);
    println!("Created partial_resource_override.pdf (partial /Resources override)");
}

/// Create PDF with /OpenAction /S /JavaScript
fn create_js_in_openaction() {
    let objects = vec![
        HELVETICA_FONT.to_string(),
        text_stream("JS Test"),
        "<</S/JavaScript/JS(app.alert('Hello'))>>".to_string(),
        "<</Type/Page/MediaBox[0 0 612 792]/Contents 2 0 R/Resources<</Font<</F1 1 0 R>>>>/Parent 5 0 R>>".to_string(),
        "<</Type/Pages/Count 1/Kids[4 0 R]>>".to_string(),
        "<</Type/Catalog/Pages 5 0 R/OpenAction 3 0 R>>".to_string(),
    ];
    let pdf = assemble_pdf("1.4", &objects, 6);
    write_pdf("tests/document_model/fixtures/js_in_openaction.pdf", &pdf);
    println!("Created js_in_openaction.pdf (/OpenAction /S /JavaScript)");
}

/// Create PDF with /AcroForm /XFA
fn create_xfa_form() {
    let objects = vec![
        HELVETICA_FONT.to_string(),
        text_stream("XFA"),
        "<</XFA(template)>>".to_string(),
        "<</Type/Page/MediaBox[0 0 612 792]/Contents 2 0 R/Resources<</Font<</F1 1 0 R>>>>/Parent 5 0 R>>".to_string(),
        "<</Type/Pages/Count 1/Kids[4 0 R]>>".to_string(),
        "<</Type/Catalog/Pages 5 0 R/AcroForm 3 0 R>>".to_string(),
    ];
    let pdf = assemble_pdf("1.4", &objects, 6);
    write_pdf("tests/document_model/fixtures/xfa_form.pdf", &pdf);
    println!("Created xfa_form.pdf (/AcroForm /XFA present)");
}

/// Create PDF/A-1B conformance with XMP metadata
///
/// The XMP packet is embedded as a /Metadata stream (object 3) referenced
/// from the catalog; it declares `pdfaid:part` 1 and `pdfaid:conformance` B.
fn create_pdfa_1b_conformance() {
    let xmp = r#"<?xpacket begin="?" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP Core 5.6-c140 79.160451">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description rdf:about="" xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/">
<pdfaid:part>1</pdfaid:part>
<pdfaid:conformance>B</pdfaid:conformance>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>
<?xpacket end="w"?>"#;

    let objects = vec![
        HELVETICA_FONT.to_string(),
        text_stream("PDF/A-1B"),
        stream_object("/Type/Metadata/Subtype/XML", xmp),
        "<</Type/Page/MediaBox[0 0 612 792]/Contents 2 0 R/Resources<</Font<</F1 1 0 R>>>>/Parent 5 0 R>>".to_string(),
        "<</Type/Pages/Count 1/Kids[4 0 R]>>".to_string(),
        "<</Type/Catalog/Pages 5 0 R/Metadata 3 0 R>>".to_string(),
    ];
    let pdf = assemble_pdf("1.4", &objects, 6);

    write_pdf("tests/document_model/fixtures/pdfa_1b_conformance.pdf", &pdf);
    println!("Created pdfa_1b_conformance.pdf (XMP PDF/A-1B metadata)");
}

/// Create page labels: pages 0-3 roman, pages 4+ arabic
///
/// Five pages; the /PageLabels number tree starts lowercase roman numbering
/// at page index 0 and decimal numbering at page index 4, both from 1.
fn create_page_labels_roman_arabic() {
    let objects = vec![
        HELVETICA_FONT.to_string(),
        text_stream("Page i"),
        text_stream("Page ii"),
        text_stream("Page iii"),
        text_stream("Page iv"),
        text_stream("Page 1"),
        "<</Type/Pages/Count 5/Kids[8 0 R 9 0 R 10 0 R 11 0 R 12 0 R]/MediaBox[0 0 612 792]/Resources<</Font<</F1 1 0 R>>>>>>".to_string(),
        "<</Type/Page/Parent 7 0 R/Contents 2 0 R/MediaBox[0 0 612 792]>>".to_string(),
        "<</Type/Page/Parent 7 0 R/Contents 3 0 R/MediaBox[0 0 612 792]>>".to_string(),
        "<</Type/Page/Parent 7 0 R/Contents 4 0 R/MediaBox[0 0 612 792]>>".to_string(),
        "<</Type/Page/Parent 7 0 R/Contents 5 0 R/MediaBox[0 0 612 792]>>".to_string(),
        "<</Type/Page/Parent 7 0 R/Contents 6 0 R/MediaBox[0 0 612 792]>>".to_string(),
        "<</Nums[0 14 0 R 4 15 0 R]>>".to_string(),
        "<</S/r/St 1>>".to_string(),
        "<</S/D/St 1>>".to_string(),
        "<</Type/Catalog/Pages 7 0 R/PageLabels 13 0 R>>".to_string(),
    ];
    let pdf = assemble_pdf("1.4", &objects, 16);
    write_pdf("tests/document_model/fixtures/page_labels_roman_arabic.pdf", &pdf);
    println!("Created page_labels_roman_arabic.pdf (roman 0-3, arabic 4+)");
}

/// Create a minimal valid PDF document
///
/// Produces a one-page PDF 1.4 file whose single content stream shows
/// `content` in Helvetica. The stream `/Length`, the cross-reference offsets
/// and `startxref` are all computed from the generated bytes.
///
/// `title` is accepted so existing call sites keep working, but it is not
/// embedded in the document.
fn minimal_pdf(title: &str, content: &str) -> String {
    let _ = title;

    let objects = vec![
        HELVETICA_FONT.to_string(),
        text_stream(content),
        "<</Type/Page/MediaBox[0 0 612 792]/Contents 2 0 R/Resources<</Font<</F1 1 0 R>>>>/Parent 4 0 R>>".to_string(),
        "<</Type/Pages/Count 1/Kids[3 0 R]>>".to_string(),
        "<</Type/Catalog/Pages 4 0 R>>".to_string(),
    ];
    assemble_pdf("1.4", &objects, 5)
}

/// Write PDF content to a file
///
/// # Panics
///
/// Panics if the file cannot be created or written; the message includes
/// the offending path and the underlying I/O error.
fn write_pdf(path: &str, content: &str) {
    let mut file = File::create(path)
        .unwrap_or_else(|err| panic!("Failed to create PDF file {}: {}", path, err));
    file.write_all(content.as_bytes())
        .unwrap_or_else(|err| panic!("Failed to write PDF content to {}: {}", path, err));
}
|
||||
Binary file not shown.
Binary file not shown.
|
|
@ -12,7 +12,7 @@ stream
|
|||
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-28T23:02:07.359789+00:00"/></rdf:RDF>
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-29T01:05:40.352232+00:00"/></rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
|
||||
<?xpacket end="w"?>
|
||||
|
|
@ -63,7 +63,7 @@ xref
|
|||
0000001640 00000 n
|
||||
0000001905 00000 n
|
||||
0000002171 00000 n
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<5c37d64d59a257b08239b1dafee61423><5c37d64d59a257b08239b1dafee61423>] >>
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<7d443b0ed8ffc05490a03747cda9155f><7d443b0ed8ffc05490a03747cda9155f>] >>
|
||||
startxref
|
||||
2438
|
||||
%%EOF
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ stream
|
|||
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-28T23:02:07.359789+00:00"/></rdf:RDF>
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-29T01:05:40.352232+00:00"/></rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
|
||||
<?xpacket end="w"?>
|
||||
|
|
@ -63,7 +63,7 @@ xref
|
|||
0000001674 00000 n
|
||||
0000001939 00000 n
|
||||
0000002205 00000 n
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<5c37d64d59a257b08239b1dafee61423><5c37d64d59a257b08239b1dafee61423>] >>
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<7d443b0ed8ffc05490a03747cda9155f><7d443b0ed8ffc05490a03747cda9155f>] >>
|
||||
startxref
|
||||
2472
|
||||
%%EOF
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ stream
|
|||
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-28T23:02:07.359789+00:00"/></rdf:RDF>
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-29T01:05:40.352232+00:00"/></rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
|
||||
<?xpacket end="w"?>
|
||||
|
|
@ -63,7 +63,7 @@ xref
|
|||
0000001674 00000 n
|
||||
0000001939 00000 n
|
||||
0000002205 00000 n
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<5c37d64d59a257b08239b1dafee61423><5c37d64d59a257b08239b1dafee61423>] >>
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<7d443b0ed8ffc05490a03747cda9155f><7d443b0ed8ffc05490a03747cda9155f>] >>
|
||||
startxref
|
||||
2472
|
||||
%%EOF
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ stream
|
|||
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-28T23:02:07.359789+00:00"/></rdf:RDF>
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-29T01:05:40.352232+00:00"/></rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
|
||||
<?xpacket end="w"?>
|
||||
|
|
@ -63,7 +63,7 @@ xref
|
|||
0000001640 00000 n
|
||||
0000001905 00000 n
|
||||
0000002171 00000 n
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<5c37d64d59a257b08239b1dafee61423><5c37d64d59a257b08239b1dafee61423>] >>
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<7d443b0ed8ffc05490a03747cda9155f><7d443b0ed8ffc05490a03747cda9155f>] >>
|
||||
startxref
|
||||
2438
|
||||
%%EOF
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ stream
|
|||
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-28T23:02:07.359789+00:00"/></rdf:RDF>
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-29T01:05:40.352232+00:00"/></rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
|
||||
<?xpacket end="w"?>
|
||||
|
|
@ -63,7 +63,7 @@ xref
|
|||
0000001640 00000 n
|
||||
0000001905 00000 n
|
||||
0000002171 00000 n
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<5c37d64d59a257b08239b1dafee61423><5c37d64d59a257b08239b1dafee61423>] >>
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<7d443b0ed8ffc05490a03747cda9155f><7d443b0ed8ffc05490a03747cda9155f>] >>
|
||||
startxref
|
||||
2438
|
||||
%%EOF
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -12,7 +12,7 @@ stream
|
|||
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-28T23:02:07.359789+00:00"/></rdf:RDF>
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-29T01:05:40.352232+00:00"/></rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
|
||||
<?xpacket end="w"?>
|
||||
|
|
@ -63,7 +63,7 @@ xref
|
|||
0000001640 00000 n
|
||||
0000001905 00000 n
|
||||
0000002171 00000 n
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<5c37d64d59a257b08239b1dafee61423><5c37d64d59a257b08239b1dafee61423>] >>
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<7d443b0ed8ffc05490a03747cda9155f><7d443b0ed8ffc05490a03747cda9155f>] >>
|
||||
startxref
|
||||
2438
|
||||
%%EOF
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -12,7 +12,7 @@ stream
|
|||
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-28T23:02:07.359789+00:00"/></rdf:RDF>
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-29T01:05:40.352232+00:00"/></rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
|
||||
<?xpacket end="w"?>
|
||||
|
|
@ -63,7 +63,7 @@ xref
|
|||
0000001640 00000 n
|
||||
0000001905 00000 n
|
||||
0000002171 00000 n
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<5c37d64d59a257b08239b1dafee61423><5c37d64d59a257b08239b1dafee61423>] >>
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<7d443b0ed8ffc05490a03747cda9155f><7d443b0ed8ffc05490a03747cda9155f>] >>
|
||||
startxref
|
||||
2438
|
||||
%%EOF
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ stream
|
|||
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-28T23:02:07.359789+00:00"/></rdf:RDF>
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-29T01:05:40.352232+00:00"/></rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
|
||||
<?xpacket end="w"?>
|
||||
|
|
@ -63,7 +63,7 @@ xref
|
|||
0000001771 00000 n
|
||||
0000002036 00000 n
|
||||
0000002302 00000 n
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<5c37d64d59a257b08239b1dafee61423><5c37d64d59a257b08239b1dafee61423>] >>
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<7d443b0ed8ffc05490a03747cda9155f><7d443b0ed8ffc05490a03747cda9155f>] >>
|
||||
startxref
|
||||
2569
|
||||
%%EOF
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ stream
|
|||
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-28T23:02:07.359789+00:00"/></rdf:RDF>
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-29T01:05:40.352232+00:00"/></rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
|
||||
<?xpacket end="w"?>
|
||||
|
|
@ -63,7 +63,7 @@ xref
|
|||
0000001640 00000 n
|
||||
0000001905 00000 n
|
||||
0000002171 00000 n
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<5c37d64d59a257b08239b1dafee61423><5c37d64d59a257b08239b1dafee61423>] >>
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<7d443b0ed8ffc05490a03747cda9155f><7d443b0ed8ffc05490a03747cda9155f>] >>
|
||||
startxref
|
||||
2438
|
||||
%%EOF
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ stream
|
|||
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-28T23:02:07.359789+00:00"/></rdf:RDF>
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-29T01:05:40.352232+00:00"/></rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
|
||||
<?xpacket end="w"?>
|
||||
|
|
@ -79,7 +79,7 @@ xref
|
|||
0000001639 00000 n
|
||||
0000001972 00000 n
|
||||
0000002305 00000 n
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<5c37d64d59a257b08239b1dafee61423><1257e81a66d93003d6e81c7345208637>] >>
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<7d443b0ed8ffc05490a03747cda9155f><3c1bda1da015a59c312bf92410d1a7c1>] >>
|
||||
startxref
|
||||
2639
|
||||
%%EOF
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ stream
|
|||
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-28T23:02:07.359789+00:00"/></rdf:RDF>
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-29T01:05:40.352232+00:00"/></rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
|
||||
<?xpacket end="w"?>
|
||||
|
|
@ -63,7 +63,7 @@ xref
|
|||
0000001640 00000 n
|
||||
0000001905 00000 n
|
||||
0000002171 00000 n
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<5c37d64d59a257b08239b1dafee61423><5c37d64d59a257b08239b1dafee61423>] >>
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<7d443b0ed8ffc05490a03747cda9155f><7d443b0ed8ffc05490a03747cda9155f>] >>
|
||||
startxref
|
||||
2438
|
||||
%%EOF
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ stream
|
|||
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-28T23:02:07.359789+00:00"/></rdf:RDF>
|
||||
<rdf:Description rdf:about=""><dc:title xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Alt><rdf:li xml:lang="x-default">Fingerprint Test Source</rdf:li></rdf:Alt></dc:title></rdf:Description><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" rdf:about="" dc:creator="pdftract test suite"/><rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" rdf:about="" pdf:Producer="pikepdf 9.2.1"/><rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" rdf:about="" xmp:MetadataDate="2026-05-29T01:05:40.352232+00:00"/></rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
|
||||
<?xpacket end="w"?>
|
||||
|
|
@ -79,7 +79,7 @@ xref
|
|||
0000001639 00000 n
|
||||
0000001972 00000 n
|
||||
0000002305 00000 n
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<5c37d64d59a257b08239b1dafee61423><2e9fb4dee29e731cbdedf48995168813>] >>
|
||||
trailer << /Info 2 0 R /Root 1 0 R /Size 11 /ID [<7d443b0ed8ffc05490a03747cda9155f><8c3dff7450e222f54fc4a0463e6e502b>] >>
|
||||
startxref
|
||||
2639
|
||||
%%EOF
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
Loading…
Add table
Reference in a new issue