Rewrote FileSource to use memmap2 for zero-copy random access. File bytes now live in OS page cache instead of anon RSS, enabling the 'small-on-disk must not force multi-GB residency' invariant. Changes: - Added memmap2 = "0.9" dependency to pdftract-core - Replaced fs::File-based FileSource with memmap2::Mmap - Added source_tests module with 5 unit tests (all pass) - Removed fs::read fallback for unbounded files per Anti-Patterns Closes: bf-2ervu
923 lines
33 KiB
Rust
923 lines
33 KiB
Rust
//! PDF fixture generator for xref testing.
|
|
//!
|
|
//! This tool generates minimal PDF files with specific xref structures
|
|
//! for testing the pdftract xref resolver.
|
|
|
|
use std::fs::File;
|
|
use std::io::{BufWriter, Seek, Write};
|
|
use std::path::PathBuf;
|
|
use std::process;
|
|
|
|
/// The kinds of xref fixture this tool can emit.
#[derive(Debug, Clone, Copy)]
enum FixtureType {
    /// A well-formed document indexed by a traditional xref table.
    WellFormedTraditional,
    /// A well-formed document indexed by an xref stream (PDF 1.5).
    WellFormedStream,
    /// A hybrid document: traditional xref table plus an `/XRefStm` entry.
    HybridFile,
    /// A document with 3 incremental revisions linked through `/Prev`.
    PrevChain3Revisions,
    /// A linearized document (50 pages).
    Linearized,
    /// A document cut off at the start of its xref section.
    TruncatedAfterXref,
    /// A document whose `startxref` offset is off by one.
    StartxrefOffByOne,
    /// A document with a single corrupt xref entry.
    CorruptXrefEntry,
    /// A document whose `/Prev` references form a cycle.
    CircularPrev,
    /// A document with 50 incremental revisions (exercises the depth limit).
    DeepPrevChain,
}
|
|
|
|
impl FixtureType {
|
|
fn name(&self) -> &'static str {
|
|
match self {
|
|
Self::WellFormedTraditional => "well_formed_traditional.pdf",
|
|
Self::WellFormedStream => "well_formed_stream.pdf",
|
|
Self::HybridFile => "hybrid_file.pdf",
|
|
Self::PrevChain3Revisions => "prev_chain_3_revisions.pdf",
|
|
Self::Linearized => "linearized.pdf",
|
|
Self::TruncatedAfterXref => "truncated_after_xref.pdf",
|
|
Self::StartxrefOffByOne => "startxref_off_by_one.pdf",
|
|
Self::CorruptXrefEntry => "corrupt_xref_entry.pdf",
|
|
Self::CircularPrev => "circular_prev.pdf",
|
|
Self::DeepPrevChain => "deep_prev_chain.pdf",
|
|
}
|
|
}
|
|
}
|
|
|
|
/// State shared by all fixture-writing routines.
struct Generator {
    /// Directory that every generated fixture file is placed in.
    output_dir: PathBuf,
}
|
|
|
|
impl Generator {
|
|
fn new(output_dir: PathBuf) -> Self {
|
|
Self { output_dir }
|
|
}
|
|
|
|
/// Generate a single fixture.
|
|
fn generate(&self, fixture_type: FixtureType) {
|
|
let filename = PathBuf::from(fixture_type.name());
|
|
let output_path = self.output_dir.join(filename);
|
|
|
|
match fixture_type {
|
|
FixtureType::WellFormedTraditional => {
|
|
self.generate_well_formed_traditional(&output_path);
|
|
}
|
|
FixtureType::WellFormedStream => {
|
|
self.generate_well_formed_stream(&output_path);
|
|
}
|
|
FixtureType::HybridFile => {
|
|
self.generate_hybrid_file(&output_path);
|
|
}
|
|
FixtureType::PrevChain3Revisions => {
|
|
self.generate_prev_chain_3(&output_path);
|
|
}
|
|
FixtureType::Linearized => {
|
|
self.generate_linearized(&output_path);
|
|
}
|
|
FixtureType::TruncatedAfterXref => {
|
|
// Start with well-formed, then truncate
|
|
let base_path = self
|
|
.output_dir
|
|
.join(FixtureType::WellFormedTraditional.name());
|
|
self.generate_truncated(&base_path, &output_path);
|
|
}
|
|
FixtureType::StartxrefOffByOne => {
|
|
// Start with well-formed, then modify startxref
|
|
let base_path = self
|
|
.output_dir
|
|
.join(FixtureType::WellFormedTraditional.name());
|
|
self.generate_startxref_off_by_one(&base_path, &output_path);
|
|
}
|
|
FixtureType::CorruptXrefEntry => {
|
|
// Start with well-formed, then corrupt one entry
|
|
let base_path = self
|
|
.output_dir
|
|
.join(FixtureType::WellFormedTraditional.name());
|
|
self.generate_corrupt_entry(&base_path, &output_path);
|
|
}
|
|
FixtureType::CircularPrev => {
|
|
self.generate_circular_prev(&output_path);
|
|
}
|
|
FixtureType::DeepPrevChain => {
|
|
self.generate_deep_prev_chain(&output_path);
|
|
}
|
|
}
|
|
|
|
println!("Generated: {:?}", output_path);
|
|
}
|
|
|
|
/// Generate a well-formed PDF with traditional xref table.
|
|
fn generate_well_formed_traditional(&self, output_path: &PathBuf) {
|
|
let file = File::create(output_path).unwrap_or_else(|e| {
|
|
panic!("Failed to create {:?}: {}", output_path, e);
|
|
});
|
|
let mut w = BufWriter::new(file);
|
|
|
|
// PDF header
|
|
writeln!(w, "%PDF-1.4").unwrap();
|
|
|
|
// Object 1: Catalog
|
|
writeln!(w, "1 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Catalog").unwrap();
|
|
writeln!(w, " /Pages 2 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 2: Page tree root
|
|
writeln!(w, "2 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Pages").unwrap();
|
|
writeln!(w, " /Kids [3 0 R]").unwrap();
|
|
writeln!(w, " /Count 1").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 3: Page
|
|
writeln!(w, "3 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Page").unwrap();
|
|
writeln!(w, " /Parent 2 0 R").unwrap();
|
|
writeln!(w, " /MediaBox [0 0 612 792]").unwrap();
|
|
writeln!(w, " /Resources << /Font << >> >>").unwrap();
|
|
writeln!(w, " /Contents 4 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 4: Contents (empty stream)
|
|
writeln!(w, "4 0 obj").unwrap();
|
|
writeln!(w, "<< /Length 0 >>").unwrap();
|
|
writeln!(w, "stream").unwrap();
|
|
writeln!(w, "endstream").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 5: Info
|
|
writeln!(w, "5 0 obj").unwrap();
|
|
writeln!(w, "<< /Title (Test Document)").unwrap();
|
|
writeln!(w, " /Producer (build-xref-fixture)").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Track xref offset
|
|
let xref_offset = w.stream_position().unwrap();
|
|
|
|
// Traditional xref table
|
|
writeln!(w, "xref").unwrap();
|
|
writeln!(w, "0 6").unwrap();
|
|
writeln!(w, "0000000000 65535 f ").unwrap();
|
|
writeln!(w, "0000000017 00000 n ").unwrap(); // Object 1
|
|
writeln!(w, "0000000082 00000 n ").unwrap(); // Object 2
|
|
writeln!(w, "0000000160 00000 n ").unwrap(); // Object 3
|
|
writeln!(w, "0000000269 00000 n ").unwrap(); // Object 4
|
|
writeln!(w, "0000000341 00000 n ").unwrap(); // Object 5
|
|
|
|
// Trailer
|
|
writeln!(w, "trailer").unwrap();
|
|
writeln!(w, "<< /Size 6").unwrap();
|
|
writeln!(w, " /Root 1 0 R").unwrap();
|
|
writeln!(w, " /Info 5 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
|
|
// startxref
|
|
writeln!(w, "startxref").unwrap();
|
|
writeln!(w, "{}", xref_offset).unwrap();
|
|
|
|
// EOF
|
|
writeln!(w, "%%EOF").unwrap();
|
|
|
|
w.flush().unwrap();
|
|
}
|
|
|
|
/// Generate a well-formed PDF with xref stream (PDF 1.5).
|
|
fn generate_well_formed_stream(&self, output_path: &PathBuf) {
|
|
let file = File::create(output_path).unwrap_or_else(|e| {
|
|
panic!("Failed to create {:?}: {}", output_path, e);
|
|
});
|
|
let mut w = BufWriter::new(file);
|
|
|
|
// PDF header (1.5 for xref stream support)
|
|
writeln!(w, "%PDF-1.5").unwrap();
|
|
|
|
// Object 1: Catalog
|
|
writeln!(w, "1 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Catalog").unwrap();
|
|
writeln!(w, " /Pages 2 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 2: Page tree root
|
|
writeln!(w, "2 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Pages").unwrap();
|
|
writeln!(w, " /Kids [3 0 R]").unwrap();
|
|
writeln!(w, " /Count 1").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 3: Page
|
|
writeln!(w, "3 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Page").unwrap();
|
|
writeln!(w, " /Parent 2 0 R").unwrap();
|
|
writeln!(w, " /MediaBox [0 0 612 792]").unwrap();
|
|
writeln!(w, " /Resources << /Font << >> >>").unwrap();
|
|
writeln!(w, " /Contents 4 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 4: Contents (empty stream)
|
|
writeln!(w, "4 0 obj").unwrap();
|
|
writeln!(w, "<< /Length 0 >>").unwrap();
|
|
writeln!(w, "stream").unwrap();
|
|
writeln!(w, "endstream").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Track xref stream offset
|
|
let xref_stream_offset = w.stream_position().unwrap();
|
|
|
|
// Object 5: XRef stream
|
|
// /W = [1 4 2] means: type=1 byte, offset=4 bytes, gen=2 bytes
|
|
writeln!(w, "5 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /XRef").unwrap();
|
|
writeln!(w, " /Size 6").unwrap();
|
|
writeln!(w, " /W [1 4 2]").unwrap();
|
|
writeln!(w, " /Index [0 6]").unwrap();
|
|
writeln!(w, " /Root 1 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "stream").unwrap();
|
|
|
|
// Xref stream data:
|
|
// Entry 0: type 0 (free), next_free=0, gen=65535
|
|
// Entry 1: type 1 (in-use), offset=17, gen=0
|
|
// Entry 2: type 1 (in-use), offset=82, gen=0
|
|
// Entry 3: type 1 (in-use), offset=160, gen=0
|
|
// Entry 4: type 1 (in-use), offset=269, gen=0
|
|
// Entry 5: type 1 (in-use), offset=348, gen=0
|
|
let xref_data = [
|
|
// Type=1 byte, Offset=4 bytes (big-endian), Gen=2 bytes (big-endian)
|
|
0u8, 0, 0, 0, 0, 255, 255, // Entry 0: free
|
|
1, 0, 0, 0, 17, 0, 0, // Entry 1: in-use at offset 17
|
|
1, 0, 0, 0, 82, 0, 0, // Entry 2: in-use at offset 82
|
|
1, 0, 0, 0, 160, 0, 0, // Entry 3: in-use at offset 160
|
|
1, 0, 0, 1, 13, 0, 0, // Entry 4: in-use at offset 269
|
|
1, 0, 0, 1, 92, 0, 0, // Entry 5: in-use at offset 348 (this stream itself)
|
|
];
|
|
|
|
w.write_all(&xref_data).unwrap();
|
|
writeln!(w, "\nendstream").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// startxref
|
|
writeln!(w, "startxref").unwrap();
|
|
writeln!(w, "{}", xref_stream_offset).unwrap();
|
|
|
|
// EOF
|
|
writeln!(w, "%%EOF").unwrap();
|
|
|
|
w.flush().unwrap();
|
|
}
|
|
|
|
/// Generate a hybrid file with traditional xref + /XRefStm.
|
|
fn generate_hybrid_file(&self, output_path: &PathBuf) {
|
|
let file = File::create(output_path).unwrap_or_else(|e| {
|
|
panic!("Failed to create {:?}: {}", output_path, e);
|
|
});
|
|
let mut w = BufWriter::new(file);
|
|
|
|
// PDF header (1.5 for hybrid support)
|
|
writeln!(w, "%PDF-1.5").unwrap();
|
|
|
|
// Object 1: Catalog
|
|
writeln!(w, "1 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Catalog").unwrap();
|
|
writeln!(w, " /Pages 2 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 2: Page tree root
|
|
writeln!(w, "2 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Pages").unwrap();
|
|
writeln!(w, " /Kids [3 0 R]").unwrap();
|
|
writeln!(w, " /Count 1").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 3: Page
|
|
writeln!(w, "3 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Page").unwrap();
|
|
writeln!(w, " /Parent 2 0 R").unwrap();
|
|
writeln!(w, " /MediaBox [0 0 612 792]").unwrap();
|
|
writeln!(w, " /Resources << /Font << >> >>").unwrap();
|
|
writeln!(w, " /Contents 4 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 4: Contents (empty stream)
|
|
writeln!(w, "4 0 obj").unwrap();
|
|
writeln!(w, "<< /Length 0 >>").unwrap();
|
|
writeln!(w, "stream").unwrap();
|
|
writeln!(w, "endstream").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 5: XRef stream (will be referenced from /XRefStm)
|
|
writeln!(w, "5 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /XRef").unwrap();
|
|
writeln!(w, " /Size 7").unwrap();
|
|
writeln!(w, " /W [1 4 2]").unwrap();
|
|
writeln!(w, " /Index [0 7]").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "stream").unwrap();
|
|
|
|
// Xref stream data with one overlapping entry (object 6)
|
|
let xref_data = [
|
|
0u8, 0, 0, 0, 0, 255, 255, // Entry 0: free
|
|
0, 0, 0, 0, 0, 0, 0, // Entry 1: free (overlaps traditional)
|
|
0, 0, 0, 0, 0, 0, 0, // Entry 2: free
|
|
0, 0, 0, 0, 0, 0, 0, // Entry 3: free
|
|
0, 0, 0, 0, 0, 0, 0, // Entry 4: free
|
|
0, 0, 0, 0, 0, 0, 0, // Entry 5: free
|
|
1, 0, 0, 1, 244, 0, 0, // Entry 6: new object in stream only (offset 500)
|
|
];
|
|
|
|
w.write_all(&xref_data).unwrap();
|
|
writeln!(w, "\nendstream").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 6: Additional object (only in xref stream)
|
|
writeln!(w, "6 0 obj").unwrap();
|
|
writeln!(w, "(Additional object)").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Track xref offset
|
|
let xref_offset = w.stream_position().unwrap();
|
|
|
|
// Traditional xref table (covers objects 0-5)
|
|
writeln!(w, "xref").unwrap();
|
|
writeln!(w, "0 6").unwrap();
|
|
writeln!(w, "0000000000 65535 f ").unwrap();
|
|
writeln!(w, "0000000017 00000 n ").unwrap(); // Object 1 (overlaps with stream's free entry)
|
|
writeln!(w, "0000000082 00000 n ").unwrap(); // Object 2
|
|
writeln!(w, "0000000160 00000 n ").unwrap(); // Object 3
|
|
writeln!(w, "0000000269 00000 n ").unwrap(); // Object 4
|
|
writeln!(w, "0000000341 00000 n ").unwrap(); // Object 5
|
|
|
|
// Trailer with /XRefStm
|
|
writeln!(w, "trailer").unwrap();
|
|
writeln!(w, "<< /Size 7").unwrap();
|
|
writeln!(w, " /Root 1 0 R").unwrap();
|
|
writeln!(w, " /XRefStm 341").unwrap(); // Points to object 5 (xref stream)
|
|
writeln!(w, ">>").unwrap();
|
|
|
|
// startxref
|
|
writeln!(w, "startxref").unwrap();
|
|
writeln!(w, "{}", xref_offset).unwrap();
|
|
|
|
// EOF
|
|
writeln!(w, "%%EOF").unwrap();
|
|
|
|
w.flush().unwrap();
|
|
}
|
|
|
|
/// Generate a PDF with 3 incremental revisions.
|
|
fn generate_prev_chain_3(&self, output_path: &PathBuf) {
|
|
let file = File::create(output_path).unwrap_or_else(|e| {
|
|
panic!("Failed to create {:?}: {}", output_path, e);
|
|
});
|
|
let mut w = BufWriter::new(file);
|
|
|
|
// PDF header
|
|
writeln!(w, "%PDF-1.4").unwrap();
|
|
|
|
// === Revision 1 (baseline) ===
|
|
|
|
// Object 1: Catalog
|
|
writeln!(w, "1 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Catalog").unwrap();
|
|
writeln!(w, " /Pages 2 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 2: Page tree root
|
|
writeln!(w, "2 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Pages").unwrap();
|
|
writeln!(w, " /Kids [3 0 R]").unwrap();
|
|
writeln!(w, " /Count 1").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 3: Page
|
|
writeln!(w, "3 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Page").unwrap();
|
|
writeln!(w, " /Parent 2 0 R").unwrap();
|
|
writeln!(w, " /MediaBox [0 0 612 792]").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 4: Info
|
|
writeln!(w, "4 0 obj").unwrap();
|
|
writeln!(w, "<< /Title (Revision 1)>>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 5: Will be modified in revision 2
|
|
writeln!(w, "5 0 obj").unwrap();
|
|
writeln!(w, "(Original value)").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
let xref1_offset = w.stream_position().unwrap();
|
|
|
|
// First xref + trailer
|
|
writeln!(w, "xref").unwrap();
|
|
writeln!(w, "0 6").unwrap();
|
|
writeln!(w, "0000000000 65535 f ").unwrap();
|
|
writeln!(w, "0000000017 00000 n ").unwrap();
|
|
writeln!(w, "0000000082 00000 n ").unwrap();
|
|
writeln!(w, "0000000160 00000 n ").unwrap();
|
|
writeln!(w, "0000000249 00000 n ").unwrap();
|
|
writeln!(w, "0000000290 00000 n ").unwrap();
|
|
|
|
writeln!(w, "trailer").unwrap();
|
|
writeln!(w, "<< /Size 6").unwrap();
|
|
writeln!(w, " /Root 1 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
|
|
writeln!(w, "startxref").unwrap();
|
|
writeln!(w, "{}", xref1_offset).unwrap();
|
|
writeln!(w, "%%EOF").unwrap();
|
|
|
|
// === Revision 2 (incremental update) ===
|
|
|
|
// Modify object 5
|
|
writeln!(w, "5 1 obj").unwrap();
|
|
writeln!(w, "(Modified in revision 2)").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Add object 6
|
|
writeln!(w, "6 0 obj").unwrap();
|
|
writeln!(w, "(Added in revision 2)").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
let xref2_offset = w.stream_position().unwrap();
|
|
|
|
// Second xref + trailer with /Prev
|
|
writeln!(w, "xref").unwrap();
|
|
writeln!(w, "5 2").unwrap();
|
|
writeln!(w, "0000000341 00001 n ").unwrap(); // Object 5, gen 1
|
|
writeln!(w, "0000000382 00000 n ").unwrap(); // Object 6, gen 0
|
|
|
|
writeln!(w, "trailer").unwrap();
|
|
writeln!(w, "<< /Size 7").unwrap();
|
|
writeln!(w, " /Root 1 0 R").unwrap();
|
|
writeln!(w, " /Prev {}", xref1_offset).unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
|
|
writeln!(w, "startxref").unwrap();
|
|
writeln!(w, "{}", xref2_offset).unwrap();
|
|
writeln!(w, "%%EOF").unwrap();
|
|
|
|
// === Revision 3 (another incremental update) ===
|
|
|
|
// Modify object 5 again
|
|
writeln!(w, "5 2 obj").unwrap();
|
|
writeln!(w, "(Modified in revision 3)").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
let xref3_offset = w.stream_position().unwrap();
|
|
|
|
// Third xref + trailer with /Prev
|
|
writeln!(w, "xref").unwrap();
|
|
writeln!(w, "5 1").unwrap();
|
|
writeln!(w, "0000000433 00002 n ").unwrap(); // Object 5, gen 2
|
|
|
|
writeln!(w, "trailer").unwrap();
|
|
writeln!(w, "<< /Size 7").unwrap();
|
|
writeln!(w, " /Root 1 0 R").unwrap();
|
|
writeln!(w, " /Prev {}", xref2_offset).unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
|
|
writeln!(w, "startxref").unwrap();
|
|
writeln!(w, "{}", xref3_offset).unwrap();
|
|
writeln!(w, "%%EOF").unwrap();
|
|
|
|
w.flush().unwrap();
|
|
}
|
|
|
|
/// Generate a linearized PDF (50 pages).
|
|
fn generate_linearized(&self, output_path: &PathBuf) {
|
|
let file = File::create(output_path).unwrap_or_else(|e| {
|
|
panic!("Failed to create {:?}: {}", output_path, e);
|
|
});
|
|
let mut w = BufWriter::new(file);
|
|
|
|
// PDF header
|
|
writeln!(w, "%PDF-1.4").unwrap();
|
|
|
|
let _lin_dict_offset = w.stream_position().unwrap();
|
|
|
|
// Linearized dictionary (object 1)
|
|
writeln!(w, "1 0 obj").unwrap();
|
|
writeln!(w, "<< /Linearized 1.0").unwrap();
|
|
writeln!(w, " /L 10000").unwrap(); // Placeholder file length
|
|
writeln!(w, " /H [1010 50]").unwrap(); // Hint stream offset/length
|
|
writeln!(w, " /O 4").unwrap(); // First page object number
|
|
writeln!(w, " /E 500").unwrap(); // End of first page
|
|
writeln!(w, " /N 50").unwrap(); // Number of pages
|
|
writeln!(w, " /T 6000").unwrap(); // Offset of first-page xref
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 2: First-page xref (partial, for linearized viewing)
|
|
writeln!(w, "2 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /XRef").unwrap();
|
|
writeln!(w, " /Size 6").unwrap();
|
|
writeln!(w, " /W [1 4 2]").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "stream").unwrap();
|
|
// Minimal xref data for first page objects
|
|
let first_page_xref = [
|
|
0u8, 0, 0, 0, 0, 255, 255, 1, 0, 0, 0, 17, 0, 0, 1, 0, 0, 0, 120, 0, 0, 1, 0, 0, 0,
|
|
210, 0, 0, 1, 0, 0, 1, 44, 0, 0,
|
|
];
|
|
w.write_all(&first_page_xref).unwrap();
|
|
writeln!(w, "\nendstream").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 3: Hint stream
|
|
writeln!(w, "3 0 obj").unwrap();
|
|
writeln!(w, "<< /Length 0 >>").unwrap();
|
|
writeln!(w, "stream").unwrap();
|
|
writeln!(w, "endstream").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 4: First page
|
|
writeln!(w, "4 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Page").unwrap();
|
|
writeln!(w, " /MediaBox [0 0 612 792]").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Object 5: Catalog
|
|
writeln!(w, "5 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Catalog").unwrap();
|
|
writeln!(w, " /Pages 6 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Placeholder for remaining pages...
|
|
for i in 6..60 {
|
|
writeln!(w, "{} 0 obj", i).unwrap();
|
|
writeln!(w, "(Page {})", i).unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
}
|
|
|
|
// Full xref at EOF (placeholder offset)
|
|
let full_xref_offset = w.stream_position().unwrap();
|
|
|
|
writeln!(w, "xref").unwrap();
|
|
writeln!(w, "0 60").unwrap();
|
|
writeln!(w, "0000000000 65535 f ").unwrap();
|
|
for i in 1..60 {
|
|
writeln!(w, "0000000{} 00000 n ", i).unwrap();
|
|
}
|
|
|
|
writeln!(w, "trailer").unwrap();
|
|
writeln!(w, "<< /Size 60").unwrap();
|
|
writeln!(w, " /Root 5 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
|
|
writeln!(w, "startxref").unwrap();
|
|
writeln!(w, "{}", full_xref_offset).unwrap();
|
|
writeln!(w, "%%EOF").unwrap();
|
|
|
|
w.flush().unwrap();
|
|
}
|
|
|
|
/// Generate a truncated file from a base file.
|
|
fn generate_truncated(&self, base_path: &PathBuf, output_path: &PathBuf) {
|
|
// Read base file
|
|
let base_data = std::fs::read(base_path).unwrap_or_else(|e| {
|
|
panic!("Failed to read base file {:?}: {}", base_path, e);
|
|
});
|
|
|
|
// Find the xref keyword
|
|
let xref_pos = base_data
|
|
.windows(4)
|
|
.rposition(|w| w == b"xref")
|
|
.expect("xref keyword not found in base file");
|
|
|
|
// Truncate just before the xref table
|
|
let truncated_len = xref_pos;
|
|
|
|
let file = File::create(output_path).unwrap_or_else(|e| {
|
|
panic!("Failed to create {:?}: {}", output_path, e);
|
|
});
|
|
let mut w = BufWriter::new(file);
|
|
|
|
w.write_all(&base_data[..truncated_len]).unwrap();
|
|
w.flush().unwrap();
|
|
}
|
|
|
|
/// Generate a file with startxref offset off by one.
|
|
fn generate_startxref_off_by_one(&self, base_path: &PathBuf, output_path: &PathBuf) {
|
|
// Read base file
|
|
let base_data = std::fs::read(base_path).unwrap_or_else(|e| {
|
|
panic!("Failed to read base file {:?}: {}", base_path, e);
|
|
});
|
|
|
|
// Find "startxref" and modify the offset after it
|
|
let startxref_pos = base_data
|
|
.windows(9)
|
|
.rposition(|w| w == b"startxref")
|
|
.expect("startxref keyword not found in base file");
|
|
|
|
// Parse the offset after startxref
|
|
let after_startxref = &base_data[startxref_pos + 9..];
|
|
let offset_str_end = after_startxref
|
|
.iter()
|
|
.position(|&b| b == b'\n' || b == b'\r')
|
|
.unwrap_or(after_startxref.len());
|
|
|
|
let offset_str = std::str::from_utf8(&after_startxref[..offset_str_end]).unwrap_or("0");
|
|
|
|
if let Ok(mut offset) = offset_str.parse::<u64>() {
|
|
// Modify offset by +1
|
|
offset += 1;
|
|
|
|
// Replace the offset in the data
|
|
let new_offset_str = offset.to_string();
|
|
let new_bytes = new_offset_str.as_bytes();
|
|
|
|
// Ensure we have enough space
|
|
let replacement_start = startxref_pos + 9;
|
|
let replacement_end = replacement_start + offset_str_end;
|
|
|
|
let mut new_data = base_data.to_vec();
|
|
new_data[replacement_start..replacement_end].copy_from_slice(new_bytes);
|
|
|
|
let file = File::create(output_path).unwrap_or_else(|e| {
|
|
panic!("Failed to create {:?}: {}", output_path, e);
|
|
});
|
|
let mut w = BufWriter::new(file);
|
|
w.write_all(&new_data).unwrap();
|
|
w.flush().unwrap();
|
|
}
|
|
}
|
|
|
|
/// Generate a file with one corrupt xref entry.
|
|
fn generate_corrupt_entry(&self, base_path: &PathBuf, output_path: &PathBuf) {
|
|
// Read base file
|
|
let mut base_data = std::fs::read(base_path).unwrap_or_else(|e| {
|
|
panic!("Failed to read base file {:?}: {}", base_path, e);
|
|
});
|
|
|
|
// Find the xref table
|
|
let xref_pos = base_data
|
|
.windows(4)
|
|
.rposition(|w| w == b"xref")
|
|
.expect("xref keyword not found in base file");
|
|
|
|
// Find the first xref entry (after "0 6\n")
|
|
let entries_start = xref_pos + 4;
|
|
|
|
// Find the first newline after the subsection header
|
|
let header_end = base_data[entries_start..]
|
|
.iter()
|
|
.position(|&b| b == b'\n')
|
|
.map(|p| entries_start + p)
|
|
.unwrap_or(entries_start);
|
|
|
|
// Corrupt the first non-zero entry (object 1)
|
|
// Each entry is 20 bytes, skip object 0 (free entry)
|
|
let entry1_start = header_end + 1 + 20;
|
|
|
|
if entry1_start + 10 <= base_data.len() {
|
|
// Modify the offset to be invalid
|
|
base_data[entry1_start..entry1_start + 10].copy_from_slice(b"9999999999");
|
|
}
|
|
|
|
let file = File::create(output_path).unwrap_or_else(|e| {
|
|
panic!("Failed to create {:?}: {}", output_path, e);
|
|
});
|
|
let mut w = BufWriter::new(file);
|
|
w.write_all(&base_data).unwrap();
|
|
w.flush().unwrap();
|
|
}
|
|
|
|
/// Generate a file with circular /Prev reference.
|
|
fn generate_circular_prev(&self, output_path: &PathBuf) {
|
|
let file = File::create(output_path).unwrap_or_else(|e| {
|
|
panic!("Failed to create {:?}: {}", output_path, e);
|
|
});
|
|
let mut w = BufWriter::new(file);
|
|
|
|
// PDF header
|
|
writeln!(w, "%PDF-1.4").unwrap();
|
|
|
|
// Minimal objects
|
|
writeln!(w, "1 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Catalog").unwrap();
|
|
writeln!(w, " /Pages 2 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
writeln!(w, "2 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Pages").unwrap();
|
|
writeln!(w, " /Kids [3 0 R]").unwrap();
|
|
writeln!(w, " /Count 1").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
writeln!(w, "3 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Page").unwrap();
|
|
writeln!(w, " /Parent 2 0 R").unwrap();
|
|
writeln!(w, " /MediaBox [0 0 612 792]").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Calculate the offset of Xref B by generating it first to an in-memory buffer
|
|
let mut xref_b_data = Vec::new();
|
|
{
|
|
let mut w_b = BufWriter::new(&mut xref_b_data);
|
|
writeln!(w_b, "xref").unwrap();
|
|
writeln!(w_b, "0 1").unwrap();
|
|
writeln!(w_b, "0000000000 65535 f ").unwrap();
|
|
|
|
writeln!(w_b, "trailer").unwrap();
|
|
writeln!(w_b, "<< /Size 4").unwrap();
|
|
writeln!(w_b, " /Root 1 0 R").unwrap();
|
|
writeln!(w_b, ">>").unwrap(); // /Prev will be added later
|
|
|
|
writeln!(w_b, "startxref").unwrap();
|
|
writeln!(w_b, "0").unwrap(); // Placeholder
|
|
writeln!(w_b, "%%EOF").unwrap();
|
|
w_b.flush().unwrap();
|
|
}
|
|
|
|
// Now we know the approximate size of Xref B
|
|
// Calculate Xref A offset (current position)
|
|
let xref_a_offset = w.stream_position().unwrap();
|
|
|
|
// Calculate Xref B offset (Xref A offset + size of Xref A)
|
|
let xref_a_size = 200; // Approximate size of first xref + trailer
|
|
let xref_b_offset = xref_a_offset + xref_a_size;
|
|
|
|
// Xref A points to Xref B
|
|
writeln!(w, "xref").unwrap();
|
|
writeln!(w, "0 4").unwrap();
|
|
writeln!(w, "0000000000 65535 f ").unwrap();
|
|
writeln!(w, "0000000017 00000 n ").unwrap();
|
|
writeln!(w, "0000000082 00000 n ").unwrap();
|
|
writeln!(w, "0000000160 00000 n ").unwrap();
|
|
|
|
writeln!(w, "trailer").unwrap();
|
|
writeln!(w, "<< /Size 4").unwrap();
|
|
writeln!(w, " /Root 1 0 R").unwrap();
|
|
writeln!(w, " /Prev {}", xref_b_offset).unwrap(); // Points to Xref B
|
|
writeln!(w, ">>").unwrap();
|
|
|
|
writeln!(w, "startxref").unwrap();
|
|
writeln!(w, "{}", xref_a_offset).unwrap();
|
|
writeln!(w, "%%EOF").unwrap();
|
|
|
|
// Xref B points back to Xref A (creates cycle)
|
|
// Get the actual offset now
|
|
let actual_xref_b_offset = w.stream_position().unwrap();
|
|
|
|
writeln!(w, "xref").unwrap();
|
|
writeln!(w, "0 1").unwrap();
|
|
writeln!(w, "0000000000 65535 f ").unwrap();
|
|
|
|
writeln!(w, "trailer").unwrap();
|
|
writeln!(w, "<< /Size 4").unwrap();
|
|
writeln!(w, " /Root 1 0 R").unwrap();
|
|
writeln!(w, " /Prev {}", xref_a_offset).unwrap(); // Points back to Xref A
|
|
writeln!(w, ">>").unwrap();
|
|
|
|
writeln!(w, "startxref").unwrap();
|
|
writeln!(w, "{}", actual_xref_b_offset).unwrap();
|
|
writeln!(w, "%%EOF").unwrap();
|
|
|
|
w.flush().unwrap();
|
|
}
|
|
|
|
/// Generate a file with 50 incremental revisions (tests depth limit).
|
|
fn generate_deep_prev_chain(&self, output_path: &PathBuf) {
|
|
let file = File::create(output_path).unwrap_or_else(|e| {
|
|
panic!("Failed to create {:?}: {}", output_path, e);
|
|
});
|
|
let mut w = BufWriter::new(file);
|
|
|
|
// PDF header
|
|
writeln!(w, "%PDF-1.4").unwrap();
|
|
|
|
// Minimal baseline objects
|
|
writeln!(w, "1 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Catalog").unwrap();
|
|
writeln!(w, " /Pages 2 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
writeln!(w, "2 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Pages").unwrap();
|
|
writeln!(w, " /Kids [3 0 R]").unwrap();
|
|
writeln!(w, " /Count 1").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
writeln!(w, "3 0 obj").unwrap();
|
|
writeln!(w, "<< /Type /Page").unwrap();
|
|
writeln!(w, " /Parent 2 0 R").unwrap();
|
|
writeln!(w, " /MediaBox [0 0 612 792]").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
// Baseline xref
|
|
let mut prev_offset = w.stream_position().unwrap();
|
|
|
|
writeln!(w, "xref").unwrap();
|
|
writeln!(w, "0 4").unwrap();
|
|
writeln!(w, "0000000000 65535 f ").unwrap();
|
|
writeln!(w, "0000000017 00000 n ").unwrap();
|
|
writeln!(w, "0000000082 00000 n ").unwrap();
|
|
writeln!(w, "0000000160 00000 n ").unwrap();
|
|
|
|
writeln!(w, "trailer").unwrap();
|
|
writeln!(w, "<< /Size 4").unwrap();
|
|
writeln!(w, " /Root 1 0 R").unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
|
|
writeln!(w, "startxref").unwrap();
|
|
writeln!(w, "{}", prev_offset).unwrap();
|
|
writeln!(w, "%%EOF").unwrap();
|
|
|
|
// Generate 50 incremental revisions
|
|
for i in 1..=50 {
|
|
// Add a new object in each revision
|
|
writeln!(w, "{} 0 obj", 3 + i).unwrap();
|
|
writeln!(w, "(Revision {})", i).unwrap();
|
|
writeln!(w, "endobj").unwrap();
|
|
|
|
let new_offset = w.stream_position().unwrap();
|
|
|
|
writeln!(w, "xref").unwrap();
|
|
writeln!(w, "{} 1", 3 + i).unwrap();
|
|
let offset = i * 50 + 200;
|
|
let offset_str = format!("{:010}", offset);
|
|
writeln!(w, "{} 00000 n ", offset_str).unwrap();
|
|
|
|
writeln!(w, "trailer").unwrap();
|
|
writeln!(w, "<< /Size {}", 4 + i).unwrap();
|
|
writeln!(w, " /Root 1 0 R").unwrap();
|
|
writeln!(w, " /Prev {}", prev_offset).unwrap();
|
|
writeln!(w, ">>").unwrap();
|
|
|
|
writeln!(w, "startxref").unwrap();
|
|
writeln!(w, "{}", new_offset).unwrap();
|
|
writeln!(w, "%%EOF").unwrap();
|
|
|
|
prev_offset = new_offset;
|
|
}
|
|
|
|
w.flush().unwrap();
|
|
}
|
|
}
|
|
|
|
fn main() {
|
|
let args: Vec<String> = std::env::args().collect();
|
|
|
|
if args.len() < 2 {
|
|
eprintln!("Usage: {} <output-dir>", args[0]);
|
|
eprintln!("\nGenerates PDF fixtures for xref testing.");
|
|
process::exit(1);
|
|
}
|
|
|
|
let output_dir = PathBuf::from(&args[1]);
|
|
|
|
// Create output directory if it doesn't exist
|
|
std::fs::create_dir_all(&output_dir).unwrap_or_else(|e| {
|
|
panic!("Failed to create output directory {:?}: {}", output_dir, e);
|
|
});
|
|
|
|
let gen = Generator::new(output_dir);
|
|
|
|
// Generate all fixture types
|
|
for fixture_type in [
|
|
FixtureType::WellFormedTraditional,
|
|
FixtureType::WellFormedStream,
|
|
FixtureType::HybridFile,
|
|
FixtureType::PrevChain3Revisions,
|
|
FixtureType::Linearized,
|
|
FixtureType::TruncatedAfterXref,
|
|
FixtureType::StartxrefOffByOne,
|
|
FixtureType::CorruptXrefEntry,
|
|
FixtureType::CircularPrev,
|
|
FixtureType::DeepPrevChain,
|
|
] {
|
|
gen.generate(fixture_type);
|
|
}
|
|
|
|
println!("\nAll fixtures generated successfully!");
|
|
println!("Run with BLESS=1 to generate golden files:");
|
|
println!(" BLESS=1 cargo test -p pdftract-core --test integration -- xref");
|
|
}
|