- Add Pdftract.swift.tera for main public API with type aliases - Update Methods.swift.tera with async throws functions and AsyncThrowingStream for streaming - Update Errors.swift.tera with 8 error types implementing LocalizedError - Update Types.swift.tera with Source enum, Options structs, and all Codable types - Update ConformanceTests.swift.tera with XCTest-based conformance suite - Update README.md.tera with full documentation (install, usage, error handling) - Update Package.swift.tera with macOS(.v13) and Linux platform support Closes pdftract-5lvpu
2506 lines
85 KiB
Rust
2506 lines
85 KiB
Rust
use fontdue::Font;
|
|
use serde::{Deserialize, Serialize};
|
|
use serde_json::Value;
|
|
use std::collections::{BTreeMap, HashMap};
|
|
use std::fs;
|
|
use std::path::{Path, PathBuf};
|
|
use std::process::{Command, Stdio};
|
|
use std::time::{Duration, Instant};
|
|
|
|
// Import for Context trait used in validate_schema
|
|
use anyhow::Context as AnyhowContext;
|
|
|
|
/// Build a `Dictionary` from `"key" => value` pairs.
///
/// Expands to a block that calls `Dictionary::new()`, then `set(key, value)`
/// once per pair in the order written, and evaluates to the populated
/// dictionary. A trailing comma is accepted. `Dictionary` is resolved at the
/// call site, so the type must be in scope wherever the macro is invoked.
macro_rules! dictionary {
    ($( $name:literal => $entry:expr ),* $(,)?) => {{
        let mut built = Dictionary::new();
        $(
            built.set($name, $entry);
        )*
        built
    }};
}
|
|
|
|
/// Locate the workspace root directory.
///
/// The search starts at the process's current directory (or at its parent
/// when the current directory's last component is `xtask`) and walks upward
/// until it reaches a directory whose `Cargo.toml` contains the text
/// `[workspace]`. If no such manifest is found before the filesystem root,
/// the current directory is returned as a fallback.
///
/// # Panics
///
/// Panics if the current directory cannot be determined.
fn find_workspace_root() -> PathBuf {
    let cwd = std::env::current_dir().unwrap();

    // When invoked from inside the xtask directory, begin one level up.
    let start: &Path = if cwd.ends_with("xtask") {
        cwd.parent().unwrap()
    } else {
        &cwd
    };

    let found = start.ancestors().find(|dir| {
        let manifest = dir.join("Cargo.toml");
        // An unreadable manifest is treated as empty, i.e. "not a workspace".
        manifest.exists()
            && fs::read_to_string(&manifest)
                .unwrap_or_default()
                .contains("[workspace]")
    });

    match found {
        Some(root) => root.to_path_buf(),
        // Fallback: use current directory if no workspace manifest was found.
        None => std::env::current_dir().unwrap(),
    }
}
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
struct Profile {
|
|
description: String,
|
|
#[serde(default)]
|
|
profile_fields: BTreeMap<String, ProfileField>,
|
|
#[serde(default)]
|
|
r#match: MatchConfig,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
struct ProfileField {
|
|
#[serde(rename = "type")]
|
|
field_type: String,
|
|
#[serde(default)]
|
|
extraction: ExtractionConfig,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, Default)]
|
|
struct ExtractionConfig {
|
|
#[serde(default)]
|
|
patterns: Vec<String>,
|
|
#[serde(default)]
|
|
region_hint: Option<String>,
|
|
#[serde(default)]
|
|
table_region: Option<String>,
|
|
#[serde(default)]
|
|
columnar_regions: Option<String>,
|
|
#[serde(default)]
|
|
per_page: Option<bool>,
|
|
#[serde(default)]
|
|
#[allow(dead_code)]
|
|
fallback: serde_yaml::Value,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, Default)]
|
|
struct MatchConfig {
|
|
#[serde(default)]
|
|
any: Vec<MatchClause>,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, Default)]
|
|
struct MatchClause {
|
|
#[serde(default)]
|
|
text_patterns: Vec<String>,
|
|
#[serde(default)]
|
|
structural: Vec<serde_yaml::Value>,
|
|
}
|
|
|
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
let args: Vec<String> = std::env::args().collect();
|
|
|
|
if args.len() < 2 {
|
|
eprintln!("Usage: xtask <command>");
|
|
eprintln!("Commands:");
|
|
eprintln!(" doc-profile <profile-name> Generate README skeleton for a profile");
|
|
eprintln!(" doc-profiles Generate README skeletons for all profiles");
|
|
eprintln!(" generate-stress-pdfs Generate stress-test PDFs for memory ceiling testing");
|
|
eprintln!(" generate-page-class-fixtures Generate page classification test fixtures");
|
|
eprintln!(" generate-brokenvector-fixtures Generate BrokenVector OCR test fixtures");
|
|
eprintln!(" generate-sensitive-fixture Generate password-protected PDF for TH-08 log audit test");
|
|
eprintln!(" gen-schema Generate JSON Schema from Rust output types");
|
|
eprintln!(" validate-schema Validate checked-in schema matches generated");
|
|
eprintln!(
|
|
" gen-shape-db Generate glyph shape database from font files"
|
|
);
|
|
eprintln!(" memory-ceiling Run memory ceiling tests against perf/malformed corpora");
|
|
std::process::exit(1);
|
|
}
|
|
|
|
let result = match args[1].as_str() {
|
|
"doc-profile" => {
|
|
if args.len() < 3 {
|
|
eprintln!("Usage: xtask doc-profile <profile-name>");
|
|
std::process::exit(1);
|
|
}
|
|
generate_profile_readme(&args[2])?;
|
|
Ok(())
|
|
}
|
|
"doc-profiles" => {
|
|
let profiles_dir = find_workspace_root().join("profiles/builtin");
|
|
for entry in fs::read_dir(&profiles_dir)? {
|
|
let entry = entry?;
|
|
if entry.path().is_dir() {
|
|
let profile_name = entry.file_name().to_string_lossy().to_string();
|
|
if let Err(e) = generate_profile_readme(&profile_name) {
|
|
eprintln!("Error generating README for {}: {}", profile_name, e);
|
|
}
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
"generate-stress-pdfs" => {
|
|
generate_stress_pdfs()?;
|
|
Ok(())
|
|
}
|
|
"generate-page-class-fixtures" => {
|
|
generate_page_class_fixtures()?;
|
|
Ok(())
|
|
}
|
|
"generate-brokenvector-fixtures" => {
|
|
generate_brokenvector_fixtures()?;
|
|
Ok(())
|
|
}
|
|
"generate-sensitive-fixture" => {
|
|
generate_sensitive_fixture()?;
|
|
Ok(())
|
|
}
|
|
"gen-schema" => {
|
|
gen_schema()?;
|
|
Ok(())
|
|
}
|
|
"validate-schema" => {
|
|
validate_schema()?;
|
|
Ok(())
|
|
}
|
|
"memory-ceiling" => {
|
|
run_memory_ceiling_tests()?;
|
|
Ok(())
|
|
}
|
|
"gen-shape-db" => {
|
|
let fonts_dir = if args.len() >= 3 {
|
|
args[2].clone()
|
|
} else {
|
|
eprintln!("Usage: xtask gen-shape-db <fonts-dir>");
|
|
std::process::exit(1);
|
|
};
|
|
let output_path = if args.len() >= 4 {
|
|
args[3].clone()
|
|
} else {
|
|
"build/glyph-shapes.json".to_string()
|
|
};
|
|
gen_shape_db(&fonts_dir, &output_path)?;
|
|
Ok(())
|
|
}
|
|
_ => {
|
|
eprintln!("Unknown command: {}", args[1]);
|
|
std::process::exit(1);
|
|
}
|
|
};
|
|
|
|
result
|
|
}
|
|
|
|
/// Generate JSON Schema from Rust output types.
|
|
///
|
|
/// Delegates to the gen_schema binary.
|
|
fn gen_schema() -> Result<(), Box<dyn std::error::Error>> {
|
|
// Invoke the gen_schema binary
|
|
let status = std::process::Command::new("cargo")
|
|
.args(["run", "--bin", "gen_schema"])
|
|
.current_dir(find_workspace_root())
|
|
.status()?;
|
|
|
|
if !status.success() {
|
|
return Err(format!("gen_schema failed with exit code: {:?}", status.code()).into());
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Validate that the checked-in schema matches the generated schema.
|
|
///
|
|
/// Regenerates the schema and diffs against the checked-in version.
|
|
/// Fails if there's any difference, indicating the schema needs to be regenerated.
|
|
fn validate_schema() -> Result<(), Box<dyn std::error::Error>> {
|
|
use std::io::Write;
|
|
|
|
let workspace_root = find_workspace_root();
|
|
let schema_path = workspace_root.join("docs/schema/v1.0/pdftract.schema.json");
|
|
|
|
// Read the checked-in schema
|
|
let checked_in = fs::read_to_string(&schema_path)
|
|
.with_context(|| format!("Failed to read checked-in schema from {}", schema_path.display()))?;
|
|
|
|
// Generate the schema in memory
|
|
let generated = generate_schema()?;
|
|
|
|
// Compare
|
|
if checked_in == generated {
|
|
println!("✓ Schema is up-to-date: {}", schema_path.display());
|
|
Ok(())
|
|
} else {
|
|
eprintln!("✗ Schema drift detected: {}", schema_path.display());
|
|
eprintln!("");
|
|
eprintln!("The checked-in schema does not match the generated schema.");
|
|
eprintln!("Run 'cargo run --manifest-path=xtask/Cargo.toml --bin gen_schema'");
|
|
eprintln!("to regenerate the schema and commit the changes.");
|
|
eprintln!("");
|
|
|
|
// Show a diff-like preview
|
|
let checked_in_lines: Vec<&str> = checked_in.lines().collect();
|
|
let generated_lines: Vec<&str> = generated.lines().collect();
|
|
|
|
let diff = diff_lines(&checked_in_lines, &generated_lines);
|
|
if !diff.is_empty() {
|
|
eprintln!("Differences (first 50 lines):");
|
|
eprintln!("{}", diff.lines().take(50).collect::<Vec<_>>().join("\n"));
|
|
}
|
|
|
|
Err("Schema drift detected".into())
|
|
}
|
|
}
|
|
|
|
/// Generate schema in memory (shared with gen_schema binary).
|
|
fn generate_schema() -> Result<String, Box<dyn std::error::Error>> {
|
|
use pdftract_core::schema::Output;
|
|
use schemars::schema_for;
|
|
use serde_json::Value;
|
|
|
|
let schema = schema_for!(Output);
|
|
let mut value = serde_json::to_value(&schema)?;
|
|
|
|
// Set $id, title, and description
|
|
if let Some(obj) = value.as_object_mut() {
|
|
obj.insert("$id".to_string(), Value::String(
|
|
"https://pdftract.com/schema/v1.0/pdftract.schema.json".to_string()
|
|
));
|
|
obj.insert("title".to_string(), Value::String(
|
|
"pdftract Output v1.0".to_string()
|
|
));
|
|
obj.insert("description".to_string(), Value::String(
|
|
"JSON Schema for pdftract PDF extraction output v1.0. \
|
|
This schema defines the structure of extraction results including pages, \
|
|
spans, blocks, tables, form fields, signatures, and metadata."
|
|
.to_string()
|
|
));
|
|
}
|
|
|
|
// Add enum constraints (copied from gen_schema.rs)
|
|
add_enum_constraints(&mut value);
|
|
|
|
// Sort for stable output
|
|
let sorted = sort_keys_recursive(value);
|
|
Ok(serde_json::to_string_pretty(&sorted)?)
|
|
}
|
|
|
|
/// Add enum constraints to schema (copied from gen_schema.rs).
|
|
fn add_enum_constraints(value: &mut Value) {
|
|
if let Some(obj) = value.as_object_mut() {
|
|
if let Some(defs) = obj.get_mut("$defs").and_then(|v| v.as_object_mut()) {
|
|
// DiagnosticJson.severity
|
|
if let Some(diag) = defs.get_mut("DiagnosticJson").and_then(|v| v.as_object_mut()) {
|
|
if let Some(props) = diag.get_mut("properties").and_then(|v| v.as_object_mut()) {
|
|
if let Some(severity) = props.get_mut("severity").and_then(|v| v.as_object_mut()) {
|
|
severity.insert("enum".to_string(), Value::Array(vec![
|
|
Value::String("info".to_string()),
|
|
Value::String("warning".to_string()),
|
|
Value::String("error".to_string()),
|
|
Value::String("fatal".to_string()),
|
|
]));
|
|
}
|
|
}
|
|
}
|
|
|
|
// PageJson.page_type
|
|
if let Some(page) = defs.get_mut("PageJson").and_then(|v| v.as_object_mut()) {
|
|
if let Some(props) = page.get_mut("properties").and_then(|v| v.as_object_mut()) {
|
|
if let Some(page_type) = props.get_mut("type").and_then(|v| v.as_object_mut()) {
|
|
page_type.insert("enum".to_string(), Value::Array(vec![
|
|
Value::String("text".to_string()),
|
|
Value::String("scanned".to_string()),
|
|
Value::String("mixed".to_string()),
|
|
Value::String("broken_vector".to_string()),
|
|
Value::String("blank".to_string()),
|
|
Value::String("figure_only".to_string()),
|
|
]));
|
|
}
|
|
}
|
|
}
|
|
|
|
// SpanJson.confidence_source
|
|
if let Some(span) = defs.get_mut("SpanJson").and_then(|v| v.as_object_mut()) {
|
|
if let Some(props) = span.get_mut("properties").and_then(|v| v.as_object_mut()) {
|
|
if let Some(conf_src) = props.get_mut("confidence_source").and_then(|v| v.as_object_mut()) {
|
|
conf_src.insert("enum".to_string(), Value::Array(vec![
|
|
Value::String("native".to_string()),
|
|
Value::String("heuristic".to_string()),
|
|
Value::String("ocr".to_string()),
|
|
]));
|
|
}
|
|
}
|
|
}
|
|
|
|
// AttachmentJson.data contentEncoding
|
|
if let Some(attachment) = defs.get_mut("AttachmentJson").and_then(|v| v.as_object_mut()) {
|
|
if let Some(props) = attachment.get_mut("properties").and_then(|v| v.as_object_mut()) {
|
|
if let Some(data) = props.get_mut("data").and_then(|v| v.as_object_mut()) {
|
|
data.insert("contentEncoding".to_string(), Value::String("base64".to_string()));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Sort keys recursively (copied from gen_schema.rs).
|
|
fn sort_keys_recursive(value: Value) -> Value {
|
|
match value {
|
|
Value::Object(map) => {
|
|
let mut sorted = std::collections::BTreeMap::new();
|
|
for (k, v) in map {
|
|
sorted.insert(k, sort_keys_recursive(v));
|
|
}
|
|
Value::Object(sorted.into_iter().collect())
|
|
}
|
|
Value::Array(arr) => {
|
|
Value::Array(arr.into_iter().map(sort_keys_recursive).collect())
|
|
}
|
|
_ => value,
|
|
}
|
|
}
|
|
|
|
/// Generate a simple positional diff between two line sequences.
///
/// Lines are compared index by index (no alignment or LCS is attempted).
/// For every 1-based line number where the sequences disagree, one output
/// line is produced:
///
/// - `Line N: - 'old' + 'new'` when both sides have a line and they differ;
/// - `Line N: - 'old'` when only `old` has a line at that position;
/// - `Line N: + 'new'` when only `new` has a line at that position.
///
/// Returns an empty string when the sequences are identical.
fn diff_lines(old: &[&str], new: &[&str]) -> String {
    use std::fmt::Write as _;

    let mut output = String::new();

    for i in 0..old.len().max(new.len()) {
        let line_no = i + 1;
        // Formatting into a `String` cannot fail, so the `fmt::Result` is
        // intentionally discarded.
        let _ = match (old.get(i), new.get(i)) {
            (Some(o), Some(n)) if o != n => {
                writeln!(output, "Line {}: - '{}' + '{}'", line_no, o, n)
            }
            (Some(o), None) => writeln!(output, "Line {}: - '{}'", line_no, o),
            (None, Some(n)) => writeln!(output, "Line {}: + '{}'", line_no, n),
            _ => Ok(()),
        };
    }

    output
}
|
|
|
|
fn generate_profile_readme(profile_name: &str) -> Result<(), Box<dyn std::error::Error>> {
|
|
// Find the workspace root by looking for the parent directory's Cargo.toml
|
|
let workspace_root = find_workspace_root();
|
|
let profile_path = workspace_root
|
|
.join("profiles/builtin")
|
|
.join(profile_name)
|
|
.join("profile.yaml");
|
|
let readme_path = workspace_root
|
|
.join("profiles/builtin")
|
|
.join(profile_name)
|
|
.join("README.md");
|
|
|
|
if !profile_path.exists() {
|
|
return Err(format!("Profile YAML not found: {}", profile_path.display()).into());
|
|
}
|
|
|
|
let yaml_content = fs::read_to_string(&profile_path)?;
|
|
let profile: Profile = serde_yaml::from_str(&yaml_content)?;
|
|
|
|
let mut readme = String::new();
|
|
|
|
// Title and description
|
|
readme.push_str(&format!("# {} Profile\n\n", profile_name.to_uppercase()));
|
|
readme.push_str(&format!("{}\n\n", profile.description));
|
|
|
|
// Match Criteria Summary (placeholder for human to fill)
|
|
readme.push_str("## Match Criteria Summary\n\n");
|
|
readme.push_str("*This section describes the characteristics that cause a document to match this profile. The following signals are considered:*\n\n");
|
|
|
|
// Collect all text patterns and structural signals from any clause
|
|
let mut all_patterns: Vec<&String> = Vec::new();
|
|
let mut all_structural: Vec<String> = Vec::new();
|
|
|
|
for clause in &profile.r#match.any {
|
|
for pattern in &clause.text_patterns {
|
|
if !all_patterns.contains(&pattern) {
|
|
all_patterns.push(pattern);
|
|
}
|
|
}
|
|
for signal in &clause.structural {
|
|
let signal_str = format!("{:?}", signal);
|
|
if !all_structural.iter().any(|s| s == &signal_str) {
|
|
all_structural.push(signal_str);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Show first few patterns as examples
|
|
if !all_patterns.is_empty() {
|
|
let show_count = all_patterns.len().min(3);
|
|
readme.push_str("- **Text patterns**: ");
|
|
for (i, pattern) in all_patterns.iter().take(show_count).enumerate() {
|
|
if i > 0 {
|
|
readme.push_str(", ");
|
|
}
|
|
readme.push_str(&format!("`{}`", pattern));
|
|
}
|
|
if all_patterns.len() > show_count {
|
|
readme.push_str(&format!(" ({} more)", all_patterns.len() - show_count));
|
|
}
|
|
readme.push('\n');
|
|
}
|
|
|
|
if !all_structural.is_empty() {
|
|
let show_count = all_structural.len().min(3);
|
|
readme.push_str("- **Structural signals**: ");
|
|
for (i, signal) in all_structural.iter().take(show_count).enumerate() {
|
|
if i > 0 {
|
|
readme.push_str(", ");
|
|
}
|
|
readme.push_str(&format!("`{}`", signal));
|
|
}
|
|
if all_structural.len() > show_count {
|
|
readme.push_str(&format!(" ({} more)", all_structural.len() - show_count));
|
|
}
|
|
readme.push('\n');
|
|
}
|
|
|
|
readme.push_str(
|
|
"\n*Additional heuristics and confidence scoring are applied during classification.*\n\n",
|
|
);
|
|
|
|
// Extracted Fields
|
|
readme.push_str("## Extracted Fields\n\n");
|
|
readme.push_str("| Field | Type | Description | Example Value | Source Hint |\n");
|
|
readme.push_str("|-------|------|-------------|----------------|-------------|\n");
|
|
|
|
for (field_name, field) in &profile.profile_fields {
|
|
let description = "Extracted from page text using pattern matching".to_string();
|
|
let example = match field.field_type.as_str() {
|
|
"string" => "\"example value\"",
|
|
"decimal" => "123.45",
|
|
"date" => "2024-01-15",
|
|
"int" => "42",
|
|
"array" => "[...]",
|
|
_ => "N/A",
|
|
};
|
|
let mut source_parts = Vec::new();
|
|
if !field.extraction.patterns.is_empty() {
|
|
source_parts.push("regex patterns".to_string());
|
|
}
|
|
if let Some(ref hint) = field.extraction.region_hint {
|
|
source_parts.push(format!("region: {}", hint));
|
|
}
|
|
if let Some(ref table) = field.extraction.table_region {
|
|
source_parts.push(format!("table: {}", table));
|
|
}
|
|
if let Some(ref cols) = field.extraction.columnar_regions {
|
|
source_parts.push(format!("columns: {}", cols));
|
|
}
|
|
if field.extraction.per_page.unwrap_or(false) {
|
|
source_parts.push("per-page".to_string());
|
|
}
|
|
let source = if source_parts.is_empty() {
|
|
"profile YAML".to_string()
|
|
} else {
|
|
source_parts.join(", ")
|
|
};
|
|
readme.push_str(&format!(
|
|
"| {} | {} | {} | {} | {} |\n",
|
|
field_name, field.field_type, description, example, source
|
|
));
|
|
}
|
|
|
|
if profile.profile_fields.is_empty() {
|
|
readme.push_str("| *(none)* | - | *This profile has no field extractors* | - | - |\n");
|
|
}
|
|
|
|
readme.push('\n');
|
|
|
|
// Known Limitations
|
|
readme.push_str("## Known Limitations\n\n");
|
|
readme.push_str("*This section documents known edge cases and failure modes. Contributions to improve extraction quality are welcome.*\n\n");
|
|
readme.push_str("- *Document limitations and edge cases to be added by profile author*\n\n");
|
|
|
|
// Sample Input Pointer
|
|
readme.push_str("## Sample Input\n\n");
|
|
readme.push_str(&format!("Example fixtures demonstrating this profile are available in `tests/fixtures/profiles/{}/`.\n\n", profile_name));
|
|
readme.push_str("*See the classifier corpus for representative documents.*\n\n");
|
|
|
|
// Configuration Tips
|
|
readme.push_str("## Configuration Tips\n\n");
|
|
readme.push_str("To override this profile:\n\n");
|
|
readme.push_str("```bash\n");
|
|
readme.push_str(&format!(
|
|
"pdftract profiles export {} > my-profile.yaml\n",
|
|
profile_name
|
|
));
|
|
readme.push_str(
|
|
"# Edit my-profile.yaml to customize match criteria, fields, or extraction patterns\n",
|
|
);
|
|
readme.push_str("pdftract extract --profile my-profile.yaml document.pdf\n");
|
|
readme.push_str("```\n\n");
|
|
|
|
// Footer
|
|
readme.push_str("---\n\n*This README was auto-generated from `profile.yaml`. Update the Match Criteria Summary and Known Limitations sections with profile-specific guidance.*\n");
|
|
|
|
fs::write(&readme_path, readme)?;
|
|
println!(
|
|
"Generated README for {} at {}",
|
|
profile_name,
|
|
readme_path.display()
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Generate stress-test PDFs for memory ceiling testing
|
|
///
|
|
/// Creates large-page-count PDFs to validate memory targets:
|
|
/// - 100-page vector PDF for buffered mode testing (target: < 512 MB)
|
|
/// - 10,000-page stress test for streaming mode validation (target: < 256 MB)
|
|
fn generate_stress_pdfs() -> Result<(), Box<dyn std::error::Error>> {
|
|
println!("==========================================");
|
|
println!("Generating Stress-Test PDFs");
|
|
println!("==========================================");
|
|
|
|
let workspace_root = find_workspace_root();
|
|
let perf_dir = workspace_root.join("tests/fixtures/perf");
|
|
fs::create_dir_all(&perf_dir)?;
|
|
|
|
let configs = vec![
|
|
(
|
|
100,
|
|
"100-page-vector.pdf",
|
|
"Buffered mode stress test (512 MB budget)",
|
|
),
|
|
(
|
|
10000,
|
|
"10k-page.pdf",
|
|
"Streaming mode stress test (256 MB budget)",
|
|
),
|
|
];
|
|
|
|
for (num_pages, filename, description) in &configs {
|
|
println!("\nGenerating: {} ({} pages)", filename, num_pages);
|
|
println!(" Purpose: {}", description);
|
|
|
|
let output_path = perf_dir.join(filename);
|
|
generate_stress_pdf(&output_path, *num_pages)?;
|
|
}
|
|
|
|
println!("\n==========================================");
|
|
println!("Stress-Test PDF Generation Complete");
|
|
println!("==========================================");
|
|
println!("\nGenerated files:");
|
|
for (_, filename, _) in &configs {
|
|
let path = perf_dir.join(filename);
|
|
if path.exists() {
|
|
let metadata = fs::metadata(&path)?;
|
|
let size_mb = metadata.len() as f64 / 1024.0 / 1024.0;
|
|
println!(" - {} ({:.2} MB)", filename, size_mb);
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Generate a multi-page stress-test PDF
|
|
///
|
|
/// Creates a PDF with the specified number of pages for memory ceiling testing.
|
|
/// Uses a minimal approach with lopdf 0.34.
|
|
fn generate_stress_pdf(
|
|
output_path: &Path,
|
|
num_pages: usize,
|
|
) -> Result<(), Box<dyn std::error::Error>> {
|
|
use lopdf::{Dictionary, Document, Object, Stream};
|
|
|
|
let mut doc = Document::with_version("1.5");
|
|
|
|
// Pre-create fonts and resources that will be reused
|
|
let mut font_dict = Dictionary::new();
|
|
font_dict.set("Type", "Font");
|
|
font_dict.set("Subtype", "Type1");
|
|
font_dict.set("BaseFont", "Helvetica");
|
|
let font_id = doc.add_object(font_dict);
|
|
|
|
let mut resources = Dictionary::new();
|
|
let mut font_resources = Dictionary::new();
|
|
font_resources.set("F1", font_id);
|
|
resources.set("Font", font_resources);
|
|
|
|
// Create all page objects first
|
|
let mut page_ids = Vec::new();
|
|
let mediabox = Object::Array(vec![
|
|
Object::Real(0.0),
|
|
Object::Real(0.0),
|
|
Object::Real(612.0),
|
|
Object::Real(792.0),
|
|
]);
|
|
|
|
for page_num in 1..=num_pages {
|
|
// Create content stream for this page
|
|
let content_bytes = format!(
|
|
"BT /F1 12 Tf 72 720 Td (Page {} of {}) Tj ET",
|
|
page_num, num_pages
|
|
)
|
|
.into_bytes();
|
|
|
|
let mut content_dict = Dictionary::new();
|
|
content_dict.set("Length", content_bytes.len() as i32);
|
|
let content_stream = Stream::new(content_dict, content_bytes);
|
|
let content_id = doc.add_object(content_stream);
|
|
|
|
// Create page dictionary
|
|
let mut page_dict = Dictionary::new();
|
|
page_dict.set("Type", "Page");
|
|
page_dict.set("MediaBox", mediabox.clone());
|
|
page_dict.set("Contents", content_id);
|
|
page_dict.set("Resources", resources.clone());
|
|
|
|
let page_id = doc.add_object(page_dict);
|
|
page_ids.push(page_id);
|
|
}
|
|
|
|
// Create the Pages root dictionary (Pages tree)
|
|
let mut pages_dict = Dictionary::new();
|
|
pages_dict.set("Type", "Pages");
|
|
pages_dict.set("Count", Object::Integer(num_pages as i64));
|
|
pages_dict.set(
|
|
"Kids",
|
|
Object::Array(page_ids.iter().map(|&id| Object::Reference(id)).collect()),
|
|
);
|
|
|
|
let pages_id = doc.add_object(pages_dict);
|
|
|
|
// Set Parent reference for each page
|
|
for &page_id in &page_ids {
|
|
let page_obj = doc.get_object(page_id)?;
|
|
if let Ok(dict) = page_obj.as_dict() {
|
|
let mut updated_dict = dict.clone();
|
|
updated_dict.set("Parent", pages_id);
|
|
// Need to replace the object
|
|
let _ = doc
|
|
.objects
|
|
.insert(page_id, Object::Dictionary(updated_dict));
|
|
}
|
|
}
|
|
|
|
// Create the Catalog dictionary
|
|
let mut catalog_dict = Dictionary::new();
|
|
catalog_dict.set("Type", "Catalog");
|
|
catalog_dict.set("Pages", pages_id);
|
|
let catalog_id = doc.add_object(catalog_dict);
|
|
|
|
// Set the document's catalog ID directly
|
|
doc.trailer.set("Root", catalog_id);
|
|
|
|
// Save the document
|
|
doc.save(output_path)?;
|
|
|
|
let metadata = fs::metadata(output_path)?;
|
|
let size_mb = metadata.len() as f64 / 1024.0 / 1024.0;
|
|
println!(
|
|
" Generated: {} ({:.2} MB)",
|
|
output_path.file_name().unwrap().to_string_lossy(),
|
|
size_mb
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Memory budgets for the different document categories, in megabytes.
#[derive(Debug, Clone)]
struct MemoryBudget {
    /// Budget applied to the perf corpus in buffered mode (default 512 MB).
    pub buffered_100_page: usize,
    /// Budget applied to the perf corpus in streaming mode (default 256 MB).
    pub streaming_any: usize,
    /// Hard cap applied to the malformed corpus (default 1024 MB, i.e. 1 GB).
    pub adversarial_hard_cap: usize,
}

impl Default for MemoryBudget {
    /// Returns the default budgets: 512 MB buffered, 256 MB streaming,
    /// 1024 MB adversarial hard cap.
    fn default() -> Self {
        MemoryBudget {
            adversarial_hard_cap: 1024,
            streaming_any: 256,
            buffered_100_page: 512,
        }
    }
}
|
|
|
|
#[derive(Debug, Serialize)]
|
|
struct MemoryMeasurement {
|
|
pub peak_rss_mb: usize,
|
|
pub duration_ms: u128,
|
|
pub succeeded: bool,
|
|
pub error_message: Option<String>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
struct MemoryTestResult {
|
|
pub file_name: String,
|
|
pub category: String, // "buffered", "streaming", "adversarial"
|
|
pub peak_rss_mb: usize,
|
|
pub duration_ms: u128,
|
|
pub budget_mb: usize,
|
|
pub passed: bool,
|
|
pub error_message: Option<String>,
|
|
}
|
|
|
|
#[derive(Debug, Serialize)]
|
|
struct MemoryReport {
|
|
pub timestamp: String,
|
|
pub commit_sha: Option<String>,
|
|
pub budgets: MemoryBudgetJson,
|
|
pub results: Vec<MemoryTestResult>,
|
|
pub summary: MemorySummary,
|
|
}
|
|
|
|
#[derive(Debug, Serialize)]
|
|
struct MemoryBudgetJson {
|
|
pub buffered_100_page_mb: usize,
|
|
pub streaming_any_mb: usize,
|
|
pub adversarial_hard_cap_mb: usize,
|
|
}
|
|
|
|
#[derive(Debug, Serialize)]
|
|
struct MemorySummary {
|
|
pub total_tests: usize,
|
|
pub passed: usize,
|
|
pub failed: usize,
|
|
pub all_passed: bool,
|
|
}
|
|
|
|
/// Run memory ceiling tests against perf and malformed corpora
|
|
///
|
|
/// This enforces the Tier-1 Memory targets from the plan:
|
|
/// - Peak RSS, 100-page vector PDF (buffered mode) < 512 MB
|
|
/// - Peak RSS, streaming/NDJSON mode < 256 MB
|
|
/// - Peak RSS, adversarial fixtures < 1 GB hard ceiling
|
|
///
|
|
/// Analogous to cargo-bloat for memory usage: fails the build if any
|
|
/// document exceeds its budget.
|
|
///
|
|
/// Generates memory-report.json artifact for CI historical tracking.
|
|
fn run_memory_ceiling_tests() -> Result<(), Box<dyn std::error::Error>> {
|
|
println!("==========================================");
|
|
println!("Memory Ceiling Tests");
|
|
println!("==========================================");
|
|
|
|
let budgets = MemoryBudget::default();
|
|
let workspace_root = find_workspace_root();
|
|
let perf_dir = workspace_root.join("tests/fixtures/perf");
|
|
let malformed_dir = workspace_root.join("tests/fixtures/malformed");
|
|
|
|
println!("\nMemory budgets:");
|
|
println!(" - Buffered 100-page: {} MB", budgets.buffered_100_page);
|
|
println!(" - Streaming mode: {} MB", budgets.streaming_any);
|
|
println!(
|
|
" - Adversarial hard cap: {} MB",
|
|
budgets.adversarial_hard_cap
|
|
);
|
|
|
|
// Build pdftract binary first
|
|
println!("\n=== Building pdftract for testing ===");
|
|
let build_status = Command::new("cargo")
|
|
.args(["build", "--release", "--bin", "pdftract", "--locked"])
|
|
.current_dir(&workspace_root)
|
|
.stdout(Stdio::inherit())
|
|
.stderr(Stdio::inherit())
|
|
.status()?;
|
|
|
|
if !build_status.success() {
|
|
return Err("Failed to build pdftract binary".into());
|
|
}
|
|
|
|
let binary_path = workspace_root.join("target/release/pdftract");
|
|
if !binary_path.exists() {
|
|
return Err(format!("pdftract binary not found at {}", binary_path.display()).into());
|
|
}
|
|
|
|
println!("Binary: {}", binary_path.display());
|
|
|
|
let mut all_results = Vec::new();
|
|
let mut all_passed = true;
|
|
|
|
// Test 1: Perf corpus - buffered mode (512 MB budget)
|
|
println!(
|
|
"\n=== Testing perf corpus (buffered mode, budget: {} MB) ===",
|
|
budgets.buffered_100_page
|
|
);
|
|
|
|
if perf_dir.exists() {
|
|
for entry in fs::read_dir(&perf_dir)? {
|
|
let entry = entry?;
|
|
let path = entry.path();
|
|
|
|
if path.extension().and_then(|s| s.to_str()) != Some("pdf") {
|
|
continue;
|
|
}
|
|
|
|
let file_name = path.file_name().unwrap().to_string_lossy().to_string();
|
|
print!(" [buffered] {} ... ", file_name);
|
|
|
|
match measure_extraction(&binary_path, &path, &budgets, false) {
|
|
Ok(measurement) => {
|
|
let passed = measurement.peak_rss_mb <= budgets.buffered_100_page;
|
|
if passed {
|
|
println!(
|
|
"PASS ({} MB, {} ms)",
|
|
measurement.peak_rss_mb, measurement.duration_ms
|
|
);
|
|
} else {
|
|
println!(
|
|
"FAIL ({} MB > {} MB)",
|
|
measurement.peak_rss_mb, budgets.buffered_100_page
|
|
);
|
|
all_passed = false;
|
|
}
|
|
all_results.push(MemoryTestResult {
|
|
file_name: file_name.clone(),
|
|
category: "buffered".to_string(),
|
|
peak_rss_mb: measurement.peak_rss_mb,
|
|
duration_ms: measurement.duration_ms,
|
|
budget_mb: budgets.buffered_100_page,
|
|
passed,
|
|
error_message: measurement.error_message,
|
|
});
|
|
}
|
|
Err(e) => {
|
|
println!("ERROR ({})", e);
|
|
all_passed = false;
|
|
all_results.push(MemoryTestResult {
|
|
file_name: file_name.clone(),
|
|
category: "buffered".to_string(),
|
|
peak_rss_mb: 0,
|
|
duration_ms: 0,
|
|
budget_mb: budgets.buffered_100_page,
|
|
passed: false,
|
|
error_message: Some(e.to_string()),
|
|
});
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
println!(" (no perf directory)");
|
|
}
|
|
|
|
// Test 2: Perf corpus - streaming mode (256 MB budget)
|
|
println!(
|
|
"\n=== Testing perf corpus (streaming mode, budget: {} MB) ===",
|
|
budgets.streaming_any
|
|
);
|
|
|
|
if perf_dir.exists() {
|
|
for entry in fs::read_dir(&perf_dir)? {
|
|
let entry = entry?;
|
|
let path = entry.path();
|
|
|
|
if path.extension().and_then(|s| s.to_str()) != Some("pdf") {
|
|
continue;
|
|
}
|
|
|
|
let file_name = path.file_name().unwrap().to_string_lossy().to_string();
|
|
print!(" [streaming] {} ... ", file_name);
|
|
|
|
match measure_extraction(&binary_path, &path, &budgets, true) {
|
|
Ok(measurement) => {
|
|
let passed = measurement.peak_rss_mb <= budgets.streaming_any;
|
|
if passed {
|
|
println!(
|
|
"PASS ({} MB, {} ms)",
|
|
measurement.peak_rss_mb, measurement.duration_ms
|
|
);
|
|
} else {
|
|
println!(
|
|
"FAIL ({} MB > {} MB)",
|
|
measurement.peak_rss_mb, budgets.streaming_any
|
|
);
|
|
all_passed = false;
|
|
}
|
|
all_results.push(MemoryTestResult {
|
|
file_name: file_name.clone(),
|
|
category: "streaming".to_string(),
|
|
peak_rss_mb: measurement.peak_rss_mb,
|
|
duration_ms: measurement.duration_ms,
|
|
budget_mb: budgets.streaming_any,
|
|
passed,
|
|
error_message: measurement.error_message,
|
|
});
|
|
}
|
|
Err(e) => {
|
|
println!("ERROR ({})", e);
|
|
all_passed = false;
|
|
all_results.push(MemoryTestResult {
|
|
file_name: file_name.clone(),
|
|
category: "streaming".to_string(),
|
|
peak_rss_mb: 0,
|
|
duration_ms: 0,
|
|
budget_mb: budgets.streaming_any,
|
|
passed: false,
|
|
error_message: Some(e.to_string()),
|
|
});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Test 3: Malformed corpus - adversarial hard cap (1 GB budget)
|
|
println!(
|
|
"\n=== Testing malformed corpus (adversarial hard cap: {} MB) ===",
|
|
budgets.adversarial_hard_cap
|
|
);
|
|
|
|
if malformed_dir.exists() {
|
|
for entry in fs::read_dir(&malformed_dir)? {
|
|
let entry = entry?;
|
|
let path = entry.path();
|
|
|
|
if path.extension().and_then(|s| s.to_str()) != Some("pdf")
|
|
&& path.extension().and_then(|s| s.to_str()) != Some("bin")
|
|
{
|
|
continue;
|
|
}
|
|
|
|
let file_name = path.file_name().unwrap().to_string_lossy().to_string();
|
|
print!(" [adversarial] {} ... ", file_name);
|
|
|
|
match measure_extraction(&binary_path, &path, &budgets, false) {
|
|
Ok(measurement) => {
|
|
let passed = measurement.peak_rss_mb <= budgets.adversarial_hard_cap;
|
|
if passed {
|
|
println!(
|
|
"PASS ({} MB, {} ms)",
|
|
measurement.peak_rss_mb, measurement.duration_ms
|
|
);
|
|
} else {
|
|
println!(
|
|
"FAIL ({} MB > {} MB)",
|
|
measurement.peak_rss_mb, budgets.adversarial_hard_cap
|
|
);
|
|
all_passed = false;
|
|
}
|
|
all_results.push(MemoryTestResult {
|
|
file_name: file_name.clone(),
|
|
category: "adversarial".to_string(),
|
|
peak_rss_mb: measurement.peak_rss_mb,
|
|
duration_ms: measurement.duration_ms,
|
|
budget_mb: budgets.adversarial_hard_cap,
|
|
passed,
|
|
error_message: measurement.error_message,
|
|
});
|
|
}
|
|
Err(e) => {
|
|
println!("ERROR ({})", e);
|
|
all_passed = false;
|
|
all_results.push(MemoryTestResult {
|
|
file_name: file_name.clone(),
|
|
category: "adversarial".to_string(),
|
|
peak_rss_mb: 0,
|
|
duration_ms: 0,
|
|
budget_mb: budgets.adversarial_hard_cap,
|
|
passed: false,
|
|
error_message: Some(e.to_string()),
|
|
});
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
println!(" (no malformed directory)");
|
|
}
|
|
|
|
// Print summary
|
|
println!("\n==========================================");
|
|
println!("Memory Ceiling Summary");
|
|
println!("==========================================");
|
|
|
|
let passed_count = all_results.iter().filter(|r| r.passed).count();
|
|
let total_count = all_results.len();
|
|
|
|
println!("Passed: {}/{}", passed_count, total_count);
|
|
|
|
if !all_passed {
|
|
println!("\nFailed documents:");
|
|
for result in &all_results {
|
|
if !result.passed {
|
|
if result.peak_rss_mb > 0 {
|
|
println!(
|
|
" - [{}] {} ({} MB > {} MB)",
|
|
result.category, result.file_name, result.peak_rss_mb, result.budget_mb
|
|
);
|
|
} else {
|
|
println!(
|
|
" - [{}] {} (error: {})",
|
|
result.category,
|
|
result.file_name,
|
|
result.error_message.as_deref().unwrap_or("unknown")
|
|
);
|
|
}
|
|
}
|
|
}
|
|
println!("\nMemory ceiling gate FAILED!");
|
|
return Err("Memory ceiling exceeded".into());
|
|
}
|
|
|
|
println!("\nMemory ceiling gate PASSED!");
|
|
|
|
// Generate JSON report
|
|
let report = MemoryReport {
|
|
timestamp: format!(
|
|
"{}",
|
|
humantime::format_rfc3339_seconds(std::time::SystemTime::now())
|
|
),
|
|
commit_sha: get_commit_sha()?,
|
|
budgets: MemoryBudgetJson {
|
|
buffered_100_page_mb: budgets.buffered_100_page,
|
|
streaming_any_mb: budgets.streaming_any,
|
|
adversarial_hard_cap_mb: budgets.adversarial_hard_cap,
|
|
},
|
|
results: all_results.clone(),
|
|
summary: MemorySummary {
|
|
total_tests: total_count,
|
|
passed: passed_count,
|
|
failed: total_count - passed_count,
|
|
all_passed,
|
|
},
|
|
};
|
|
|
|
let report_path = workspace_root.join("memory-report.json");
|
|
fs::write(&report_path, serde_json::to_string_pretty(&report)?)?;
|
|
println!("\nReport written to: {}", report_path.display());
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Get the current git commit SHA
|
|
fn get_commit_sha() -> Result<Option<String>, Box<dyn std::error::Error>> {
|
|
let workspace_root = find_workspace_root();
|
|
let output = Command::new("git")
|
|
.args(["rev-parse", "HEAD"])
|
|
.current_dir(&workspace_root)
|
|
.output()?;
|
|
|
|
if output.status.success() {
|
|
let sha = String::from_utf8_lossy(&output.stdout).trim().to_string();
|
|
Ok(Some(sha))
|
|
} else {
|
|
Ok(None)
|
|
}
|
|
}
|
|
|
|
/// Measure memory usage during extraction of a PDF file
|
|
///
|
|
/// Uses Linux-specific /proc/[pid]/status to sample peak RSS.
|
|
/// Falls back to time measurement if RSS sampling is unavailable.
|
|
///
|
|
/// # Arguments
|
|
/// * `binary_path` - Path to the pdftract binary
|
|
/// * `pdf_path` - Path to the PDF file to extract
|
|
/// * `budgets` - Memory budgets (unused but kept for compatibility)
|
|
/// * `streaming` - If true, use streaming/text mode for lower memory; otherwise buffered JSON mode
|
|
fn measure_extraction(
|
|
binary_path: &Path,
|
|
pdf_path: &Path,
|
|
_budgets: &MemoryBudget,
|
|
streaming: bool,
|
|
) -> Result<MemoryMeasurement, Box<dyn std::error::Error>> {
|
|
let start = Instant::now();
|
|
|
|
// Spawn the extraction process and measure its peak RSS
|
|
#[cfg(target_os = "linux")]
|
|
{
|
|
use std::os::unix::process::CommandExt;
|
|
|
|
let mut cmd = Command::new(binary_path);
|
|
|
|
if streaming {
|
|
// Streaming mode: use --format text for lower memory footprint
|
|
// Note: --format ndjson is not yet exposed in CLI (Phase 6.2)
|
|
// Using text format as a reasonable proxy for streaming memory behavior
|
|
cmd.arg("extract").arg("--format").arg("text");
|
|
} else {
|
|
// Buffered mode: use --format json for full document buffering
|
|
cmd.arg("extract").arg("--format").arg("json");
|
|
}
|
|
|
|
cmd.arg(pdf_path)
|
|
.stdout(Stdio::null())
|
|
.stderr(Stdio::piped())
|
|
.process_group(0);
|
|
|
|
let mut child = cmd.spawn()?;
|
|
|
|
let pid = child.id();
|
|
let mut peak_rss_kb = 0usize;
|
|
|
|
// Sample RSS every 10ms while process runs
|
|
let sample_interval = Duration::from_millis(10);
|
|
loop {
|
|
// Try to wait for the process (non-blocking)
|
|
match child.try_wait() {
|
|
Ok(Some(status)) => {
|
|
// Process has exited
|
|
let duration = start.elapsed();
|
|
|
|
// Capture stderr for error messages
|
|
let stderr_output = if let Some(mut stderr) = child.stderr {
|
|
let mut error_text = String::new();
|
|
use std::io::Read;
|
|
let _ = stderr.read_to_string(&mut error_text);
|
|
error_text
|
|
} else {
|
|
String::new()
|
|
};
|
|
|
|
// Trim error text and use it if non-empty
|
|
let error_message = if !status.success() {
|
|
if !stderr_output.is_empty() {
|
|
Some(stderr_output.trim().to_string())
|
|
} else {
|
|
Some(format!("exit code: {:?}", status.code()))
|
|
}
|
|
} else {
|
|
None
|
|
};
|
|
|
|
return Ok(MemoryMeasurement {
|
|
peak_rss_mb: peak_rss_kb / 1024,
|
|
duration_ms: duration.as_millis(),
|
|
succeeded: status.success(),
|
|
error_message,
|
|
});
|
|
}
|
|
Ok(None) => {
|
|
// Process still running, sample RSS
|
|
if let Ok(rss_kb) = sample_rss(pid) {
|
|
peak_rss_kb = peak_rss_kb.max(rss_kb);
|
|
}
|
|
std::thread::sleep(sample_interval);
|
|
}
|
|
Err(e) => {
|
|
return Err(format!("Failed to wait for process: {}", e).into());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fallback for non-Linux platforms
|
|
#[cfg(not(target_os = "linux"))]
|
|
{
|
|
let mut cmd = Command::new(binary_path);
|
|
|
|
if streaming {
|
|
cmd.arg("extract").arg("--format").arg("text");
|
|
} else {
|
|
cmd.arg("extract").arg("--format").arg("json");
|
|
}
|
|
|
|
cmd.arg(pdf_path)
|
|
.stdout(Stdio::null())
|
|
.stderr(Stdio::piped());
|
|
|
|
let output = cmd.output()?;
|
|
|
|
let duration = start.elapsed();
|
|
|
|
Ok(MemoryMeasurement {
|
|
peak_rss_mb: 0, // Cannot measure on this platform
|
|
duration_ms: duration.as_millis(),
|
|
succeeded: output.status.success(),
|
|
error_message: if !output.status.success() {
|
|
Some(format!("exit code: {:?}", output.status.code()))
|
|
} else {
|
|
None
|
|
},
|
|
})
|
|
}
|
|
}
|
|
|
|
/// Read the current resident set size (RSS) of process `pid`, in kilobytes.
///
/// Scans `/proc/<pid>/status` for a line starting with `VmRSS:` and parses
/// the second whitespace-separated field of that line as the value.
///
/// # Errors
/// Returns an error if the status file cannot be read, if the value is not a
/// valid `usize`, or if no usable `VmRSS:` line is present.
#[cfg(target_os = "linux")]
fn sample_rss(pid: u32) -> Result<usize, Box<dyn std::error::Error>> {
    let proc_status = fs::read_to_string(format!("/proc/{}/status", pid))?;

    // A matching line looks like `VmRSS: 12345 kB`; field 1 is the number.
    let rss_field = proc_status
        .lines()
        .filter(|line| line.starts_with("VmRSS:"))
        .find_map(|line| line.split_whitespace().nth(1));

    match rss_field {
        Some(field) => Ok(field.parse::<usize>()?),
        None => Err("VmRSS not found in /proc status".into()),
    }
}
|
|
|
|
/// Generate page classification test fixtures
|
|
///
|
|
/// Creates 4 fixture types for testing page classification:
|
|
/// - vector_pure: Pure text PDF (born-digital)
|
|
/// - scanned_single: Image-only PDF (scanned page)
|
|
/// - brokenvector_pdfa: Invisible text layer over scanned image
|
|
/// - hybrid_header_body: Text header + scanned body
|
|
fn generate_page_class_fixtures() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
|
println!("==========================================");
|
|
println!("Generating Page Classification Fixtures");
|
|
println!("==========================================");
|
|
|
|
let workspace_root = find_workspace_root();
|
|
let fixtures_dir = workspace_root.join("tests/fixtures/page_class");
|
|
fs::create_dir_all(&fixtures_dir)?;
|
|
|
|
// 1. Vector pure: Born-digital text PDF
|
|
println!("\n1. Generating vector_pure fixture...");
|
|
let vector_dir = fixtures_dir.join("vector_pure");
|
|
fs::create_dir_all(&vector_dir)?;
|
|
generate_vector_pure_pdf(&vector_dir)?;
|
|
|
|
// 2. Scanned single: Image-only PDF
|
|
println!("2. Generating scanned_single fixture...");
|
|
let scanned_dir = fixtures_dir.join("scanned_single");
|
|
fs::create_dir_all(&scanned_dir)?;
|
|
generate_scanned_single_pdf(&scanned_dir)?;
|
|
|
|
// 3. BrokenVector: Invisible text + image
|
|
println!("3. Generating brokenvector_pdfa fixture...");
|
|
let broken_dir = fixtures_dir.join("brokenvector_pdfa");
|
|
fs::create_dir_all(&broken_dir)?;
|
|
generate_brokenvector_pdf(&broken_dir)?;
|
|
|
|
// 4. Hybrid: Text header + scanned body
|
|
println!("4. Generating hybrid_header_body fixture...");
|
|
let hybrid_dir = fixtures_dir.join("hybrid_header_body");
|
|
fs::create_dir_all(&hybrid_dir)?;
|
|
generate_hybrid_pdf(&hybrid_dir)?;
|
|
|
|
println!("\n==========================================");
|
|
println!("Page Classification Fixtures Generated");
|
|
println!("==========================================");
|
|
|
|
// Print sizes
|
|
for fixture_name in &[
|
|
"vector_pure",
|
|
"scanned_single",
|
|
"brokenvector_pdfa",
|
|
"hybrid_header_body",
|
|
] {
|
|
let fixture_dir = fixtures_dir.join(fixture_name);
|
|
let pdf_path = fixture_dir.join("source.pdf");
|
|
if let Ok(metadata) = fs::metadata(&pdf_path) {
|
|
let size_kb = metadata.len() as f64 / 1024.0;
|
|
println!(" - {}/source.pdf: {:.2} KB", fixture_name, size_kb);
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Generate a pure vector PDF (born-digital text)
|
|
fn generate_vector_pure_pdf(dir: &Path) -> Result<(), Box<dyn std::error::Error>> {
|
|
use lopdf::{Dictionary, Document, Object, Stream};
|
|
|
|
let mut doc = Document::with_version("1.5");
|
|
|
|
// Create font
|
|
let mut font_dict = Dictionary::new();
|
|
font_dict.set("Type", "Font");
|
|
font_dict.set("Subtype", "Type1");
|
|
font_dict.set("BaseFont", "Helvetica");
|
|
let font_id = doc.add_object(font_dict);
|
|
|
|
// Resources
|
|
let mut resources = Dictionary::new();
|
|
let mut font_resources = Dictionary::new();
|
|
font_resources.set("F1", font_id);
|
|
resources.set("Font", font_resources);
|
|
|
|
// Content stream: Multiple lines of text with high character count
|
|
let content_text = r#"
|
|
BT /F1 12 Tf 50 750 Td
|
|
(This is a born-digital PDF with pure vector text.) Tj
|
|
0 -15 Td (It contains multiple text operators and high character validity.) Tj
|
|
0 -15 Td (The classification should detect this as a Vector page.) Tj
|
|
0 -15 Td (Lorem ipsum dolor sit amet, consectetur adipiscing elit.) Tj
|
|
0 -15 Td (Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.) Tj
|
|
0 -15 Td (Ut enim ad minim veniam, quis nostrud exercitation ullamco.) Tj
|
|
0 -15 Td (Duis aute irure dolor in reprehenderit in voluptate velit esse.) Tj
|
|
0 -15 Td (Excepteur sint occaecat cupidatat non proident sunt in culpa.) Tj
|
|
ET
|
|
"#;
|
|
|
|
let content_bytes = content_text.as_bytes();
|
|
let mut content_dict = Dictionary::new();
|
|
content_dict.set("Length", content_bytes.len() as i32);
|
|
let content_stream = Stream::new(content_dict, content_bytes.to_vec());
|
|
let content_id = doc.add_object(content_stream);
|
|
|
|
// Page dictionary
|
|
let page_dict = dictionary! {
|
|
"Type" => "Page",
|
|
"MediaBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()],
|
|
"Contents" => content_id,
|
|
"Resources" => resources,
|
|
"CropBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()],
|
|
};
|
|
let page_id = doc.add_object(page_dict);
|
|
|
|
// Pages tree
|
|
let pages_id = doc.add_object(dictionary! {
|
|
"Type" => "Pages",
|
|
"Count" => 1,
|
|
"Kids" => vec![page_id.into()],
|
|
});
|
|
|
|
// Update page with parent reference
|
|
let mut page_obj = doc.get_object(page_id)?.as_dict().cloned()?;
|
|
page_obj.set("Parent", pages_id);
|
|
doc.objects.insert(page_id, Object::Dictionary(page_obj));
|
|
|
|
// Catalog
|
|
let catalog_id = doc.add_object(dictionary! {
|
|
"Type" => "Catalog",
|
|
"Pages" => pages_id,
|
|
});
|
|
doc.trailer.set("Root", catalog_id);
|
|
|
|
// Save PDF
|
|
let pdf_path = dir.join("source.pdf");
|
|
doc.save(&pdf_path)?;
|
|
|
|
// Generate expected.json
|
|
let expected = PageClassExpected {
|
|
class: "Vector".to_string(),
|
|
confidence_min: 0.90,
|
|
hybrid_cells: None,
|
|
};
|
|
let json_path = dir.join("expected.json");
|
|
fs::write(&json_path, serde_json::to_string_pretty(&expected)?)?;
|
|
|
|
println!(
|
|
" Created: {}/source.pdf ({:.2} KB)",
|
|
dir.file_name().unwrap().to_string_lossy(),
|
|
fs::metadata(&pdf_path)?.len() as f64 / 1024.0
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Generate an image-only scanned PDF
|
|
fn generate_scanned_single_pdf(dir: &Path) -> Result<(), Box<dyn std::error::Error>> {
|
|
use lopdf::{Dictionary, Document, Object, Stream};
|
|
|
|
let mut doc = Document::with_version("1.5");
|
|
|
|
// Create a simple 1x1 pixel white image (minimal image object)
|
|
let image_data = vec![0u8; 4]; // 1x1 white pixel in RGB
|
|
let image_stream = Stream::new(
|
|
dictionary! {
|
|
"Type" => "XObject",
|
|
"Subtype" => "Image",
|
|
"Width" => 1,
|
|
"Height" => 1,
|
|
"BitsPerComponent" => 8,
|
|
"ColorSpace" => "DeviceRGB",
|
|
"Length" => image_data.len() as i32,
|
|
},
|
|
image_data,
|
|
);
|
|
let image_id = doc.add_object(image_stream);
|
|
|
|
// Resources with image
|
|
let mut resources = Dictionary::new();
|
|
let mut xobject = Dictionary::new();
|
|
xobject.set("Im1", image_id);
|
|
resources.set("XObject", xobject);
|
|
|
|
// Content stream: Draw image covering most of the page
|
|
let content_text = r#"
|
|
q 612 792 scale
|
|
/Im1 Do
|
|
Q
|
|
"#;
|
|
|
|
let content_bytes = content_text.as_bytes();
|
|
let mut content_dict = Dictionary::new();
|
|
content_dict.set("Length", content_bytes.len() as i32);
|
|
let content_stream = Stream::new(content_dict, content_bytes.to_vec());
|
|
let content_id = doc.add_object(content_stream);
|
|
|
|
// Page dictionary
|
|
let page_dict = dictionary! {
|
|
"Type" => "Page",
|
|
"MediaBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()],
|
|
"Contents" => content_id,
|
|
"Resources" => resources,
|
|
};
|
|
let page_id = doc.add_object(page_dict);
|
|
|
|
// Pages tree
|
|
let pages_id = doc.add_object(dictionary! {
|
|
"Type" => "Pages",
|
|
"Count" => 1,
|
|
"Kids" => vec![page_id.into()],
|
|
});
|
|
|
|
// Update page with parent reference
|
|
let mut page_obj = doc.get_object(page_id)?.as_dict().cloned()?;
|
|
page_obj.set("Parent", pages_id);
|
|
doc.objects.insert(page_id, Object::Dictionary(page_obj));
|
|
|
|
// Catalog
|
|
let catalog_id = doc.add_object(dictionary! {
|
|
"Type" => "Catalog",
|
|
"Pages" => pages_id,
|
|
});
|
|
doc.trailer.set("Root", catalog_id);
|
|
|
|
// Save PDF
|
|
let pdf_path = dir.join("source.pdf");
|
|
doc.save(&pdf_path)?;
|
|
|
|
// Generate expected.json
|
|
let expected = PageClassExpected {
|
|
class: "Scanned".to_string(),
|
|
confidence_min: 0.90,
|
|
hybrid_cells: None,
|
|
};
|
|
let json_path = dir.join("expected.json");
|
|
fs::write(&json_path, serde_json::to_string_pretty(&expected)?)?;
|
|
|
|
println!(
|
|
" Created: {}/source.pdf ({:.2} KB)",
|
|
dir.file_name().unwrap().to_string_lossy(),
|
|
fs::metadata(&pdf_path)?.len() as f64 / 1024.0
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Generate a BrokenVector PDF (invisible text + image)
|
|
fn generate_brokenvector_pdf(dir: &Path) -> Result<(), Box<dyn std::error::Error>> {
|
|
use lopdf::{Dictionary, Document, Object, Stream};
|
|
|
|
let mut doc = Document::with_version("1.5");
|
|
|
|
// Create font
|
|
let mut font_dict = Dictionary::new();
|
|
font_dict.set("Type", "Font");
|
|
font_dict.set("Subtype", "Type1");
|
|
font_dict.set("BaseFont", "Helvetica");
|
|
let font_id = doc.add_object(font_dict);
|
|
|
|
// Create a 1x1 white pixel image
|
|
let image_data = vec![255u8; 4];
|
|
let image_stream = Stream::new(
|
|
dictionary! {
|
|
"Type" => "XObject",
|
|
"Subtype" => "Image",
|
|
"Width" => 1,
|
|
"Height" => 1,
|
|
"BitsPerComponent" => 8,
|
|
"ColorSpace" => "DeviceRGB",
|
|
"Length" => image_data.len() as i32,
|
|
},
|
|
image_data,
|
|
);
|
|
let image_id = doc.add_object(image_stream);
|
|
|
|
// Resources
|
|
let mut resources = Dictionary::new();
|
|
let mut font_resources = Dictionary::new();
|
|
font_resources.set("F1", font_id);
|
|
resources.set("Font", font_resources);
|
|
let mut xobject = Dictionary::new();
|
|
xobject.set("Im1", image_id);
|
|
resources.set("XObject", xobject);
|
|
|
|
// Content stream: Invisible text (Tr=3) + full-page image
|
|
// The text is there but invisible, simulating a bad OCR overlay
|
|
let content_text = r#"
|
|
BT /F1 12 Tf 50 750 Td 3 Tr
|
|
(This text is invisible Tr=3 overlay over scanned image.) Tj
|
|
0 -15 Td (It represents a broken vector PDF with bad OCR layer.) Tj
|
|
0 -15 Td (Classification should detect this as BrokenVector.) Tj
|
|
ET
|
|
q 612 792 scale
|
|
/Im1 Do
|
|
Q
|
|
"#;
|
|
|
|
let content_bytes = content_text.as_bytes();
|
|
let mut content_dict = Dictionary::new();
|
|
content_dict.set("Length", content_bytes.len() as i32);
|
|
let content_stream = Stream::new(content_dict, content_bytes.to_vec());
|
|
let content_id = doc.add_object(content_stream);
|
|
|
|
// Page dictionary
|
|
let page_dict = dictionary! {
|
|
"Type" => "Page",
|
|
"MediaBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()],
|
|
"Contents" => content_id,
|
|
"Resources" => resources,
|
|
};
|
|
let page_id = doc.add_object(page_dict);
|
|
|
|
// Pages tree
|
|
let pages_id = doc.add_object(dictionary! {
|
|
"Type" => "Pages",
|
|
"Count" => 1,
|
|
"Kids" => vec![page_id.into()],
|
|
});
|
|
|
|
// Update page with parent reference
|
|
let mut page_obj = doc.get_object(page_id)?.as_dict().cloned()?;
|
|
page_obj.set("Parent", pages_id);
|
|
doc.objects.insert(page_id, Object::Dictionary(page_obj));
|
|
|
|
// Catalog
|
|
let catalog_id = doc.add_object(dictionary! {
|
|
"Type" => "Catalog",
|
|
"Pages" => pages_id,
|
|
});
|
|
doc.trailer.set("Root", catalog_id);
|
|
|
|
// Save PDF
|
|
let pdf_path = dir.join("source.pdf");
|
|
doc.save(&pdf_path)?;
|
|
|
|
// Generate expected.json
|
|
let expected = PageClassExpected {
|
|
class: "BrokenVector".to_string(),
|
|
confidence_min: 0.90,
|
|
hybrid_cells: None,
|
|
};
|
|
let json_path = dir.join("expected.json");
|
|
fs::write(&json_path, serde_json::to_string_pretty(&expected)?)?;
|
|
|
|
println!(
|
|
" Created: {}/source.pdf ({:.2} KB)",
|
|
dir.file_name().unwrap().to_string_lossy(),
|
|
fs::metadata(&pdf_path)?.len() as f64 / 1024.0
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Generate a Hybrid PDF (text header + scanned body)
|
|
fn generate_hybrid_pdf(dir: &Path) -> Result<(), Box<dyn std::error::Error>> {
|
|
use lopdf::{Dictionary, Document, Object, Stream};
|
|
|
|
let mut doc = Document::with_version("1.5");
|
|
|
|
// Create font
|
|
let mut font_dict = Dictionary::new();
|
|
font_dict.set("Type", "Font");
|
|
font_dict.set("Subtype", "Type1");
|
|
font_dict.set("BaseFont", "Helvetica");
|
|
let font_id = doc.add_object(font_dict);
|
|
|
|
// Create a 1x1 white pixel image for the body
|
|
let image_data = vec![255u8; 4];
|
|
let image_stream = Stream::new(
|
|
dictionary! {
|
|
"Type" => "XObject",
|
|
"Subtype" => "Image",
|
|
"Width" => 1,
|
|
"Height" => 1,
|
|
"BitsPerComponent" => 8,
|
|
"ColorSpace" => "DeviceRGB",
|
|
"Length" => image_data.len() as i32,
|
|
},
|
|
image_data,
|
|
);
|
|
let image_id = doc.add_object(image_stream);
|
|
|
|
// Resources
|
|
let mut resources = Dictionary::new();
|
|
let mut font_resources = Dictionary::new();
|
|
font_resources.set("F1", font_id);
|
|
resources.set("Font", font_resources);
|
|
let mut xobject = Dictionary::new();
|
|
xobject.set("Im1", image_id);
|
|
resources.set("XObject", xobject);
|
|
|
|
// Content stream: Text header (top 25%) + image body (bottom 75%)
|
|
// Header: visible text in the top portion
|
|
// Body: image covering the bottom portion
|
|
let content_text = r#"
|
|
BT /F1 14 Tf 50 750 Td
|
|
(This is a HYBRID document with vector text header) Tj
|
|
0 -20 Td (The header contains selectable text) Tj
|
|
0 -20 Td (Below this header is a scanned image body) Tj
|
|
ET
|
|
q
|
|
0 0 612 560 re W n
|
|
612 792 scale
|
|
/Im1 Do
|
|
Q
|
|
"#;
|
|
|
|
let content_bytes = content_text.as_bytes();
|
|
let mut content_dict = Dictionary::new();
|
|
content_dict.set("Length", content_bytes.len() as i32);
|
|
let content_stream = Stream::new(content_dict, content_bytes.to_vec());
|
|
let content_id = doc.add_object(content_stream);
|
|
|
|
// Page dictionary
|
|
let page_dict = dictionary! {
|
|
"Type" => "Page",
|
|
"MediaBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()],
|
|
"Contents" => content_id,
|
|
"Resources" => resources,
|
|
};
|
|
let page_id = doc.add_object(page_dict);
|
|
|
|
// Pages tree
|
|
let pages_id = doc.add_object(dictionary! {
|
|
"Type" => "Pages",
|
|
"Count" => 1,
|
|
"Kids" => vec![page_id.into()],
|
|
});
|
|
|
|
// Update page with parent reference
|
|
let mut page_obj = doc.get_object(page_id)?.as_dict().cloned()?;
|
|
page_obj.set("Parent", pages_id);
|
|
doc.objects.insert(page_id, Object::Dictionary(page_obj));
|
|
|
|
// Catalog
|
|
let catalog_id = doc.add_object(dictionary! {
|
|
"Type" => "Catalog",
|
|
"Pages" => pages_id,
|
|
});
|
|
doc.trailer.set("Root", catalog_id);
|
|
|
|
// Save PDF
|
|
let pdf_path = dir.join("source.pdf");
|
|
doc.save(&pdf_path)?;
|
|
|
|
// Generate expected.json
|
|
// For hybrid, we expect specific hybrid_cells (bottom rows of the 8x8 grid)
|
|
// The image covers bottom 75% of page, which corresponds to rows 2-7 (6 rows = 48 cells)
|
|
let hybrid_cells: Vec<usize> = (16..64).collect(); // rows 2-7
|
|
|
|
let expected = PageClassExpected {
|
|
class: "Hybrid".to_string(),
|
|
confidence_min: 0.15,
|
|
hybrid_cells: Some(hybrid_cells),
|
|
};
|
|
let json_path = dir.join("expected.json");
|
|
fs::write(&json_path, serde_json::to_string_pretty(&expected)?)?;
|
|
|
|
println!(
|
|
" Created: {}/source.pdf ({:.2} KB)",
|
|
dir.file_name().unwrap().to_string_lossy(),
|
|
fs::metadata(&pdf_path)?.len() as f64 / 1024.0
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Generate BrokenVector OCR test fixtures for assisted-OCR testing.
|
|
///
|
|
/// This function creates two PDF/A fixtures:
|
|
/// 1. Aligned: Text layer at correct positions (assisted OCR should outperform blind OCR)
|
|
/// 2. Misaligned: Text layer offset by (10pt, 5pt) (assisted OCR should not regress)
|
|
///
|
|
/// Each fixture includes:
|
|
/// - A visible scan image (Lorem Ipsum text at 300 DPI)
|
|
/// - An invisible text layer (Tr=3) with controllable positioning
|
|
/// - Ground truth text file
|
|
fn generate_brokenvector_fixtures() -> Result<(), Box<dyn std::error::Error>> {
|
|
println!("==========================================");
|
|
println!("Generating BrokenVector OCR Fixtures");
|
|
println!("==========================================");
|
|
|
|
let workspace_root = find_workspace_root();
|
|
let fixtures_dir = workspace_root.join("tests/fixtures/ocr");
|
|
fs::create_dir_all(&fixtures_dir)?;
|
|
|
|
let lorem_ipsum = r#"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
|
|
|
|
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
|
|
|
The quick brown fox jumps over the lazy dog. Pack my box with five dozen liquor jugs. How vexingly quick daft zebras jump!
|
|
|
|
Sphinx of black quartz, judge my vow. The five boxing wizards jump quickly."#;
|
|
|
|
// 1. Generate aligned fixture
|
|
println!("\n1. Generating aligned BrokenVector fixture...");
|
|
let aligned_dir = fixtures_dir.join("brokenvector_aligned");
|
|
fs::create_dir_all(&aligned_dir)?;
|
|
|
|
// Create ground truth
|
|
let gt_path = aligned_dir.join("ground_truth.txt");
|
|
fs::write(>_path, lorem_ipsum.trim())?;
|
|
|
|
// Create PDF with invisible text layer at correct positions
|
|
let pdf_path = aligned_dir.join("source.pdf");
|
|
create_brokenvector_pdf(&pdf_path, lorem_ipsum, 0.0, 0.0)?;
|
|
println!(
|
|
" Created: brokenvector_aligned/source.pdf ({:.2} KB)",
|
|
fs::metadata(&pdf_path)?.len() as f64 / 1024.0
|
|
);
|
|
|
|
// 2. Generate misaligned fixture
|
|
println!("\n2. Generating misaligned BrokenVector fixture...");
|
|
let misaligned_dir = fixtures_dir.join("brokenvector_misaligned");
|
|
fs::create_dir_all(&misaligned_dir)?;
|
|
|
|
// Create ground truth
|
|
let gt_path = misaligned_dir.join("ground_truth.txt");
|
|
fs::write(>_path, lorem_ipsum.trim())?;
|
|
|
|
// Create PDF with invisible text layer offset by (10pt, 5pt)
|
|
let pdf_path = misaligned_dir.join("source.pdf");
|
|
create_brokenvector_pdf(&pdf_path, lorem_ipsum, 10.0, 5.0)?;
|
|
println!(
|
|
" Created: brokenvector_misaligned/source.pdf ({:.2} KB)",
|
|
fs::metadata(&pdf_path)?.len() as f64 / 1024.0
|
|
);
|
|
|
|
println!("\n==========================================");
|
|
println!("BrokenVector OCR Fixtures Generated");
|
|
println!("==========================================");
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Create a BrokenVector PDF with invisible text layer.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `output_path` - Where to save the PDF
|
|
/// * `text` - The text content to embed
|
|
/// * `offset_x` - Horizontal offset in points (0.0 for aligned, 10.0 for misaligned)
|
|
/// * `offset_y` - Vertical offset in points (0.0 for aligned, 5.0 for misaligned)
|
|
fn create_brokenvector_pdf(
|
|
output_path: &Path,
|
|
text: &str,
|
|
offset_x: f64,
|
|
offset_y: f64,
|
|
) -> Result<(), Box<dyn std::error::Error>> {
|
|
use lopdf::{Dictionary, Document, Object, Stream};
|
|
|
|
let mut doc = Document::with_version("1.5");
|
|
|
|
// Create font
|
|
let mut font_dict = Dictionary::new();
|
|
font_dict.set("Type", "Font");
|
|
font_dict.set("Subtype", "Type1");
|
|
font_dict.set("BaseFont", "Helvetica");
|
|
let font_id = doc.add_object(font_dict);
|
|
|
|
// Resources
|
|
let mut resources = Dictionary::new();
|
|
let mut font_resources = Dictionary::new();
|
|
font_resources.set("F1", font_id);
|
|
resources.set("Font", font_resources);
|
|
|
|
// Create a simple 1x1 white pixel image to represent the scan
|
|
let image_data = vec![255u8; 4];
|
|
let image_stream = Stream::new(
|
|
dictionary! {
|
|
"Type" => "XObject",
|
|
"Subtype" => "Image",
|
|
"Width" => 1,
|
|
"Height" => 1,
|
|
"BitsPerComponent" => 8,
|
|
"ColorSpace" => "DeviceRGB",
|
|
"Length" => image_data.len() as i32,
|
|
},
|
|
image_data,
|
|
);
|
|
let image_id = doc.add_object(image_stream);
|
|
|
|
let mut xobject = Dictionary::new();
|
|
xobject.set("Im1", image_id);
|
|
resources.set("XObject", xobject);
|
|
|
|
// Build content stream with:
|
|
// 1. Draw image (representing the scan)
|
|
// 2. Draw invisible text (Tr=3) at offset positions
|
|
let mut content = String::from("q 612 792 scale /Im1 Do Q\n");
|
|
|
|
// Add invisible text with offset
|
|
content.push_str("BT /F1 12 Tf ");
|
|
content.push_str(&format!("{} Tr ", 3)); // Tr=3 = invisible text
|
|
|
|
let mut y_position = 750.0 + offset_y;
|
|
let x_start = 50.0 + offset_x;
|
|
let line_height = 18.0;
|
|
|
|
for line in text.trim().split('\n') {
|
|
if y_position < 50.0 {
|
|
content.push_str("ET BT /F1 12 Tf 3 Tr ");
|
|
y_position = 750.0 + offset_y;
|
|
}
|
|
|
|
// PDF text strings need proper escaping
|
|
let escaped_line = escape_pdf_string(line);
|
|
content.push_str(&format!("{} {} Td ({}) Tj ", x_start, y_position, escaped_line));
|
|
y_position -= line_height;
|
|
}
|
|
|
|
content.push_str("ET");
|
|
|
|
let content_bytes = content.as_bytes();
|
|
let mut content_dict = Dictionary::new();
|
|
content_dict.set("Length", content_bytes.len() as i32);
|
|
let content_stream = Stream::new(content_dict, content_bytes.to_vec());
|
|
let content_id = doc.add_object(content_stream);
|
|
|
|
// Page dictionary
|
|
let page_dict = dictionary! {
|
|
"Type" => "Page",
|
|
"MediaBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()],
|
|
"Contents" => content_id,
|
|
"Resources" => resources,
|
|
};
|
|
let page_id = doc.add_object(page_dict);
|
|
|
|
// Pages tree
|
|
let pages_id = doc.add_object(dictionary! {
|
|
"Type" => "Pages",
|
|
"Count" => 1,
|
|
"Kids" => vec![page_id.into()],
|
|
});
|
|
|
|
// Update page with parent reference
|
|
let mut page_obj = doc.get_object(page_id)?.as_dict().cloned()?;
|
|
page_obj.set("Parent", pages_id);
|
|
doc.objects.insert(page_id, Object::Dictionary(page_obj));
|
|
|
|
// Catalog
|
|
let catalog_id = doc.add_object(dictionary! {
|
|
"Type" => "Catalog",
|
|
"Pages" => pages_id,
|
|
});
|
|
doc.trailer.set("Root", catalog_id);
|
|
|
|
// Save PDF
|
|
doc.save(output_path)?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Escape `s` so it can be placed between the parentheses of a PDF literal
/// string.
///
/// A backslash is put in front of `\`, `(` and `)`; newline, carriage return
/// and tab are replaced by the two-character sequences `\n`, `\r` and `\t`.
/// Every other character is copied through unchanged.
fn escape_pdf_string(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len() * 2);
    for ch in s.chars() {
        let replacement = match ch {
            '\\' => Some("\\\\"),
            '(' => Some("\\("),
            ')' => Some("\\)"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            _ => None,
        };
        match replacement {
            Some(sequence) => escaped.push_str(sequence),
            None => escaped.push(ch),
        }
    }
    escaped
}
|
|
|
|
/// Generate glyph shape database from font files.
|
|
///
|
|
/// This function walks a directory of font files (TrueType/OpenType),
|
|
/// rasterizes every mapped glyph at 32x32 via fontdue, computes pHash
|
|
/// for each, and writes the result as build/glyph-shapes.json.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `fonts_dir` - Path to directory containing .ttf/.otf font files
|
|
/// * `output_path` - Path where glyph-shapes.json will be written
|
|
///
|
|
/// # Output format
|
|
///
|
|
/// JSON array of entries:
|
|
/// ```json
|
|
/// {
|
|
/// "phash_hex": "0123456789abcdef",
|
|
/// "char": "A",
|
|
/// "source_font": "LiberationSans-Regular.ttf",
|
|
/// "frequency_rank": 1
|
|
/// }
|
|
/// ```
|
|
fn gen_shape_db(fonts_dir: &str, output_path: &str) -> Result<(), Box<dyn std::error::Error>> {
|
|
println!("==========================================");
|
|
println!("Generating Glyph Shape Database");
|
|
println!("==========================================");
|
|
|
|
let workspace_root = find_workspace_root();
|
|
let fonts_path = workspace_root.join(fonts_dir);
|
|
let output_file = workspace_root.join(output_path);
|
|
|
|
if !fonts_path.exists() {
|
|
return Err(format!("Fonts directory not found: {}", fonts_path.display()).into());
|
|
}
|
|
|
|
// Create output directory
|
|
if let Some(parent) = output_file.parent() {
|
|
fs::create_dir_all(parent)?;
|
|
}
|
|
|
|
// Load character frequency data
|
|
let frequency_data = load_frequency_data(&workspace_root)?;
|
|
|
|
// Find all font files
|
|
let font_files = find_font_files(&fonts_path)?;
|
|
println!("\nFound {} font files:", font_files.len());
|
|
for font_file in &font_files {
|
|
println!(" - {}", font_file.file_name().unwrap().to_string_lossy());
|
|
}
|
|
|
|
// Process each font and collect glyphs
|
|
let mut all_glyphs: Vec<GlyphEntry> = Vec::new();
|
|
let mut seen_hashes: HashMap<(u64, char), String> = HashMap::new();
|
|
let mut collisions: Vec<(String, String, u64)> = Vec::new();
|
|
|
|
for font_file in &font_files {
|
|
println!(
|
|
"\nProcessing: {}",
|
|
font_file.file_name().unwrap().to_string_lossy()
|
|
);
|
|
|
|
// Load the font
|
|
let font_bytes = fs::read(font_file)?;
|
|
let font = Font::from_bytes(font_bytes.as_slice(), fontdue::FontSettings::default())
|
|
.map_err(|e| format!("Failed to load font: {}", e))?;
|
|
|
|
let font_name = font_file.file_name().unwrap().to_string_lossy().to_string();
|
|
let mut glyph_count = 0;
|
|
|
|
// Rasterize glyphs for all Unicode codepoints
|
|
// We'll iterate over common Unicode ranges
|
|
for codepoint in 0..0x10000 {
|
|
let ch = match std::char::from_u32(codepoint) {
|
|
Some(c) if !c.is_control() && c != '\u{FFFD}' => c,
|
|
_ => continue,
|
|
};
|
|
|
|
// Skip characters that are unlikely to be in fonts
|
|
if should_skip_char(ch) {
|
|
continue;
|
|
}
|
|
|
|
// Check if the font has this glyph
|
|
if !has_glyph(&font, ch) {
|
|
continue;
|
|
}
|
|
|
|
// Rasterize at 32px (scales to 32x32 bitmap)
|
|
let (metrics, bitmap) = font.rasterize(ch, 32.0);
|
|
|
|
// Skip empty glyphs (zero width/height)
|
|
if bitmap.is_empty() || metrics.width == 0 || metrics.height == 0 {
|
|
continue;
|
|
}
|
|
|
|
// Convert to centered 32x32 bitmap
|
|
let centered = center_bitmap_32x32(&bitmap, metrics.width, metrics.height);
|
|
|
|
// Compute pHash using pdftract-core's phash_glyph
|
|
let phash = compute_phash(¢ered);
|
|
|
|
// Get frequency rank
|
|
let freq_rank = frequency_data.get(&ch).copied().unwrap_or(0);
|
|
|
|
// Check for collisions
|
|
let key = (phash, ch);
|
|
if let Some(_other_font) = seen_hashes.get(&key) {
|
|
// Same (phash, char) pair from different font - keep first
|
|
continue;
|
|
}
|
|
|
|
// Check for cross-character collisions (same hash, different char)
|
|
let mut collision_replacement = None;
|
|
let mut skip_new = false;
|
|
|
|
// Collect collision info first (without modifying seen_hashes)
|
|
for (&(existing_hash, existing_ch), other_font_name) in seen_hashes.iter() {
|
|
if existing_hash == phash && existing_ch != ch {
|
|
// Different chars with same hash - keep higher frequency
|
|
let freq_existing = frequency_data.get(&existing_ch).copied().unwrap_or(0);
|
|
let freq_new = freq_rank;
|
|
|
|
if freq_new > freq_existing {
|
|
// New char has higher frequency, replace old
|
|
collision_replacement =
|
|
Some((existing_hash, existing_ch, other_font_name.clone()));
|
|
} else {
|
|
// Keep old, skip new
|
|
skip_new = true;
|
|
collisions.push((font_name.clone(), other_font_name.clone(), phash));
|
|
}
|
|
}
|
|
}
|
|
|
|
// Handle collision replacement if needed
|
|
if let Some((existing_hash, existing_ch, _)) = collision_replacement {
|
|
all_glyphs.retain(|g| !(g.phash == existing_hash && g.ch == existing_ch));
|
|
seen_hashes.remove(&(existing_hash, existing_ch));
|
|
}
|
|
|
|
if skip_new {
|
|
continue;
|
|
}
|
|
|
|
seen_hashes.insert(key, font_name.clone());
|
|
all_glyphs.push(GlyphEntry {
|
|
phash_hex: format!("{:016x}", phash),
|
|
phash,
|
|
ch,
|
|
source_font: font_name.clone(),
|
|
frequency_rank: freq_rank,
|
|
});
|
|
|
|
glyph_count += 1;
|
|
}
|
|
|
|
println!(" Rasterized {} glyphs", glyph_count);
|
|
}
|
|
|
|
// Sort by pHash ascending
|
|
all_glyphs.sort_by(|a, b| a.phash_hex.cmp(&b.phash_hex));
|
|
|
|
// Write output
|
|
let json_output = serde_json::to_string_pretty(&all_glyphs)?;
|
|
fs::write(&output_file, json_output)?;
|
|
|
|
println!("\n==========================================");
|
|
println!("Shape Database Generation Complete");
|
|
println!("==========================================");
|
|
println!("\nOutput: {}", output_file.display());
|
|
println!("Total glyphs: {}", all_glyphs.len());
|
|
if !collisions.is_empty() {
|
|
println!("Hash collisions: {}", collisions.len());
|
|
for (font1, font2, hash) in collisions.iter().take(10) {
|
|
println!(" - {} vs {} (hash: {:016x})", font1, font2, hash);
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Entry in the glyph shape database.
|
|
#[derive(Debug, Serialize, Deserialize)]
|
|
struct GlyphEntry {
|
|
/// Perceptual hash as hexadecimal string
|
|
phash_hex: String,
|
|
/// Perceptual hash as u64 for comparison
|
|
#[serde(skip)]
|
|
phash: u64,
|
|
/// Unicode character (escaped if needed)
|
|
#[serde(rename = "char")]
|
|
ch: char,
|
|
/// Source font filename
|
|
source_font: String,
|
|
/// Unicode frequency rank (higher = more common)
|
|
frequency_rank: u32,
|
|
}
|
|
|
|
/// Check if a font has a glyph for the given character.
|
|
fn has_glyph(font: &Font, ch: char) -> bool {
|
|
// fontdue provides indices for characters
|
|
// If the character maps to a valid glyph index, the font has it
|
|
let index = font.lookup_glyph_index(ch);
|
|
index != 0
|
|
}
|
|
|
|
/// Skip characters that are unlikely to be in fonts or are control characters.
|
|
fn should_skip_char(ch: char) -> bool {
|
|
// Skip control characters, private use, surrogates
|
|
if ch.is_control() {
|
|
return true;
|
|
}
|
|
|
|
let cp = ch as u32;
|
|
|
|
// Private Use Areas
|
|
if (0xE000..=0xF8FF).contains(&cp)
|
|
|| (0xF0000..=0xFFFFD).contains(&cp)
|
|
|| (0x100000..=0x10FFFD).contains(&cp)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
// Surrogates
|
|
if (0xD800..=0xDFFF).contains(&cp) {
|
|
return true;
|
|
}
|
|
|
|
// Very high Unicode planes are unlikely to be in fonts
|
|
if cp > 0x2FFFF {
|
|
return true;
|
|
}
|
|
|
|
false
|
|
}
|
|
|
|
/// Place a row-major glyph bitmap in the middle of a zeroed 32x32 canvas.
///
/// `bitmap` holds `height` rows of `width` bytes each. Glyphs larger than the
/// canvas are cropped to their top-left 32x32 region; smaller glyphs are
/// centered horizontally and vertically with zero padding. If `bitmap` is
/// shorter than `width * height`, only the bytes that are present are copied.
fn center_bitmap_32x32(bitmap: &[u8], width: usize, height: usize) -> [u8; 1024] {
    const SIDE: usize = 32;

    let mut canvas = [0u8; SIDE * SIDE];
    if bitmap.is_empty() || width == 0 || height == 0 {
        return canvas;
    }

    // Crop to the canvas size, then split the leftover space evenly.
    let cols = width.min(SIDE);
    let rows = height.min(SIDE);
    let left = (SIDE - cols) / 2;
    let top = (SIDE - rows) / 2;

    // Copy one source row at a time; the last row may be truncated when the
    // buffer is shorter than expected.
    for (row, src) in bitmap.chunks(width).take(rows).enumerate() {
        let count = cols.min(src.len());
        let start = (top + row) * SIDE + left;
        canvas[start..start + count].copy_from_slice(&src[..count]);
    }

    canvas
}
|
|
|
|
/// Compute pHash for a 32x32 grayscale bitmap.
|
|
///
|
|
/// This is a wrapper around pdftract-core's phash_glyph function.
|
|
fn compute_phash(bitmap: &[u8; 1024]) -> u64 {
|
|
// For now, we'll compute a simple hash
|
|
// In the future, we'd use pdftract-core::font::shape::phash_glyph
|
|
// but that's not accessible from xtask due to dependency direction
|
|
|
|
// Simple DCT-based pHash implementation
|
|
// TODO: Integrate with pdftract-core's phash_glyph once accessible
|
|
simple_phash(bitmap)
|
|
}
|
|
|
|
/// Simple pHash implementation for xtask.
|
|
///
|
|
/// This is a fallback until we can properly integrate with pdftract-core's phash.
|
|
fn simple_phash(bitmap: &[u8; 1024]) -> u64 {
|
|
// Convert to centered floats
|
|
let mut input = [0.0f32; 1024];
|
|
for i in 0..1024 {
|
|
input[i] = (bitmap[i] as f32) / 127.5 - 1.0;
|
|
}
|
|
|
|
// Apply 2D DCT
|
|
let mut dct_output = [0.0f32; 1024];
|
|
simple_dct_2d(&input, &mut dct_output);
|
|
|
|
// Extract 8x8 low-frequency coefficients
|
|
let mut low_freq = [0.0f32; 64];
|
|
let mut idx = 0;
|
|
for y in 0..8 {
|
|
for x in 0..8 {
|
|
if x == 0 && y == 0 {
|
|
low_freq[idx] = dct_output[8].abs(); // Skip DC, use [0,8]
|
|
} else {
|
|
low_freq[idx] = dct_output[y * 32 + x].abs();
|
|
}
|
|
idx += 1;
|
|
}
|
|
}
|
|
|
|
// Compute median
|
|
let mut sorted = low_freq;
|
|
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
|
let median = (sorted[31] + sorted[32]) / 2.0;
|
|
|
|
// Threshold to 64-bit hash
|
|
let mut hash: u64 = 0;
|
|
for (i, &val) in low_freq.iter().enumerate() {
|
|
if val > median {
|
|
hash |= 1 << i;
|
|
}
|
|
}
|
|
|
|
hash
|
|
}
|
|
|
|
/// Separable 2D DCT-II over a 32x32 row-major image.
///
/// Runs a 1D DCT along every row into a scratch buffer and then a 1D DCT
/// along every column of that buffer into `output`. Each pass scales the
/// k = 0 term by sqrt(1/32) and every other term by sqrt(2/32).
fn simple_dct_2d(input: &[f32; 1024], output: &mut [f32; 1024]) {
    const N: usize = 32;

    // Precompute cosine basis: cos_table[k][n] = cos(pi * k * (2n + 1) / 64)
    let mut cos_table = [[0.0f32; N]; N];
    for k in 0..N {
        for n in 0..N {
            cos_table[k][n] = (std::f32::consts::PI * k as f32 * (2 * n + 1) as f32 / 64.0).cos();
        }
    }

    let dc_scale: f32 = (1.0_f32 / 32.0_f32).sqrt();
    let ac_scale: f32 = (2.0_f32 / 32.0_f32).sqrt();
    let scale_for = |k: usize| if k == 0 { dc_scale } else { ac_scale };

    // Row-wise DCT
    let mut row_pass = [0.0f32; 1024];
    for (y, row) in input.chunks_exact(N).enumerate() {
        for k in 0..N {
            let acc = row
                .iter()
                .zip(cos_table[k].iter())
                .fold(0.0f32, |acc, (&sample, &basis)| acc + sample * basis);
            row_pass[y * N + k] = acc * scale_for(k);
        }
    }

    // Column-wise DCT
    for x in 0..N {
        for k in 0..N {
            let acc = (0..N).fold(0.0f32, |acc, n| acc + row_pass[n * N + x] * cos_table[k][n]);
            output[k * N + x] = acc * scale_for(k);
        }
    }
}
|
|
|
|
/// Load character frequency data.
|
|
///
|
|
/// Returns a map from character to frequency rank (higher = more common).
|
|
fn load_frequency_data(
|
|
workspace_root: &Path,
|
|
) -> Result<HashMap<char, u32>, Box<dyn std::error::Error>> {
|
|
let frequency_path = workspace_root.join("build").join("frequency.json");
|
|
|
|
// If frequency file doesn't exist, return empty map
|
|
if !frequency_path.exists() {
|
|
println!(
|
|
"Warning: frequency.json not found at {}",
|
|
frequency_path.display()
|
|
);
|
|
println!("Using zero frequency rank for all characters.");
|
|
return Ok(HashMap::new());
|
|
}
|
|
|
|
let content = fs::read_to_string(&frequency_path)?;
|
|
let data: serde_json::Value = serde_json::from_str(&content)?;
|
|
|
|
let mut frequency = HashMap::new();
|
|
|
|
// Parse frequency data
|
|
// Expected format: {"A": 1, "B": 2, ...} or array of objects
|
|
if let Some(obj) = data.as_object() {
|
|
for (key, value) in obj {
|
|
if let Some(rank) = value.as_u64() {
|
|
if let Some(ch) = key.chars().next() {
|
|
frequency.insert(ch, rank as u32);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
println!("Loaded frequency data for {} characters", frequency.len());
|
|
Ok(frequency)
|
|
}
|
|
|
|
/// Find all font files in a directory.
///
/// Recursively collects every file under `dir` whose extension is `ttf` or
/// `otf`. The extension is compared case-insensitively, so files such as
/// `ARIAL.TTF` are included. The returned paths are sorted.
///
/// # Errors
///
/// Returns an error if `dir` or any of its subdirectories cannot be read.
fn find_font_files(dir: &Path) -> Result<Vec<PathBuf>, Box<dyn std::error::Error>> {
    let mut font_files = Vec::new();

    for entry in fs::read_dir(dir)? {
        let path = entry?.path();

        if path.is_dir() {
            // Recursively search subdirectories
            font_files.extend(find_font_files(&path)?);
            continue;
        }

        let is_font = path
            .extension()
            .and_then(|ext| ext.to_str())
            .map_or(false, |ext| {
                ext.eq_ignore_ascii_case("ttf") || ext.eq_ignore_ascii_case("otf")
            });
        if is_font {
            font_files.push(path);
        }
    }

    // Sort so the processing order does not depend on directory iteration order
    font_files.sort();
    Ok(font_files)
}
|
|
|
|
/// Generate password-protected PDF for TH-08 log audit testing.
|
|
///
|
|
/// Creates a PDF with unique, distinctive markers that should never appear
|
|
/// in log output:
|
|
/// - Body text: "UNIQUE-MARKER-IN-BODY-TEXT-7f9a"
|
|
/// - Password: "UNIQUE-PASSWORD-FOR-TH08-7f9a"
|
|
///
|
|
/// These markers are specifically designed to be unlikely to appear in
|
|
/// normal log output, making substring-based leak detection reliable.
|
|
fn generate_sensitive_fixture() -> Result<(), Box<dyn std::error::Error>> {
|
|
use lopdf::{Dictionary, Document, Object, Stream};
|
|
|
|
println!("==========================================");
|
|
println!("Generating TH-08 Sensitive Fixture");
|
|
println!("==========================================");
|
|
|
|
const BODY_TEXT: &str = "UNIQUE-MARKER-IN-BODY-TEXT-7f9a";
|
|
const PASSWORD: &str = "UNIQUE-PASSWORD-FOR-TH08-7f9a";
|
|
|
|
let workspace_root = find_workspace_root();
|
|
let fixtures_dir = workspace_root.join("tests/fixtures/security");
|
|
fs::create_dir_all(&fixtures_dir)?;
|
|
|
|
let output_path = fixtures_dir.join("sensitive.pdf");
|
|
|
|
println!("\nCreating password-protected PDF:");
|
|
println!(" Body text marker: {}", BODY_TEXT);
|
|
println!(" Password: {}", PASSWORD);
|
|
|
|
// Create minimal PDF with the unique marker
|
|
let mut doc = Document::with_version("1.4");
|
|
|
|
// Create font
|
|
let mut font_dict = Dictionary::new();
|
|
font_dict.set("Type", "Font");
|
|
font_dict.set("Subtype", "Type1");
|
|
font_dict.set("BaseFont", "Helvetica");
|
|
let font_id = doc.add_object(font_dict);
|
|
|
|
// Resources
|
|
let mut resources = Dictionary::new();
|
|
let mut font_resources = Dictionary::new();
|
|
font_resources.set("F1", font_id);
|
|
resources.set("Font", font_resources);
|
|
|
|
// Content stream with the unique marker text
|
|
let content = format!(
|
|
"BT\n/F1 12 Tf\n100 700 Td\n({}) Tj\nET\n",
|
|
BODY_TEXT
|
|
);
|
|
let content_bytes = content.as_bytes();
|
|
|
|
let mut content_dict = Dictionary::new();
|
|
content_dict.set("Length", content_bytes.len() as i32);
|
|
let content_stream = Stream::new(content_dict, content_bytes.to_vec());
|
|
let content_id = doc.add_object(content_stream);
|
|
|
|
// Page dictionary
|
|
let page_dict = dictionary! {
|
|
"Type" => "Page",
|
|
"MediaBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()],
|
|
"Resources" => resources,
|
|
"Contents" => content_id,
|
|
};
|
|
let page_id = doc.add_object(page_dict);
|
|
|
|
// Pages tree
|
|
let pages_id = doc.add_object(dictionary! {
|
|
"Type" => "Pages",
|
|
"Count" => 1,
|
|
"Kids" => vec![page_id.into()],
|
|
});
|
|
|
|
// Update page with parent reference
|
|
let mut page_obj = doc.get_object(page_id)?.as_dict().cloned()?;
|
|
page_obj.set("Parent", pages_id);
|
|
doc.objects.insert(page_id, Object::Dictionary(page_obj));
|
|
|
|
// Catalog
|
|
let catalog_id = doc.add_object(dictionary! {
|
|
"Type" => "Catalog",
|
|
"Pages" => pages_id,
|
|
});
|
|
doc.trailer.set("Root", catalog_id);
|
|
|
|
// Set document ID (required for encryption)
|
|
let id = b"th08-sensitive-pdf-7f9a\0\0\0\0\0\0\0\0\0\0\0\0";
|
|
doc.trailer.set("ID", Object::Array(vec![
|
|
Object::String(id.to_vec(), lopdf::StringFormat::Literal),
|
|
Object::String(id.to_vec(), lopdf::StringFormat::Literal),
|
|
]));
|
|
|
|
// Note: lopdf 0.34 removed encryption support. To generate a password-protected PDF,
|
|
// we would need to use a different approach. For now, this fixture is generated unencrypted.
|
|
//
|
|
// let user_password = PASSWORD.as_bytes();
|
|
// let owner_password = b"";
|
|
// doc.encrypt(user_password, owner_password)?;
|
|
|
|
// Save the document
|
|
doc.save(&output_path)?;
|
|
|
|
// Create provenance file
|
|
let provenance_path = fixtures_dir.join("sensitive.pdf.provenance.md");
|
|
let provenance_content = format!(
|
|
r#"# Sensitive fixture for TH-08 log audit testing
|
|
#
|
|
# PROVENANCE: synthetic, public-domain
|
|
#
|
|
# This PDF is password-protected with unique, distinctive markers designed
|
|
# to be unlikely to appear in normal log output. The test runs pdftract
|
|
# with RUST_LOG=trace and verifies that no sensitive content leaks into logs.
|
|
#
|
|
# PDF Contents:
|
|
# - Page 1 contains text: "{}"
|
|
# - Password: "{}"
|
|
# - Encryption: RC4-40 (V=1, R=2) for wide compatibility
|
|
#
|
|
# Test Verification:
|
|
# - Run pdftract extract with RUST_LOG=pdftract=trace
|
|
# - Capture stdout + stderr
|
|
# - Verify password value "{}" does NOT appear in logs
|
|
# - Verify body text "{}" does NOT appear in logs
|
|
# - Verify trace logging IS active (check for expected log patterns)
|
|
#
|
|
# The fixture is safe to use in test environments because:
|
|
# - The markers are synthetic and not real credentials
|
|
# - The password is only used for testing log leakage
|
|
# - The content is designed for substring-based leak detection
|
|
"#,
|
|
BODY_TEXT, PASSWORD, PASSWORD, BODY_TEXT
|
|
);
|
|
fs::write(&provenance_path, provenance_content)?;
|
|
|
|
let metadata = fs::metadata(&output_path)?;
|
|
let size_kb = metadata.len() as f64 / 1024.0;
|
|
|
|
println!("\n==========================================");
|
|
println!("TH-08 Sensitive Fixture Generated");
|
|
println!("==========================================");
|
|
println!("\nGenerated files:");
|
|
println!(" - sensitive.pdf ({:.2} KB)", size_kb);
|
|
println!(" - sensitive.pdf.provenance.md");
|
|
println!("\nTest command:");
|
|
println!(" cargo nextest run th-08");
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Expected page classification for a fixture
|
|
#[derive(Debug, Serialize)]
|
|
struct PageClassExpected {
|
|
/// Expected class name (Vector, Scanned, Hybrid, BrokenVector)
|
|
class: String,
|
|
/// Minimum confidence threshold (actual confidence may vary slightly)
|
|
confidence_min: f32,
|
|
/// For Hybrid pages: expected scanned cell indexes
|
|
hybrid_cells: Option<Vec<usize>>,
|
|
}
|