use fontdue::Font; use serde::{Deserialize, Serialize}; use std::collections::{BTreeMap, HashMap}; use std::fs; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use std::time::{Duration, Instant}; /// Helper macro for creating dictionaries macro_rules! dictionary { ($( $key:literal => $value:expr ),* $(,)?) => {{ let mut dict = Dictionary::new(); $( dict.set($key, $value); )* dict }}; } /// Find the workspace root directory by searching for Cargo.toml fn find_workspace_root() -> PathBuf { let mut current = std::env::current_dir().unwrap(); // If we're in the xtask directory, go to parent if current.ends_with("xtask") { current = current.parent().unwrap().to_path_buf(); } // Search upward for Cargo.toml with workspace members loop { let cargo_toml = current.join("Cargo.toml"); if cargo_toml.exists() { let content = fs::read_to_string(&cargo_toml).unwrap_or_default(); if content.contains("[workspace]") { return current; } } match current.parent() { Some(parent) => current = parent.to_path_buf(), None => break, } } // Fallback: use current directory if not found std::env::current_dir().unwrap() } #[derive(Debug, Deserialize)] struct Profile { description: String, #[serde(default)] profile_fields: BTreeMap, #[serde(default)] r#match: MatchConfig, } #[derive(Debug, Deserialize)] struct ProfileField { #[serde(rename = "type")] field_type: String, #[serde(default)] extraction: ExtractionConfig, } #[derive(Debug, Deserialize, Default)] struct ExtractionConfig { #[serde(default)] patterns: Vec, #[serde(default)] region_hint: Option, #[serde(default)] table_region: Option, #[serde(default)] columnar_regions: Option, #[serde(default)] per_page: Option, #[serde(default)] #[allow(dead_code)] fallback: serde_yaml::Value, } #[derive(Debug, Deserialize, Default)] struct MatchConfig { #[serde(default)] any: Vec, } #[derive(Debug, Deserialize, Default)] struct MatchClause { #[serde(default)] text_patterns: Vec, #[serde(default)] structural: Vec, } fn main() -> Result<(), Box> { let args: Vec = std::env::args().collect(); if args.len() < 2 { eprintln!("Usage: xtask "); eprintln!("Commands:"); eprintln!(" doc-profile Generate README skeleton for a profile"); eprintln!(" doc-profiles Generate README skeletons for all profiles"); eprintln!(" generate-stress-pdfs Generate stress-test PDFs for memory ceiling testing"); eprintln!(" generate-page-class-fixtures Generate page classification test fixtures"); eprintln!(" gen-schema Generate JSON Schema from Rust output types"); eprintln!( " gen-shape-db Generate glyph shape database from font files" ); eprintln!(" memory-ceiling Run memory ceiling tests against perf/malformed corpora"); std::process::exit(1); } let result = match args[1].as_str() { "doc-profile" => { if args.len() < 3 { eprintln!("Usage: xtask doc-profile "); std::process::exit(1); } generate_profile_readme(&args[2])?; Ok(()) } "doc-profiles" => { let profiles_dir = find_workspace_root().join("profiles/builtin"); for entry in fs::read_dir(&profiles_dir)? { let entry = entry?; if entry.path().is_dir() { let profile_name = entry.file_name().to_string_lossy().to_string(); if let Err(e) = generate_profile_readme(&profile_name) { eprintln!("Error generating README for {}: {}", profile_name, e); } } } Ok(()) } "generate-stress-pdfs" => { generate_stress_pdfs()?; Ok(()) } "generate-page-class-fixtures" => { generate_page_class_fixtures()?; Ok(()) } "gen-schema" => { gen_schema()?; Ok(()) } "memory-ceiling" => { run_memory_ceiling_tests()?; Ok(()) } "gen-shape-db" => { let fonts_dir = if args.len() >= 3 { args[2].clone() } else { eprintln!("Usage: xtask gen-shape-db "); std::process::exit(1); }; let output_path = if args.len() >= 4 { args[3].clone() } else { "build/glyph-shapes.json".to_string() }; gen_shape_db(&fonts_dir, &output_path)?; Ok(()) } _ => { eprintln!("Unknown command: {}", args[1]); std::process::exit(1); } }; result } /// Generate JSON Schema from Rust output types. /// /// Delegates to the gen_schema binary. fn gen_schema() -> Result<(), Box> { // Invoke the gen_schema binary let status = std::process::Command::new("cargo") .args(["run", "--bin", "gen_schema"]) .current_dir(find_workspace_root()) .status()?; if !status.success() { return Err(format!("gen_schema failed with exit code: {:?}", status.code()).into()); } Ok(()) } fn generate_profile_readme(profile_name: &str) -> Result<(), Box> { // Find the workspace root by looking for the parent directory's Cargo.toml let workspace_root = find_workspace_root(); let profile_path = workspace_root .join("profiles/builtin") .join(profile_name) .join("profile.yaml"); let readme_path = workspace_root .join("profiles/builtin") .join(profile_name) .join("README.md"); if !profile_path.exists() { return Err(format!("Profile YAML not found: {}", profile_path.display()).into()); } let yaml_content = fs::read_to_string(&profile_path)?; let profile: Profile = serde_yaml::from_str(&yaml_content)?; let mut readme = String::new(); // Title and description readme.push_str(&format!("# {} Profile\n\n", profile_name.to_uppercase())); readme.push_str(&format!("{}\n\n", profile.description)); // Match Criteria Summary (placeholder for human to fill) readme.push_str("## Match Criteria Summary\n\n"); readme.push_str("*This section describes the characteristics that cause a document to match this profile. The following signals are considered:*\n\n"); // Collect all text patterns and structural signals from any clause let mut all_patterns: Vec<&String> = Vec::new(); let mut all_structural: Vec = Vec::new(); for clause in &profile.r#match.any { for pattern in &clause.text_patterns { if !all_patterns.contains(&pattern) { all_patterns.push(pattern); } } for signal in &clause.structural { let signal_str = format!("{:?}", signal); if !all_structural.iter().any(|s| s == &signal_str) { all_structural.push(signal_str); } } } // Show first few patterns as examples if !all_patterns.is_empty() { let show_count = all_patterns.len().min(3); readme.push_str("- **Text patterns**: "); for (i, pattern) in all_patterns.iter().take(show_count).enumerate() { if i > 0 { readme.push_str(", "); } readme.push_str(&format!("`{}`", pattern)); } if all_patterns.len() > show_count { readme.push_str(&format!(" ({} more)", all_patterns.len() - show_count)); } readme.push('\n'); } if !all_structural.is_empty() { let show_count = all_structural.len().min(3); readme.push_str("- **Structural signals**: "); for (i, signal) in all_structural.iter().take(show_count).enumerate() { if i > 0 { readme.push_str(", "); } readme.push_str(&format!("`{}`", signal)); } if all_structural.len() > show_count { readme.push_str(&format!(" ({} more)", all_structural.len() - show_count)); } readme.push('\n'); } readme.push_str( "\n*Additional heuristics and confidence scoring are applied during classification.*\n\n", ); // Extracted Fields readme.push_str("## Extracted Fields\n\n"); readme.push_str("| Field | Type | Description | Example Value | Source Hint |\n"); readme.push_str("|-------|------|-------------|----------------|-------------|\n"); for (field_name, field) in &profile.profile_fields { let description = "Extracted from page text using pattern matching".to_string(); let example = match field.field_type.as_str() { "string" => "\"example value\"", "decimal" => "123.45", "date" => "2024-01-15", "int" => "42", "array" => "[...]", _ => "N/A", }; let mut source_parts = Vec::new(); if !field.extraction.patterns.is_empty() { source_parts.push("regex patterns".to_string()); } if let Some(ref hint) = field.extraction.region_hint { source_parts.push(format!("region: {}", hint)); } if let Some(ref table) = field.extraction.table_region { source_parts.push(format!("table: {}", table)); } if let Some(ref cols) = field.extraction.columnar_regions { source_parts.push(format!("columns: {}", cols)); } if field.extraction.per_page.unwrap_or(false) { source_parts.push("per-page".to_string()); } let source = if source_parts.is_empty() { "profile YAML".to_string() } else { source_parts.join(", ") }; readme.push_str(&format!( "| {} | {} | {} | {} | {} |\n", field_name, field.field_type, description, example, source )); } if profile.profile_fields.is_empty() { readme.push_str("| *(none)* | - | *This profile has no field extractors* | - | - |\n"); } readme.push('\n'); // Known Limitations readme.push_str("## Known Limitations\n\n"); readme.push_str("*This section documents known edge cases and failure modes. Contributions to improve extraction quality are welcome.*\n\n"); readme.push_str("- *Document limitations and edge cases to be added by profile author*\n\n"); // Sample Input Pointer readme.push_str("## Sample Input\n\n"); readme.push_str(&format!("Example fixtures demonstrating this profile are available in `tests/fixtures/profiles/{}/`.\n\n", profile_name)); readme.push_str("*See the classifier corpus for representative documents.*\n\n"); // Configuration Tips readme.push_str("## Configuration Tips\n\n"); readme.push_str("To override this profile:\n\n"); readme.push_str("```bash\n"); readme.push_str(&format!( "pdftract profiles export {} > my-profile.yaml\n", profile_name )); readme.push_str( "# Edit my-profile.yaml to customize match criteria, fields, or extraction patterns\n", ); readme.push_str("pdftract extract --profile my-profile.yaml document.pdf\n"); readme.push_str("```\n\n"); // Footer readme.push_str("---\n\n*This README was auto-generated from `profile.yaml`. Update the Match Criteria Summary and Known Limitations sections with profile-specific guidance.*\n"); fs::write(&readme_path, readme)?; println!( "Generated README for {} at {}", profile_name, readme_path.display() ); Ok(()) } /// Generate stress-test PDFs for memory ceiling testing /// /// Creates large-page-count PDFs to validate memory targets: /// - 100-page vector PDF for buffered mode testing (target: < 512 MB) /// - 10,000-page stress test for streaming mode validation (target: < 256 MB) fn generate_stress_pdfs() -> Result<(), Box> { println!("=========================================="); println!("Generating Stress-Test PDFs"); println!("=========================================="); let workspace_root = find_workspace_root(); let perf_dir = workspace_root.join("tests/fixtures/perf"); fs::create_dir_all(&perf_dir)?; let configs = vec![ ( 100, "100-page-vector.pdf", "Buffered mode stress test (512 MB budget)", ), ( 10000, "10k-page.pdf", "Streaming mode stress test (256 MB budget)", ), ]; for (num_pages, filename, description) in &configs { println!("\nGenerating: {} ({} pages)", filename, num_pages); println!(" Purpose: {}", description); let output_path = perf_dir.join(filename); generate_stress_pdf(&output_path, *num_pages)?; } println!("\n=========================================="); println!("Stress-Test PDF Generation Complete"); println!("=========================================="); println!("\nGenerated files:"); for (_, filename, _) in &configs { let path = perf_dir.join(filename); if path.exists() { let metadata = fs::metadata(&path)?; let size_mb = metadata.len() as f64 / 1024.0 / 1024.0; println!(" - {} ({:.2} MB)", filename, size_mb); } } Ok(()) } /// Generate a multi-page stress-test PDF /// /// Creates a PDF with the specified number of pages for memory ceiling testing. /// Uses a minimal approach with lopdf 0.34. fn generate_stress_pdf( output_path: &Path, num_pages: usize, ) -> Result<(), Box> { use lopdf::{Dictionary, Document, Object, Stream}; let mut doc = Document::with_version("1.5"); // Pre-create fonts and resources that will be reused let mut font_dict = Dictionary::new(); font_dict.set("Type", "Font"); font_dict.set("Subtype", "Type1"); font_dict.set("BaseFont", "Helvetica"); let font_id = doc.add_object(font_dict); let mut resources = Dictionary::new(); let mut font_resources = Dictionary::new(); font_resources.set("F1", font_id); resources.set("Font", font_resources); // Create all page objects first let mut page_ids = Vec::new(); let mediabox = Object::Array(vec![ Object::Real(0.0), Object::Real(0.0), Object::Real(612.0), Object::Real(792.0), ]); for page_num in 1..=num_pages { // Create content stream for this page let content_bytes = format!( "BT /F1 12 Tf 72 720 Td (Page {} of {}) Tj ET", page_num, num_pages ) .into_bytes(); let mut content_dict = Dictionary::new(); content_dict.set("Length", content_bytes.len() as i32); let content_stream = Stream::new(content_dict, content_bytes); let content_id = doc.add_object(content_stream); // Create page dictionary let mut page_dict = Dictionary::new(); page_dict.set("Type", "Page"); page_dict.set("MediaBox", mediabox.clone()); page_dict.set("Contents", content_id); page_dict.set("Resources", resources.clone()); let page_id = doc.add_object(page_dict); page_ids.push(page_id); } // Create the Pages root dictionary (Pages tree) let mut pages_dict = Dictionary::new(); pages_dict.set("Type", "Pages"); pages_dict.set("Count", Object::Integer(num_pages as i64)); pages_dict.set( "Kids", Object::Array(page_ids.iter().map(|&id| Object::Reference(id)).collect()), ); let pages_id = doc.add_object(pages_dict); // Set Parent reference for each page for &page_id in &page_ids { let page_obj = doc.get_object(page_id)?; if let Ok(dict) = page_obj.as_dict() { let mut updated_dict = dict.clone(); updated_dict.set("Parent", pages_id); // Need to replace the object let _ = doc .objects .insert(page_id, Object::Dictionary(updated_dict)); } } // Create the Catalog dictionary let mut catalog_dict = Dictionary::new(); catalog_dict.set("Type", "Catalog"); catalog_dict.set("Pages", pages_id); let catalog_id = doc.add_object(catalog_dict); // Set the document's catalog ID directly doc.trailer.set("Root", catalog_id); // Save the document doc.save(output_path)?; let metadata = fs::metadata(output_path)?; let size_mb = metadata.len() as f64 / 1024.0 / 1024.0; println!( " Generated: {} ({:.2} MB)", output_path.file_name().unwrap().to_string_lossy(), size_mb ); Ok(()) } /// Memory budgets for different document categories (in MB) #[derive(Debug, Clone)] struct MemoryBudget { pub buffered_100_page: usize, // 512 MB pub streaming_any: usize, // 256 MB pub adversarial_hard_cap: usize, // 1 GB } impl Default for MemoryBudget { fn default() -> Self { Self { buffered_100_page: 512, streaming_any: 256, adversarial_hard_cap: 1024, } } } #[derive(Debug, Serialize)] struct MemoryMeasurement { pub peak_rss_mb: usize, pub duration_ms: u128, pub succeeded: bool, pub error_message: Option, } #[derive(Debug, Clone, Serialize)] struct MemoryTestResult { pub file_name: String, pub category: String, // "buffered", "streaming", "adversarial" pub peak_rss_mb: usize, pub duration_ms: u128, pub budget_mb: usize, pub passed: bool, pub error_message: Option, } #[derive(Debug, Serialize)] struct MemoryReport { pub timestamp: String, pub commit_sha: Option, pub budgets: MemoryBudgetJson, pub results: Vec, pub summary: MemorySummary, } #[derive(Debug, Serialize)] struct MemoryBudgetJson { pub buffered_100_page_mb: usize, pub streaming_any_mb: usize, pub adversarial_hard_cap_mb: usize, } #[derive(Debug, Serialize)] struct MemorySummary { pub total_tests: usize, pub passed: usize, pub failed: usize, pub all_passed: bool, } /// Run memory ceiling tests against perf and malformed corpora /// /// This enforces the Tier-1 Memory targets from the plan: /// - Peak RSS, 100-page vector PDF (buffered mode) < 512 MB /// - Peak RSS, streaming/NDJSON mode < 256 MB /// - Peak RSS, adversarial fixtures < 1 GB hard ceiling /// /// Analogous to cargo-bloat for memory usage: fails the build if any /// document exceeds its budget. /// /// Generates memory-report.json artifact for CI historical tracking. fn run_memory_ceiling_tests() -> Result<(), Box> { println!("=========================================="); println!("Memory Ceiling Tests"); println!("=========================================="); let budgets = MemoryBudget::default(); let workspace_root = find_workspace_root(); let perf_dir = workspace_root.join("tests/fixtures/perf"); let malformed_dir = workspace_root.join("tests/fixtures/malformed"); println!("\nMemory budgets:"); println!(" - Buffered 100-page: {} MB", budgets.buffered_100_page); println!(" - Streaming mode: {} MB", budgets.streaming_any); println!( " - Adversarial hard cap: {} MB", budgets.adversarial_hard_cap ); // Build pdftract binary first println!("\n=== Building pdftract for testing ==="); let build_status = Command::new("cargo") .args(["build", "--release", "--bin", "pdftract", "--locked"]) .current_dir(&workspace_root) .stdout(Stdio::inherit()) .stderr(Stdio::inherit()) .status()?; if !build_status.success() { return Err("Failed to build pdftract binary".into()); } let binary_path = workspace_root.join("target/release/pdftract"); if !binary_path.exists() { return Err(format!("pdftract binary not found at {}", binary_path.display()).into()); } println!("Binary: {}", binary_path.display()); let mut all_results = Vec::new(); let mut all_passed = true; // Test 1: Perf corpus - buffered mode (512 MB budget) println!( "\n=== Testing perf corpus (buffered mode, budget: {} MB) ===", budgets.buffered_100_page ); if perf_dir.exists() { for entry in fs::read_dir(&perf_dir)? { let entry = entry?; let path = entry.path(); if path.extension().and_then(|s| s.to_str()) != Some("pdf") { continue; } let file_name = path.file_name().unwrap().to_string_lossy().to_string(); print!(" [buffered] {} ... ", file_name); match measure_extraction(&binary_path, &path, &budgets, false) { Ok(measurement) => { let passed = measurement.peak_rss_mb <= budgets.buffered_100_page; if passed { println!( "PASS ({} MB, {} ms)", measurement.peak_rss_mb, measurement.duration_ms ); } else { println!( "FAIL ({} MB > {} MB)", measurement.peak_rss_mb, budgets.buffered_100_page ); all_passed = false; } all_results.push(MemoryTestResult { file_name: file_name.clone(), category: "buffered".to_string(), peak_rss_mb: measurement.peak_rss_mb, duration_ms: measurement.duration_ms, budget_mb: budgets.buffered_100_page, passed, error_message: measurement.error_message, }); } Err(e) => { println!("ERROR ({})", e); all_passed = false; all_results.push(MemoryTestResult { file_name: file_name.clone(), category: "buffered".to_string(), peak_rss_mb: 0, duration_ms: 0, budget_mb: budgets.buffered_100_page, passed: false, error_message: Some(e.to_string()), }); } } } } else { println!(" (no perf directory)"); } // Test 2: Perf corpus - streaming mode (256 MB budget) println!( "\n=== Testing perf corpus (streaming mode, budget: {} MB) ===", budgets.streaming_any ); if perf_dir.exists() { for entry in fs::read_dir(&perf_dir)? { let entry = entry?; let path = entry.path(); if path.extension().and_then(|s| s.to_str()) != Some("pdf") { continue; } let file_name = path.file_name().unwrap().to_string_lossy().to_string(); print!(" [streaming] {} ... ", file_name); match measure_extraction(&binary_path, &path, &budgets, true) { Ok(measurement) => { let passed = measurement.peak_rss_mb <= budgets.streaming_any; if passed { println!( "PASS ({} MB, {} ms)", measurement.peak_rss_mb, measurement.duration_ms ); } else { println!( "FAIL ({} MB > {} MB)", measurement.peak_rss_mb, budgets.streaming_any ); all_passed = false; } all_results.push(MemoryTestResult { file_name: file_name.clone(), category: "streaming".to_string(), peak_rss_mb: measurement.peak_rss_mb, duration_ms: measurement.duration_ms, budget_mb: budgets.streaming_any, passed, error_message: measurement.error_message, }); } Err(e) => { println!("ERROR ({})", e); all_passed = false; all_results.push(MemoryTestResult { file_name: file_name.clone(), category: "streaming".to_string(), peak_rss_mb: 0, duration_ms: 0, budget_mb: budgets.streaming_any, passed: false, error_message: Some(e.to_string()), }); } } } } // Test 3: Malformed corpus - adversarial hard cap (1 GB budget) println!( "\n=== Testing malformed corpus (adversarial hard cap: {} MB) ===", budgets.adversarial_hard_cap ); if malformed_dir.exists() { for entry in fs::read_dir(&malformed_dir)? { let entry = entry?; let path = entry.path(); if path.extension().and_then(|s| s.to_str()) != Some("pdf") && path.extension().and_then(|s| s.to_str()) != Some("bin") { continue; } let file_name = path.file_name().unwrap().to_string_lossy().to_string(); print!(" [adversarial] {} ... ", file_name); match measure_extraction(&binary_path, &path, &budgets, false) { Ok(measurement) => { let passed = measurement.peak_rss_mb <= budgets.adversarial_hard_cap; if passed { println!( "PASS ({} MB, {} ms)", measurement.peak_rss_mb, measurement.duration_ms ); } else { println!( "FAIL ({} MB > {} MB)", measurement.peak_rss_mb, budgets.adversarial_hard_cap ); all_passed = false; } all_results.push(MemoryTestResult { file_name: file_name.clone(), category: "adversarial".to_string(), peak_rss_mb: measurement.peak_rss_mb, duration_ms: measurement.duration_ms, budget_mb: budgets.adversarial_hard_cap, passed, error_message: measurement.error_message, }); } Err(e) => { println!("ERROR ({})", e); all_passed = false; all_results.push(MemoryTestResult { file_name: file_name.clone(), category: "adversarial".to_string(), peak_rss_mb: 0, duration_ms: 0, budget_mb: budgets.adversarial_hard_cap, passed: false, error_message: Some(e.to_string()), }); } } } } else { println!(" (no malformed directory)"); } // Print summary println!("\n=========================================="); println!("Memory Ceiling Summary"); println!("=========================================="); let passed_count = all_results.iter().filter(|r| r.passed).count(); let total_count = all_results.len(); println!("Passed: {}/{}", passed_count, total_count); if !all_passed { println!("\nFailed documents:"); for result in &all_results { if !result.passed { if result.peak_rss_mb > 0 { println!( " - [{}] {} ({} MB > {} MB)", result.category, result.file_name, result.peak_rss_mb, result.budget_mb ); } else { println!( " - [{}] {} (error: {})", result.category, result.file_name, result.error_message.as_deref().unwrap_or("unknown") ); } } } println!("\nMemory ceiling gate FAILED!"); return Err("Memory ceiling exceeded".into()); } println!("\nMemory ceiling gate PASSED!"); // Generate JSON report let report = MemoryReport { timestamp: format!( "{}", humantime::format_rfc3339_seconds(std::time::SystemTime::now()) ), commit_sha: get_commit_sha()?, budgets: MemoryBudgetJson { buffered_100_page_mb: budgets.buffered_100_page, streaming_any_mb: budgets.streaming_any, adversarial_hard_cap_mb: budgets.adversarial_hard_cap, }, results: all_results.clone(), summary: MemorySummary { total_tests: total_count, passed: passed_count, failed: total_count - passed_count, all_passed, }, }; let report_path = workspace_root.join("memory-report.json"); fs::write(&report_path, serde_json::to_string_pretty(&report)?)?; println!("\nReport written to: {}", report_path.display()); Ok(()) } /// Get the current git commit SHA fn get_commit_sha() -> Result, Box> { let workspace_root = find_workspace_root(); let output = Command::new("git") .args(["rev-parse", "HEAD"]) .current_dir(&workspace_root) .output()?; if output.status.success() { let sha = String::from_utf8_lossy(&output.stdout).trim().to_string(); Ok(Some(sha)) } else { Ok(None) } } /// Measure memory usage during extraction of a PDF file /// /// Uses Linux-specific /proc/[pid]/status to sample peak RSS. /// Falls back to time measurement if RSS sampling is unavailable. /// /// # Arguments /// * `binary_path` - Path to the pdftract binary /// * `pdf_path` - Path to the PDF file to extract /// * `budgets` - Memory budgets (unused but kept for compatibility) /// * `streaming` - If true, use streaming/text mode for lower memory; otherwise buffered JSON mode fn measure_extraction( binary_path: &Path, pdf_path: &Path, _budgets: &MemoryBudget, streaming: bool, ) -> Result> { let start = Instant::now(); // Spawn the extraction process and measure its peak RSS #[cfg(target_os = "linux")] { use std::os::unix::process::CommandExt; let mut cmd = Command::new(binary_path); if streaming { // Streaming mode: use --format text for lower memory footprint // Note: --format ndjson is not yet exposed in CLI (Phase 6.2) // Using text format as a reasonable proxy for streaming memory behavior cmd.arg("extract").arg("--format").arg("text"); } else { // Buffered mode: use --format json for full document buffering cmd.arg("extract").arg("--format").arg("json"); } cmd.arg(pdf_path) .stdout(Stdio::null()) .stderr(Stdio::piped()) .process_group(0); let mut child = cmd.spawn()?; let pid = child.id(); let mut peak_rss_kb = 0usize; // Sample RSS every 10ms while process runs let sample_interval = Duration::from_millis(10); loop { // Try to wait for the process (non-blocking) match child.try_wait() { Ok(Some(status)) => { // Process has exited let duration = start.elapsed(); // Capture stderr for error messages let stderr_output = if let Some(mut stderr) = child.stderr { let mut error_text = String::new(); use std::io::Read; let _ = stderr.read_to_string(&mut error_text); error_text } else { String::new() }; // Trim error text and use it if non-empty let error_message = if !status.success() { if !stderr_output.is_empty() { Some(stderr_output.trim().to_string()) } else { Some(format!("exit code: {:?}", status.code())) } } else { None }; return Ok(MemoryMeasurement { peak_rss_mb: peak_rss_kb / 1024, duration_ms: duration.as_millis(), succeeded: status.success(), error_message, }); } Ok(None) => { // Process still running, sample RSS if let Ok(rss_kb) = sample_rss(pid) { peak_rss_kb = peak_rss_kb.max(rss_kb); } std::thread::sleep(sample_interval); } Err(e) => { return Err(format!("Failed to wait for process: {}", e).into()); } } } } // Fallback for non-Linux platforms #[cfg(not(target_os = "linux"))] { let mut cmd = Command::new(binary_path); if streaming { cmd.arg("extract").arg("--format").arg("text"); } else { cmd.arg("extract").arg("--format").arg("json"); } cmd.arg(pdf_path) .stdout(Stdio::null()) .stderr(Stdio::piped()); let output = cmd.output()?; let duration = start.elapsed(); Ok(MemoryMeasurement { peak_rss_mb: 0, // Cannot measure on this platform duration_ms: duration.as_millis(), succeeded: output.status.success(), error_message: if !output.status.success() { Some(format!("exit code: {:?}", output.status.code())) } else { None }, }) } } /// Sample the current RSS (Resident Set Size) of a process in KB #[cfg(target_os = "linux")] fn sample_rss(pid: u32) -> Result> { let status_path = format!("/proc/{}/status", pid); let status = fs::read_to_string(&status_path)?; // Parse VmRSS from /proc/[pid]/status // Format: VmRSS: 12345 kB for line in status.lines() { if line.starts_with("VmRSS:") { let parts: Vec<&str> = line.split_whitespace().collect(); if parts.len() >= 2 { let rss_kb = parts[1].parse::()?; return Ok(rss_kb); } } } Err("VmRSS not found in /proc status".into()) } /// Generate page classification test fixtures /// /// Creates 4 fixture types for testing page classification: /// - vector_pure: Pure text PDF (born-digital) /// - scanned_single: Image-only PDF (scanned page) /// - brokenvector_pdfa: Invisible text layer over scanned image /// - hybrid_header_body: Text header + scanned body fn generate_page_class_fixtures() -> Result<(), Box> { println!("=========================================="); println!("Generating Page Classification Fixtures"); println!("=========================================="); let workspace_root = find_workspace_root(); let fixtures_dir = workspace_root.join("tests/fixtures/page_class"); fs::create_dir_all(&fixtures_dir)?; // 1. Vector pure: Born-digital text PDF println!("\n1. Generating vector_pure fixture..."); let vector_dir = fixtures_dir.join("vector_pure"); fs::create_dir_all(&vector_dir)?; generate_vector_pure_pdf(&vector_dir)?; // 2. Scanned single: Image-only PDF println!("2. Generating scanned_single fixture..."); let scanned_dir = fixtures_dir.join("scanned_single"); fs::create_dir_all(&scanned_dir)?; generate_scanned_single_pdf(&scanned_dir)?; // 3. BrokenVector: Invisible text + image println!("3. Generating brokenvector_pdfa fixture..."); let broken_dir = fixtures_dir.join("brokenvector_pdfa"); fs::create_dir_all(&broken_dir)?; generate_brokenvector_pdf(&broken_dir)?; // 4. Hybrid: Text header + scanned body println!("4. Generating hybrid_header_body fixture..."); let hybrid_dir = fixtures_dir.join("hybrid_header_body"); fs::create_dir_all(&hybrid_dir)?; generate_hybrid_pdf(&hybrid_dir)?; println!("\n=========================================="); println!("Page Classification Fixtures Generated"); println!("=========================================="); // Print sizes for fixture_name in &[ "vector_pure", "scanned_single", "brokenvector_pdfa", "hybrid_header_body", ] { let fixture_dir = fixtures_dir.join(fixture_name); let pdf_path = fixture_dir.join("source.pdf"); if let Ok(metadata) = fs::metadata(&pdf_path) { let size_kb = metadata.len() as f64 / 1024.0; println!(" - {}/source.pdf: {:.2} KB", fixture_name, size_kb); } } Ok(()) } /// Generate a pure vector PDF (born-digital text) fn generate_vector_pure_pdf(dir: &Path) -> Result<(), Box> { use lopdf::{Dictionary, Document, Object, Stream}; let mut doc = Document::with_version("1.5"); // Create font let mut font_dict = Dictionary::new(); font_dict.set("Type", "Font"); font_dict.set("Subtype", "Type1"); font_dict.set("BaseFont", "Helvetica"); let font_id = doc.add_object(font_dict); // Resources let mut resources = Dictionary::new(); let mut font_resources = Dictionary::new(); font_resources.set("F1", font_id); resources.set("Font", font_resources); // Content stream: Multiple lines of text with high character count let content_text = r#" BT /F1 12 Tf 50 750 Td (This is a born-digital PDF with pure vector text.) Tj 0 -15 Td (It contains multiple text operators and high character validity.) Tj 0 -15 Td (The classification should detect this as a Vector page.) Tj 0 -15 Td (Lorem ipsum dolor sit amet, consectetur adipiscing elit.) Tj 0 -15 Td (Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.) Tj 0 -15 Td (Ut enim ad minim veniam, quis nostrud exercitation ullamco.) Tj 0 -15 Td (Duis aute irure dolor in reprehenderit in voluptate velit esse.) Tj 0 -15 Td (Excepteur sint occaecat cupidatat non proident sunt in culpa.) Tj ET "#; let content_bytes = content_text.as_bytes(); let mut content_dict = Dictionary::new(); content_dict.set("Length", content_bytes.len() as i32); let content_stream = Stream::new(content_dict, content_bytes.to_vec()); let content_id = doc.add_object(content_stream); // Page dictionary let page_dict = dictionary! { "Type" => "Page", "MediaBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()], "Contents" => content_id, "Resources" => resources, "CropBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()], }; let page_id = doc.add_object(page_dict); // Pages tree let pages_id = doc.add_object(dictionary! { "Type" => "Pages", "Count" => 1, "Kids" => vec![page_id.into()], }); // Update page with parent reference let mut page_obj = doc.get_object(page_id)?.as_dict().cloned()?; page_obj.set("Parent", pages_id); doc.objects.insert(page_id, Object::Dictionary(page_obj)); // Catalog let catalog_id = doc.add_object(dictionary! { "Type" => "Catalog", "Pages" => pages_id, }); doc.trailer.set("Root", catalog_id); // Save PDF let pdf_path = dir.join("source.pdf"); doc.save(&pdf_path)?; // Generate expected.json let expected = PageClassExpected { class: "Vector".to_string(), confidence_min: 0.90, hybrid_cells: None, }; let json_path = dir.join("expected.json"); fs::write(&json_path, serde_json::to_string_pretty(&expected)?)?; println!( " Created: {}/source.pdf ({:.2} KB)", dir.file_name().unwrap().to_string_lossy(), fs::metadata(&pdf_path)?.len() as f64 / 1024.0 ); Ok(()) } /// Generate an image-only scanned PDF fn generate_scanned_single_pdf(dir: &Path) -> Result<(), Box> { use lopdf::{Dictionary, Document, Object, Stream}; let mut doc = Document::with_version("1.5"); // Create a simple 1x1 pixel white image (minimal image object) let image_data = vec![0u8; 4]; // 1x1 white pixel in RGB let image_stream = Stream::new( dictionary! { "Type" => "XObject", "Subtype" => "Image", "Width" => 1, "Height" => 1, "BitsPerComponent" => 8, "ColorSpace" => "DeviceRGB", "Length" => image_data.len() as i32, }, image_data, ); let image_id = doc.add_object(image_stream); // Resources with image let mut resources = Dictionary::new(); let mut xobject = Dictionary::new(); xobject.set("Im1", image_id); resources.set("XObject", xobject); // Content stream: Draw image covering most of the page let content_text = r#" q 612 792 scale /Im1 Do Q "#; let content_bytes = content_text.as_bytes(); let mut content_dict = Dictionary::new(); content_dict.set("Length", content_bytes.len() as i32); let content_stream = Stream::new(content_dict, content_bytes.to_vec()); let content_id = doc.add_object(content_stream); // Page dictionary let page_dict = dictionary! { "Type" => "Page", "MediaBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()], "Contents" => content_id, "Resources" => resources, }; let page_id = doc.add_object(page_dict); // Pages tree let pages_id = doc.add_object(dictionary! { "Type" => "Pages", "Count" => 1, "Kids" => vec![page_id.into()], }); // Update page with parent reference let mut page_obj = doc.get_object(page_id)?.as_dict().cloned()?; page_obj.set("Parent", pages_id); doc.objects.insert(page_id, Object::Dictionary(page_obj)); // Catalog let catalog_id = doc.add_object(dictionary! { "Type" => "Catalog", "Pages" => pages_id, }); doc.trailer.set("Root", catalog_id); // Save PDF let pdf_path = dir.join("source.pdf"); doc.save(&pdf_path)?; // Generate expected.json let expected = PageClassExpected { class: "Scanned".to_string(), confidence_min: 0.90, hybrid_cells: None, }; let json_path = dir.join("expected.json"); fs::write(&json_path, serde_json::to_string_pretty(&expected)?)?; println!( " Created: {}/source.pdf ({:.2} KB)", dir.file_name().unwrap().to_string_lossy(), fs::metadata(&pdf_path)?.len() as f64 / 1024.0 ); Ok(()) } /// Generate a BrokenVector PDF (invisible text + image) fn generate_brokenvector_pdf(dir: &Path) -> Result<(), Box> { use lopdf::{Dictionary, Document, Object, Stream}; let mut doc = Document::with_version("1.5"); // Create font let mut font_dict = Dictionary::new(); font_dict.set("Type", "Font"); font_dict.set("Subtype", "Type1"); font_dict.set("BaseFont", "Helvetica"); let font_id = doc.add_object(font_dict); // Create a 1x1 white pixel image let image_data = vec![255u8; 4]; let image_stream = Stream::new( dictionary! { "Type" => "XObject", "Subtype" => "Image", "Width" => 1, "Height" => 1, "BitsPerComponent" => 8, "ColorSpace" => "DeviceRGB", "Length" => image_data.len() as i32, }, image_data, ); let image_id = doc.add_object(image_stream); // Resources let mut resources = Dictionary::new(); let mut font_resources = Dictionary::new(); font_resources.set("F1", font_id); resources.set("Font", font_resources); let mut xobject = Dictionary::new(); xobject.set("Im1", image_id); resources.set("XObject", xobject); // Content stream: Invisible text (Tr=3) + full-page image // The text is there but invisible, simulating a bad OCR overlay let content_text = r#" BT /F1 12 Tf 50 750 Td 3 Tr (This text is invisible Tr=3 overlay over scanned image.) Tj 0 -15 Td (It represents a broken vector PDF with bad OCR layer.) Tj 0 -15 Td (Classification should detect this as BrokenVector.) Tj ET q 612 792 scale /Im1 Do Q "#; let content_bytes = content_text.as_bytes(); let mut content_dict = Dictionary::new(); content_dict.set("Length", content_bytes.len() as i32); let content_stream = Stream::new(content_dict, content_bytes.to_vec()); let content_id = doc.add_object(content_stream); // Page dictionary let page_dict = dictionary! { "Type" => "Page", "MediaBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()], "Contents" => content_id, "Resources" => resources, }; let page_id = doc.add_object(page_dict); // Pages tree let pages_id = doc.add_object(dictionary! { "Type" => "Pages", "Count" => 1, "Kids" => vec![page_id.into()], }); // Update page with parent reference let mut page_obj = doc.get_object(page_id)?.as_dict().cloned()?; page_obj.set("Parent", pages_id); doc.objects.insert(page_id, Object::Dictionary(page_obj)); // Catalog let catalog_id = doc.add_object(dictionary! { "Type" => "Catalog", "Pages" => pages_id, }); doc.trailer.set("Root", catalog_id); // Save PDF let pdf_path = dir.join("source.pdf"); doc.save(&pdf_path)?; // Generate expected.json let expected = PageClassExpected { class: "BrokenVector".to_string(), confidence_min: 0.90, hybrid_cells: None, }; let json_path = dir.join("expected.json"); fs::write(&json_path, serde_json::to_string_pretty(&expected)?)?; println!( " Created: {}/source.pdf ({:.2} KB)", dir.file_name().unwrap().to_string_lossy(), fs::metadata(&pdf_path)?.len() as f64 / 1024.0 ); Ok(()) } /// Generate a Hybrid PDF (text header + scanned body) fn generate_hybrid_pdf(dir: &Path) -> Result<(), Box> { use lopdf::{Dictionary, Document, Object, Stream}; let mut doc = Document::with_version("1.5"); // Create font let mut font_dict = Dictionary::new(); font_dict.set("Type", "Font"); font_dict.set("Subtype", "Type1"); font_dict.set("BaseFont", "Helvetica"); let font_id = doc.add_object(font_dict); // Create a 1x1 white pixel image for the body let image_data = vec![255u8; 4]; let image_stream = Stream::new( dictionary! { "Type" => "XObject", "Subtype" => "Image", "Width" => 1, "Height" => 1, "BitsPerComponent" => 8, "ColorSpace" => "DeviceRGB", "Length" => image_data.len() as i32, }, image_data, ); let image_id = doc.add_object(image_stream); // Resources let mut resources = Dictionary::new(); let mut font_resources = Dictionary::new(); font_resources.set("F1", font_id); resources.set("Font", font_resources); let mut xobject = Dictionary::new(); xobject.set("Im1", image_id); resources.set("XObject", xobject); // Content stream: Text header (top 25%) + image body (bottom 75%) // Header: visible text in the top portion // Body: image covering the bottom portion let content_text = r#" BT /F1 14 Tf 50 750 Td (This is a HYBRID document with vector text header) Tj 0 -20 Td (The header contains selectable text) Tj 0 -20 Td (Below this header is a scanned image body) Tj ET q 0 0 612 560 re W n 612 792 scale /Im1 Do Q "#; let content_bytes = content_text.as_bytes(); let mut content_dict = Dictionary::new(); content_dict.set("Length", content_bytes.len() as i32); let content_stream = Stream::new(content_dict, content_bytes.to_vec()); let content_id = doc.add_object(content_stream); // Page dictionary let page_dict = dictionary! { "Type" => "Page", "MediaBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()], "Contents" => content_id, "Resources" => resources, }; let page_id = doc.add_object(page_dict); // Pages tree let pages_id = doc.add_object(dictionary! { "Type" => "Pages", "Count" => 1, "Kids" => vec![page_id.into()], }); // Update page with parent reference let mut page_obj = doc.get_object(page_id)?.as_dict().cloned()?; page_obj.set("Parent", pages_id); doc.objects.insert(page_id, Object::Dictionary(page_obj)); // Catalog let catalog_id = doc.add_object(dictionary! { "Type" => "Catalog", "Pages" => pages_id, }); doc.trailer.set("Root", catalog_id); // Save PDF let pdf_path = dir.join("source.pdf"); doc.save(&pdf_path)?; // Generate expected.json // For hybrid, we expect specific hybrid_cells (bottom rows of the 8x8 grid) // The image covers bottom 75% of page, which corresponds to rows 2-7 (6 rows = 48 cells) let hybrid_cells: Vec = (16..64).collect(); // rows 2-7 let expected = PageClassExpected { class: "Hybrid".to_string(), confidence_min: 0.15, hybrid_cells: Some(hybrid_cells), }; let json_path = dir.join("expected.json"); fs::write(&json_path, serde_json::to_string_pretty(&expected)?)?; println!( " Created: {}/source.pdf ({:.2} KB)", dir.file_name().unwrap().to_string_lossy(), fs::metadata(&pdf_path)?.len() as f64 / 1024.0 ); Ok(()) } /// Generate glyph shape database from font files. /// /// This function walks a directory of font files (TrueType/OpenType), /// rasterizes every mapped glyph at 32x32 via fontdue, computes pHash /// for each, and writes the result as build/glyph-shapes.json. /// /// # Arguments /// /// * `fonts_dir` - Path to directory containing .ttf/.otf font files /// * `output_path` - Path where glyph-shapes.json will be written /// /// # Output format /// /// JSON array of entries: /// ```json /// { /// "phash_hex": "0123456789abcdef", /// "char": "A", /// "source_font": "LiberationSans-Regular.ttf", /// "frequency_rank": 1 /// } /// ``` fn gen_shape_db(fonts_dir: &str, output_path: &str) -> Result<(), Box> { println!("=========================================="); println!("Generating Glyph Shape Database"); println!("=========================================="); let workspace_root = find_workspace_root(); let fonts_path = workspace_root.join(fonts_dir); let output_file = workspace_root.join(output_path); if !fonts_path.exists() { return Err(format!("Fonts directory not found: {}", fonts_path.display()).into()); } // Create output directory if let Some(parent) = output_file.parent() { fs::create_dir_all(parent)?; } // Load character frequency data let frequency_data = load_frequency_data(&workspace_root)?; // Find all font files let font_files = find_font_files(&fonts_path)?; println!("\nFound {} font files:", font_files.len()); for font_file in &font_files { println!(" - {}", font_file.file_name().unwrap().to_string_lossy()); } // Process each font and collect glyphs let mut all_glyphs: Vec = Vec::new(); let mut seen_hashes: HashMap<(u64, char), String> = HashMap::new(); let mut collisions: Vec<(String, String, u64)> = Vec::new(); for font_file in &font_files { println!( "\nProcessing: {}", font_file.file_name().unwrap().to_string_lossy() ); // Load the font let font_bytes = fs::read(font_file)?; let font = Font::from_bytes(font_bytes.as_slice(), fontdue::FontSettings::default()) .map_err(|e| format!("Failed to load font: {}", e))?; let font_name = font_file.file_name().unwrap().to_string_lossy().to_string(); let mut glyph_count = 0; // Rasterize glyphs for all Unicode codepoints // We'll iterate over common Unicode ranges for codepoint in 0..0x10000 { let ch = match std::char::from_u32(codepoint) { Some(c) if !c.is_control() && c != '\u{FFFD}' => c, _ => continue, }; // Skip characters that are unlikely to be in fonts if should_skip_char(ch) { continue; } // Check if the font has this glyph if !has_glyph(&font, ch) { continue; } // Rasterize at 32px (scales to 32x32 bitmap) let (metrics, bitmap) = font.rasterize(ch, 32.0); // Skip empty glyphs (zero width/height) if bitmap.is_empty() || metrics.width == 0 || metrics.height == 0 { continue; } // Convert to centered 32x32 bitmap let centered = center_bitmap_32x32(&bitmap, metrics.width, metrics.height); // Compute pHash using pdftract-core's phash_glyph let phash = compute_phash(¢ered); // Get frequency rank let freq_rank = frequency_data.get(&ch).copied().unwrap_or(0); // Check for collisions let key = (phash, ch); if let Some(_other_font) = seen_hashes.get(&key) { // Same (phash, char) pair from different font - keep first continue; } // Check for cross-character collisions (same hash, different char) let mut collision_replacement = None; let mut skip_new = false; // Collect collision info first (without modifying seen_hashes) for (&(existing_hash, existing_ch), other_font_name) in seen_hashes.iter() { if existing_hash == phash && existing_ch != ch { // Different chars with same hash - keep higher frequency let freq_existing = frequency_data.get(&existing_ch).copied().unwrap_or(0); let freq_new = freq_rank; if freq_new > freq_existing { // New char has higher frequency, replace old collision_replacement = Some((existing_hash, existing_ch, other_font_name.clone())); } else { // Keep old, skip new skip_new = true; collisions.push((font_name.clone(), other_font_name.clone(), phash)); } } } // Handle collision replacement if needed if let Some((existing_hash, existing_ch, _)) = collision_replacement { all_glyphs.retain(|g| !(g.phash == existing_hash && g.ch == existing_ch)); seen_hashes.remove(&(existing_hash, existing_ch)); } if skip_new { continue; } seen_hashes.insert(key, font_name.clone()); all_glyphs.push(GlyphEntry { phash_hex: format!("{:016x}", phash), phash, ch, source_font: font_name.clone(), frequency_rank: freq_rank, }); glyph_count += 1; } println!(" Rasterized {} glyphs", glyph_count); } // Sort by pHash ascending all_glyphs.sort_by(|a, b| a.phash_hex.cmp(&b.phash_hex)); // Write output let json_output = serde_json::to_string_pretty(&all_glyphs)?; fs::write(&output_file, json_output)?; println!("\n=========================================="); println!("Shape Database Generation Complete"); println!("=========================================="); println!("\nOutput: {}", output_file.display()); println!("Total glyphs: {}", all_glyphs.len()); if !collisions.is_empty() { println!("Hash collisions: {}", collisions.len()); for (font1, font2, hash) in collisions.iter().take(10) { println!(" - {} vs {} (hash: {:016x})", font1, font2, hash); } } Ok(()) } /// Entry in the glyph shape database. #[derive(Debug, Serialize, Deserialize)] struct GlyphEntry { /// Perceptual hash as hexadecimal string phash_hex: String, /// Perceptual hash as u64 for comparison #[serde(skip)] phash: u64, /// Unicode character (escaped if needed) #[serde(rename = "char")] ch: char, /// Source font filename source_font: String, /// Unicode frequency rank (higher = more common) frequency_rank: u32, } /// Check if a font has a glyph for the given character. fn has_glyph(font: &Font, ch: char) -> bool { // fontdue provides indices for characters // If the character maps to a valid glyph index, the font has it let index = font.lookup_glyph_index(ch); index != 0 } /// Skip characters that are unlikely to be in fonts or are control characters. fn should_skip_char(ch: char) -> bool { // Skip control characters, private use, surrogates if ch.is_control() { return true; } let cp = ch as u32; // Private Use Areas if (0xE000..=0xF8FF).contains(&cp) || (0xF0000..=0xFFFFD).contains(&cp) || (0x100000..=0x10FFFD).contains(&cp) { return true; } // Surrogates if (0xD800..=0xDFFF).contains(&cp) { return true; } // Very high Unicode planes are unlikely to be in fonts if cp > 0x2FFFF { return true; } false } /// Center a glyph bitmap into a 32x32 canvas. /// /// The input bitmap is centered both horizontally and vertically, /// with zero padding. fn center_bitmap_32x32(bitmap: &[u8], width: usize, height: usize) -> [u8; 1024] { let mut centered = [0u8; 1024]; if width == 0 || height == 0 || bitmap.is_empty() { return centered; } // Calculate offsets to center the bitmap let x_offset = (32 - width) / 2; let y_offset = (32 - height) / 2; // Copy bitmap into centered position for y in 0..height.min(32) { for x in 0..width.min(32) { let src_idx = y * width + x; if src_idx < bitmap.len() { let dst_y = y_offset + y; let dst_x = x_offset + x; if dst_y < 32 && dst_x < 32 { let dst_idx = dst_y * 32 + dst_x; centered[dst_idx] = bitmap[src_idx]; } } } } centered } /// Compute pHash for a 32x32 grayscale bitmap. /// /// This is a wrapper around pdftract-core's phash_glyph function. fn compute_phash(bitmap: &[u8; 1024]) -> u64 { // For now, we'll compute a simple hash // In the future, we'd use pdftract-core::font::shape::phash_glyph // but that's not accessible from xtask due to dependency direction // Simple DCT-based pHash implementation // TODO: Integrate with pdftract-core's phash_glyph once accessible simple_phash(bitmap) } /// Simple pHash implementation for xtask. /// /// This is a fallback until we can properly integrate with pdftract-core's phash. fn simple_phash(bitmap: &[u8; 1024]) -> u64 { // Convert to centered floats let mut input = [0.0f32; 1024]; for i in 0..1024 { input[i] = (bitmap[i] as f32) / 127.5 - 1.0; } // Apply 2D DCT let mut dct_output = [0.0f32; 1024]; simple_dct_2d(&input, &mut dct_output); // Extract 8x8 low-frequency coefficients let mut low_freq = [0.0f32; 64]; let mut idx = 0; for y in 0..8 { for x in 0..8 { if x == 0 && y == 0 { low_freq[idx] = dct_output[8].abs(); // Skip DC, use [0,8] } else { low_freq[idx] = dct_output[y * 32 + x].abs(); } idx += 1; } } // Compute median let mut sorted = low_freq; sorted.sort_by(|a, b| a.partial_cmp(b).unwrap()); let median = (sorted[31] + sorted[32]) / 2.0; // Threshold to 64-bit hash let mut hash: u64 = 0; for (i, &val) in low_freq.iter().enumerate() { if val > median { hash |= 1 << i; } } hash } /// Simple 2D DCT-II implementation. fn simple_dct_2d(input: &[f32; 1024], output: &mut [f32; 1024]) { let mut temp = [0.0f32; 1024]; // Precompute cosine basis let mut basis = [[0.0f32; 32]; 32]; for (k, row) in basis.iter_mut().enumerate() { for (n, val) in row.iter_mut().enumerate() { *val = (std::f32::consts::PI * k as f32 * (2 * n + 1) as f32 / 64.0).cos(); } } // Row-wise DCT for y in 0..32 { for k in 0..32 { let mut sum = 0.0f32; for n in 0..32 { sum += input[y * 32 + n] * basis[k][n]; } let scale: f32 = if k == 0 { (1.0_f32 / 32.0_f32).sqrt() } else { (2.0_f32 / 32.0_f32).sqrt() }; temp[y * 32 + k] = sum * scale; } } // Column-wise DCT for x in 0..32 { for k in 0..32 { let mut sum = 0.0f32; for n in 0..32 { sum += temp[n * 32 + x] * basis[k][n]; } let scale: f32 = if k == 0 { (1.0_f32 / 32.0_f32).sqrt() } else { (2.0_f32 / 32.0_f32).sqrt() }; output[k * 32 + x] = sum * scale; } } } /// Load character frequency data. /// /// Returns a map from character to frequency rank (higher = more common). fn load_frequency_data( workspace_root: &Path, ) -> Result, Box> { let frequency_path = workspace_root.join("build").join("frequency.json"); // If frequency file doesn't exist, return empty map if !frequency_path.exists() { println!( "Warning: frequency.json not found at {}", frequency_path.display() ); println!("Using zero frequency rank for all characters."); return Ok(HashMap::new()); } let content = fs::read_to_string(&frequency_path)?; let data: serde_json::Value = serde_json::from_str(&content)?; let mut frequency = HashMap::new(); // Parse frequency data // Expected format: {"A": 1, "B": 2, ...} or array of objects if let Some(obj) = data.as_object() { for (key, value) in obj { if let Some(rank) = value.as_u64() { if let Some(ch) = key.chars().next() { frequency.insert(ch, rank as u32); } } } } println!("Loaded frequency data for {} characters", frequency.len()); Ok(frequency) } /// Find all font files in a directory. fn find_font_files(dir: &Path) -> Result, Box> { let mut font_files = Vec::new(); for entry in fs::read_dir(dir)? { let entry = entry?; let path = entry.path(); if path.is_dir() { // Recursively search subdirectories font_files.extend(find_font_files(&path)?); } else { let ext = path.extension().and_then(|s| s.to_str()); if ext == Some("ttf") || ext == Some("otf") { font_files.push(path); } } } font_files.sort(); Ok(font_files) } /// Expected page classification for a fixture #[derive(Debug, Serialize)] struct PageClassExpected { /// Expected class name (Vector, Scanned, Hybrid, BrokenVector) class: String, /// Minimum confidence threshold (actual confidence may vary slightly) confidence_min: f32, /// For Hybrid pages: expected scanned cell indexes hybrid_cells: Option>, }