// pdftract/xtask/src/main.rs

use fontdue::Font;
use serde::{Deserialize, Serialize};
use std::collections::{BTreeMap, HashMap};
use std::fs;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::time::{Duration, Instant};

/// Builds a `Dictionary` from a list of `"key" => value` pairs.
///
/// Expands to `Dictionary::new()` followed by one `set(key, value)` call per
/// pair, in the order the pairs are written. A trailing comma is accepted.
/// `Dictionary` is resolved where the macro is invoked, so that type must be
/// in scope at the call site.
macro_rules! dictionary {
    ($( $name:literal => $entry:expr ),* $(,)?) => {{
        let mut built = Dictionary::new();
        $( built.set($name, $entry); )*
        built
    }};
}

/// Locates the Cargo workspace root for the current invocation.
///
/// The search starts at the current working directory (or at its parent when
/// the working directory itself is named `xtask`) and walks towards the
/// filesystem root. The first directory whose `Cargo.toml` contains the text
/// `[workspace]` is returned; a missing or unreadable manifest simply does not
/// match. When no ancestor qualifies, the current working directory is
/// returned instead.
///
/// # Panics
///
/// Panics if the current working directory cannot be determined.
fn find_workspace_root() -> PathBuf {
    let cwd = std::env::current_dir().unwrap();

    // When launched from inside `xtask/`, begin the search one level up.
    let start = if cwd.ends_with("xtask") {
        cwd.parent().unwrap()
    } else {
        cwd.as_path()
    };

    // A directory qualifies when its manifest can be read and mentions a workspace table.
    let declares_workspace = |dir: &Path| {
        fs::read_to_string(dir.join("Cargo.toml"))
            .map(|manifest| manifest.contains("[workspace]"))
            .unwrap_or(false)
    };

    match start.ancestors().find(|&dir| declares_workspace(dir)) {
        Some(root) => root.to_path_buf(),
        // Nothing found all the way up: fall back to the working directory.
        None => std::env::current_dir().unwrap(),
    }
}

/// The parts of a profile's `profile.yaml` that the README generator reads.
#[derive(Debug, Deserialize)]
struct Profile {
    /// Free-form description; written directly under the README title.
    description: String,
    /// Field extractors keyed by field name; each entry becomes one row of the
    /// "Extracted Fields" table. Empty when the key is absent.
    #[serde(default)]
    profile_fields: BTreeMap<String, ProfileField>,
    /// Match criteria (a raw identifier because `match` is a keyword).
    /// Defaults to no clauses when the key is absent.
    #[serde(default)]
    r#match: MatchConfig,
}

/// One entry of `Profile::profile_fields`.
#[derive(Debug, Deserialize)]
struct ProfileField {
    /// Declared type name, read from the YAML key `type`. The README generator
    /// uses it to pick the placeholder shown in the example-value column.
    #[serde(rename = "type")]
    field_type: String,
    /// Settings describing where the field comes from; all-default when absent.
    #[serde(default)]
    extraction: ExtractionConfig,
}

/// Extraction settings for a single profile field.
///
/// Every key is optional in the YAML. The README generator turns the ones that
/// are present into the field's "Source Hint" column.
#[derive(Debug, Deserialize, Default)]
struct ExtractionConfig {
    /// Pattern strings; when non-empty the source hint mentions "regex patterns".
    #[serde(default)]
    patterns: Vec<String>,
    /// Shown in the source hint as `region: <value>`.
    #[serde(default)]
    region_hint: Option<String>,
    /// Shown in the source hint as `table: <value>`.
    #[serde(default)]
    table_region: Option<String>,
    /// Shown in the source hint as `columns: <value>`.
    #[serde(default)]
    columnar_regions: Option<String>,
    /// When `Some(true)` the source hint includes "per-page".
    #[serde(default)]
    per_page: Option<bool>,
    /// Accepted as arbitrary YAML during deserialization; not read by the
    /// README generator, hence the `dead_code` allowance.
    #[serde(default)]
    #[allow(dead_code)]
    fallback: serde_yaml::Value,
}

/// The `match` section of a profile.
#[derive(Debug, Deserialize, Default)]
struct MatchConfig {
    /// Clauses listed under `any`; the README generator pools the signals of
    /// all of them. Empty when the key is absent.
    #[serde(default)]
    any: Vec<MatchClause>,
}

/// One clause of `MatchConfig::any`.
#[derive(Debug, Deserialize, Default)]
struct MatchClause {
    /// Text patterns; listed (de-duplicated) in the README's match summary.
    #[serde(default)]
    text_patterns: Vec<String>,
    /// Structural signals kept as raw YAML values; the README generator
    /// renders each one with its `Debug` representation.
    #[serde(default)]
    structural: Vec<serde_yaml::Value>,
}

/// Entry point: dispatches the first CLI argument to the matching xtask command.
///
/// Prints usage to stderr and exits with status 1 when no command is given,
/// when a command is missing a required argument, or when the command is not
/// recognised. Errors returned by a command are propagated to the caller,
/// except in `doc-profiles`, where a failure for one profile is reported on
/// stderr and the remaining profiles are still processed.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let args: Vec<String> = std::env::args().collect();

    if args.len() < 2 {
        let usage = [
            "Usage: xtask <command>",
            "Commands:",
            "  doc-profile <profile-name>      Generate README skeleton for a profile",
            "  doc-profiles                     Generate README skeletons for all profiles",
            "  generate-stress-pdfs            Generate stress-test PDFs for memory ceiling testing",
            "  generate-page-class-fixtures    Generate page classification test fixtures",
            "  gen-schema                      Generate JSON Schema from Rust output types",
            "  gen-shape-db                    Generate glyph shape database from font files",
            "  memory-ceiling                  Run memory ceiling tests against perf/malformed corpora",
        ];
        for line in usage.iter() {
            eprintln!("{}", line);
        }
        std::process::exit(1);
    }

    match args[1].as_str() {
        "doc-profile" => match args.get(2) {
            Some(profile_name) => {
                generate_profile_readme(profile_name)?;
            }
            None => {
                eprintln!("Usage: xtask doc-profile <profile-name>");
                std::process::exit(1);
            }
        },
        "doc-profiles" => {
            let profiles_dir = find_workspace_root().join("profiles/builtin");
            for entry in fs::read_dir(&profiles_dir)? {
                let candidate = entry?;
                if !candidate.path().is_dir() {
                    continue;
                }
                let profile_name = candidate.file_name().to_string_lossy().to_string();
                // Best effort: report the failure and carry on with the next profile.
                if let Err(e) = generate_profile_readme(&profile_name) {
                    eprintln!("Error generating README for {}: {}", profile_name, e);
                }
            }
        }
        "generate-stress-pdfs" => {
            generate_stress_pdfs()?;
        }
        "generate-page-class-fixtures" => {
            generate_page_class_fixtures()?;
        }
        "gen-schema" => {
            gen_schema()?;
        }
        "memory-ceiling" => {
            run_memory_ceiling_tests()?;
        }
        "gen-shape-db" => {
            // The fonts directory is mandatory; the output path has a default.
            let fonts_dir = match args.get(2) {
                Some(dir) => dir.clone(),
                None => {
                    eprintln!("Usage: xtask gen-shape-db <fonts-dir>");
                    std::process::exit(1);
                }
            };
            let output_path = args
                .get(3)
                .cloned()
                .unwrap_or_else(|| "build/glyph-shapes.json".to_string());
            gen_shape_db(&fonts_dir, &output_path)?;
        }
        unknown => {
            eprintln!("Unknown command: {}", unknown);
            std::process::exit(1);
        }
    }

    Ok(())
}

/// Generate JSON Schema from Rust output types.
///
/// Delegates to the gen_schema binary.
fn gen_schema() -> Result<(), Box<dyn std::error::Error>> {
    // Invoke the gen_schema binary
    let status = std::process::Command::new("cargo")
        .args(["run", "--bin", "gen_schema"])
        .current_dir(find_workspace_root())
        .status()?;

    if !status.success() {
        return Err(format!("gen_schema failed with exit code: {:?}", status.code()).into());
    }

    Ok(())
}

fn generate_profile_readme(profile_name: &str) -> Result<(), Box<dyn std::error::Error>> {
    // Find the workspace root by looking for the parent directory's Cargo.toml
    let workspace_root = find_workspace_root();
    let profile_path = workspace_root
        .join("profiles/builtin")
        .join(profile_name)
        .join("profile.yaml");
    let readme_path = workspace_root
        .join("profiles/builtin")
        .join(profile_name)
        .join("README.md");

    if !profile_path.exists() {
        return Err(format!("Profile YAML not found: {}", profile_path.display()).into());
    }

    let yaml_content = fs::read_to_string(&profile_path)?;
    let profile: Profile = serde_yaml::from_str(&yaml_content)?;

    let mut readme = String::new();

    // Title and description
    readme.push_str(&format!("# {} Profile\n\n", profile_name.to_uppercase()));
    readme.push_str(&format!("{}\n\n", profile.description));

    // Match Criteria Summary (placeholder for human to fill)
    readme.push_str("## Match Criteria Summary\n\n");
    readme.push_str("*This section describes the characteristics that cause a document to match this profile. The following signals are considered:*\n\n");

    // Collect all text patterns and structural signals from any clause
    let mut all_patterns: Vec<&String> = Vec::new();
    let mut all_structural: Vec<String> = Vec::new();

    for clause in &profile.r#match.any {
        for pattern in &clause.text_patterns {
            if !all_patterns.contains(&pattern) {
                all_patterns.push(pattern);
            }
        }
        for signal in &clause.structural {
            let signal_str = format!("{:?}", signal);
            if !all_structural.iter().any(|s| s == &signal_str) {
                all_structural.push(signal_str);
            }
        }
    }

    // Show first few patterns as examples
    if !all_patterns.is_empty() {
        let show_count = all_patterns.len().min(3);
        readme.push_str("- **Text patterns**: ");
        for (i, pattern) in all_patterns.iter().take(show_count).enumerate() {
            if i > 0 {
                readme.push_str(", ");
            }
            readme.push_str(&format!("`{}`", pattern));
        }
        if all_patterns.len() > show_count {
            readme.push_str(&format!(" ({} more)", all_patterns.len() - show_count));
        }
        readme.push('\n');
    }

    if !all_structural.is_empty() {
        let show_count = all_structural.len().min(3);
        readme.push_str("- **Structural signals**: ");
        for (i, signal) in all_structural.iter().take(show_count).enumerate() {
            if i > 0 {
                readme.push_str(", ");
            }
            readme.push_str(&format!("`{}`", signal));
        }
        if all_structural.len() > show_count {
            readme.push_str(&format!(" ({} more)", all_structural.len() - show_count));
        }
        readme.push('\n');
    }

    readme.push_str(
        "\n*Additional heuristics and confidence scoring are applied during classification.*\n\n",
    );

    // Extracted Fields
    readme.push_str("## Extracted Fields\n\n");
    readme.push_str("| Field | Type | Description | Example Value | Source Hint |\n");
    readme.push_str("|-------|------|-------------|----------------|-------------|\n");

    for (field_name, field) in &profile.profile_fields {
        let description = "Extracted from page text using pattern matching".to_string();
        let example = match field.field_type.as_str() {
            "string" => "\"example value\"",
            "decimal" => "123.45",
            "date" => "2024-01-15",
            "int" => "42",
            "array" => "[...]",
            _ => "N/A",
        };
        let mut source_parts = Vec::new();
        if !field.extraction.patterns.is_empty() {
            source_parts.push("regex patterns".to_string());
        }
        if let Some(ref hint) = field.extraction.region_hint {
            source_parts.push(format!("region: {}", hint));
        }
        if let Some(ref table) = field.extraction.table_region {
            source_parts.push(format!("table: {}", table));
        }
        if let Some(ref cols) = field.extraction.columnar_regions {
            source_parts.push(format!("columns: {}", cols));
        }
        if field.extraction.per_page.unwrap_or(false) {
            source_parts.push("per-page".to_string());
        }
        let source = if source_parts.is_empty() {
            "profile YAML".to_string()
        } else {
            source_parts.join(", ")
        };
        readme.push_str(&format!(
            "| {} | {} | {} | {} | {} |\n",
            field_name, field.field_type, description, example, source
        ));
    }

    if profile.profile_fields.is_empty() {
        readme.push_str("| *(none)* | - | *This profile has no field extractors* | - | - |\n");
    }

    readme.push('\n');

    // Known Limitations
    readme.push_str("## Known Limitations\n\n");
    readme.push_str("*This section documents known edge cases and failure modes. Contributions to improve extraction quality are welcome.*\n\n");
    readme.push_str("- *Document limitations and edge cases to be added by profile author*\n\n");

    // Sample Input Pointer
    readme.push_str("## Sample Input\n\n");
    readme.push_str(&format!("Example fixtures demonstrating this profile are available in `tests/fixtures/profiles/{}/`.\n\n", profile_name));
    readme.push_str("*See the classifier corpus for representative documents.*\n\n");

    // Configuration Tips
    readme.push_str("## Configuration Tips\n\n");
    readme.push_str("To override this profile:\n\n");
    readme.push_str("```bash\n");
    readme.push_str(&format!(
        "pdftract profiles export {} > my-profile.yaml\n",
        profile_name
    ));
    readme.push_str(
        "# Edit my-profile.yaml to customize match criteria, fields, or extraction patterns\n",
    );
    readme.push_str("pdftract extract --profile my-profile.yaml document.pdf\n");
    readme.push_str("```\n\n");

    // Footer
    readme.push_str("---\n\n*This README was auto-generated from `profile.yaml`. Update the Match Criteria Summary and Known Limitations sections with profile-specific guidance.*\n");

    fs::write(&readme_path, readme)?;
    println!(
        "Generated README for {} at {}",
        profile_name,
        readme_path.display()
    );

    Ok(())
}

/// Generates the stress-test PDFs used by the memory ceiling tests.
///
/// Two fixtures are written to `tests/fixtures/perf` under the workspace root
/// (the directory is created if needed):
/// - `100-page-vector.pdf` (100 pages), for the buffered-mode budget
/// - `10k-page.pdf` (10,000 pages), for the streaming-mode budget
///
/// Progress and the size of each generated file are printed to stdout.
///
/// # Errors
///
/// Returns an error if the directory cannot be created, a PDF cannot be
/// generated, or a generated file's metadata cannot be read.
fn generate_stress_pdfs() -> Result<(), Box<dyn std::error::Error>> {
    const BANNER: &str = "==========================================";

    // (page count, file name, purpose)
    let fixtures: [(usize, &str, &str); 2] = [
        (
            100,
            "100-page-vector.pdf",
            "Buffered mode stress test (512 MB budget)",
        ),
        (
            10000,
            "10k-page.pdf",
            "Streaming mode stress test (256 MB budget)",
        ),
    ];

    println!("{}", BANNER);
    println!("Generating Stress-Test PDFs");
    println!("{}", BANNER);

    let perf_dir = find_workspace_root().join("tests/fixtures/perf");
    fs::create_dir_all(&perf_dir)?;

    for &(num_pages, filename, description) in fixtures.iter() {
        println!("\nGenerating: {} ({} pages)", filename, num_pages);
        println!("  Purpose: {}", description);

        generate_stress_pdf(&perf_dir.join(filename), num_pages)?;
    }

    println!("\n{}", BANNER);
    println!("Stress-Test PDF Generation Complete");
    println!("{}", BANNER);
    println!("\nGenerated files:");
    for &(_, filename, _) in fixtures.iter() {
        let path = perf_dir.join(filename);
        if !path.exists() {
            continue;
        }
        let size_mb = fs::metadata(&path)?.len() as f64 / 1024.0 / 1024.0;
        println!("  - {} ({:.2} MB)", filename, size_mb);
    }

    Ok(())
}

/// Writes a PDF with `num_pages` pages to `output_path` using lopdf.
///
/// Every page is US-Letter sized (612 x 792 points), carries a single line of
/// Helvetica text reading "Page N of M", and has its own content stream and a
/// copy of the shared resources dictionary. After saving, the file name and
/// size are printed to stdout.
///
/// # Errors
///
/// Returns an error if a page object cannot be looked up, the document cannot
/// be saved, or the saved file's metadata cannot be read.
///
/// # Panics
///
/// Panics if `output_path` has no final file-name component.
fn generate_stress_pdf(
    output_path: &Path,
    num_pages: usize,
) -> Result<(), Box<dyn std::error::Error>> {
    use lopdf::{Dictionary, Document, Object, Stream};

    let mut doc = Document::with_version("1.5");

    // A single Helvetica font object, referenced from the shared resources.
    let mut helvetica = Dictionary::new();
    helvetica.set("Type", "Font");
    helvetica.set("Subtype", "Type1");
    helvetica.set("BaseFont", "Helvetica");
    let font_id = doc.add_object(helvetica);

    let mut fonts = Dictionary::new();
    fonts.set("F1", font_id);
    let mut resources = Dictionary::new();
    resources.set("Font", fonts);

    let mediabox = Object::Array(vec![
        Object::Real(0.0),
        Object::Real(0.0),
        Object::Real(612.0),
        Object::Real(792.0),
    ]);

    // For each page: add the content stream, then the page dictionary that
    // points at it. The page's Parent entry is filled in further below, once
    // the Pages node has been added.
    let page_ids: Vec<_> = (1..=num_pages)
        .map(|page_num| {
            let content = format!(
                "BT /F1 12 Tf 72 720 Td (Page {} of {}) Tj ET",
                page_num, num_pages
            )
            .into_bytes();

            let mut stream_dict = Dictionary::new();
            stream_dict.set("Length", content.len() as i32);
            let content_id = doc.add_object(Stream::new(stream_dict, content));

            let mut page = Dictionary::new();
            page.set("Type", "Page");
            page.set("MediaBox", mediabox.clone());
            page.set("Contents", content_id);
            page.set("Resources", resources.clone());
            doc.add_object(page)
        })
        .collect();

    // Root of the page tree, listing every page as a direct kid.
    let mut pages = Dictionary::new();
    pages.set("Type", "Pages");
    pages.set("Count", Object::Integer(num_pages as i64));
    pages.set(
        "Kids",
        Object::Array(page_ids.iter().copied().map(Object::Reference).collect()),
    );
    let pages_id = doc.add_object(pages);

    // Now that the Pages node has an id, point every page back at it by
    // replacing the stored page dictionary with a copy that has Parent set.
    for &page_id in &page_ids {
        let reparented = match doc.get_object(page_id)?.as_dict() {
            Ok(page) => {
                let mut page = page.clone();
                page.set("Parent", pages_id);
                page
            }
            Err(_) => continue,
        };
        doc.objects.insert(page_id, Object::Dictionary(reparented));
    }

    // Catalog referencing the page tree, registered as the trailer's Root.
    let mut catalog = Dictionary::new();
    catalog.set("Type", "Catalog");
    catalog.set("Pages", pages_id);
    let catalog_id = doc.add_object(catalog);
    doc.trailer.set("Root", catalog_id);

    doc.save(output_path)?;

    let size_mb = fs::metadata(output_path)?.len() as f64 / 1024.0 / 1024.0;
    println!(
        "  Generated: {} ({:.2} MB)",
        output_path.file_name().unwrap().to_string_lossy(),
        size_mb
    );

    Ok(())
}

/// Peak-RSS ceilings, in megabytes, for each category of memory test.
#[derive(Debug, Clone)]
struct MemoryBudget {
    /// Ceiling applied to buffered-mode runs over the perf corpus.
    pub buffered_100_page: usize,
    /// Ceiling applied to streaming-mode runs over the perf corpus.
    pub streaming_any: usize,
    /// Hard cap applied to the malformed (adversarial) corpus.
    pub adversarial_hard_cap: usize,
}

impl Default for MemoryBudget {
    /// Standard ceilings: 512 MB buffered, 256 MB streaming, 1024 MB (1 GB)
    /// for adversarial inputs.
    fn default() -> Self {
        let (buffered_100_page, streaming_any, adversarial_hard_cap) = (512, 256, 1024);
        MemoryBudget {
            buffered_100_page,
            streaming_any,
            adversarial_hard_cap,
        }
    }
}

/// Outcome of measuring a single extraction run, as returned by
/// `measure_extraction`.
#[derive(Debug, Serialize)]
struct MemoryMeasurement {
    /// Peak resident set size in megabytes; this is the value compared against
    /// the category budget.
    pub peak_rss_mb: usize,
    /// Duration of the run in milliseconds.
    pub duration_ms: u128,
    /// NOTE(review): presumably whether the extraction process exited
    /// successfully — it is set outside the code visible here; confirm.
    pub succeeded: bool,
    /// Optional error text; copied into the corresponding `MemoryTestResult`.
    pub error_message: Option<String>,
}

/// Result of checking one fixture against its memory budget; one entry of the
/// report's `results` list.
#[derive(Debug, Clone, Serialize)]
struct MemoryTestResult {
    /// File name of the fixture (no directory part).
    pub file_name: String,
    pub category: String, // "buffered", "streaming", "adversarial"
    /// Measured peak RSS in megabytes; 0 when the measurement itself failed.
    pub peak_rss_mb: usize,
    /// Duration of the run in milliseconds; 0 when the measurement itself failed.
    pub duration_ms: u128,
    /// Budget, in megabytes, that `peak_rss_mb` was compared against.
    pub budget_mb: usize,
    /// `true` when `peak_rss_mb` did not exceed `budget_mb`; always `false`
    /// when the measurement itself failed.
    pub passed: bool,
    /// Error text from the measurement, if any.
    pub error_message: Option<String>,
}

/// Top-level document serialized to `memory-report.json` in the workspace root
/// by `run_memory_ceiling_tests`.
#[derive(Debug, Serialize)]
struct MemoryReport {
    /// Time the report was built, formatted as an RFC 3339 timestamp with
    /// second precision.
    pub timestamp: String,
    /// Output of `git rev-parse HEAD`, or `None` when git reported failure.
    pub commit_sha: Option<String>,
    /// The budgets the results were checked against.
    pub budgets: MemoryBudgetJson,
    /// One entry per fixture and category that was run.
    pub results: Vec<MemoryTestResult>,
    /// Aggregate counts over `results`.
    pub summary: MemorySummary,
}

/// Serializable copy of `MemoryBudget` whose field names carry an explicit
/// `_mb` unit suffix for the JSON report.
#[derive(Debug, Serialize)]
struct MemoryBudgetJson {
    /// Mirrors `MemoryBudget::buffered_100_page`.
    pub buffered_100_page_mb: usize,
    /// Mirrors `MemoryBudget::streaming_any`.
    pub streaming_any_mb: usize,
    /// Mirrors `MemoryBudget::adversarial_hard_cap`.
    pub adversarial_hard_cap_mb: usize,
}

/// Aggregate counts stored in the report's `summary` section.
#[derive(Debug, Serialize)]
struct MemorySummary {
    /// Number of results in the report.
    pub total_tests: usize,
    /// Number of results whose `passed` flag is `true`.
    pub passed: usize,
    /// `total_tests` minus `passed`.
    pub failed: usize,
    /// Whether the run as a whole stayed within budget.
    pub all_passed: bool,
}

/// Run memory ceiling tests against perf and malformed corpora
///
/// This enforces the Tier-1 Memory targets from the plan:
/// - Peak RSS, 100-page vector PDF (buffered mode) < 512 MB
/// - Peak RSS, streaming/NDJSON mode < 256 MB
/// - Peak RSS, adversarial fixtures < 1 GB hard ceiling
///
/// Analogous to cargo-bloat for memory usage: fails the build if any
/// document exceeds its budget.
///
/// Generates memory-report.json artifact for CI historical tracking.
fn run_memory_ceiling_tests() -> Result<(), Box<dyn std::error::Error>> {
    println!("==========================================");
    println!("Memory Ceiling Tests");
    println!("==========================================");

    let budgets = MemoryBudget::default();
    let workspace_root = find_workspace_root();
    let perf_dir = workspace_root.join("tests/fixtures/perf");
    let malformed_dir = workspace_root.join("tests/fixtures/malformed");

    println!("\nMemory budgets:");
    println!("  - Buffered 100-page: {} MB", budgets.buffered_100_page);
    println!("  - Streaming mode: {} MB", budgets.streaming_any);
    println!(
        "  - Adversarial hard cap: {} MB",
        budgets.adversarial_hard_cap
    );

    // Build pdftract binary first
    println!("\n=== Building pdftract for testing ===");
    let build_status = Command::new("cargo")
        .args(["build", "--release", "--bin", "pdftract", "--locked"])
        .current_dir(&workspace_root)
        .stdout(Stdio::inherit())
        .stderr(Stdio::inherit())
        .status()?;

    if !build_status.success() {
        return Err("Failed to build pdftract binary".into());
    }

    let binary_path = workspace_root.join("target/release/pdftract");
    if !binary_path.exists() {
        return Err(format!("pdftract binary not found at {}", binary_path.display()).into());
    }

    println!("Binary: {}", binary_path.display());

    let mut all_results = Vec::new();
    let mut all_passed = true;

    // Test 1: Perf corpus - buffered mode (512 MB budget)
    println!(
        "\n=== Testing perf corpus (buffered mode, budget: {} MB) ===",
        budgets.buffered_100_page
    );

    if perf_dir.exists() {
        for entry in fs::read_dir(&perf_dir)? {
            let entry = entry?;
            let path = entry.path();

            if path.extension().and_then(|s| s.to_str()) != Some("pdf") {
                continue;
            }

            let file_name = path.file_name().unwrap().to_string_lossy().to_string();
            print!("  [buffered] {} ... ", file_name);

            match measure_extraction(&binary_path, &path, &budgets, false) {
                Ok(measurement) => {
                    let passed = measurement.peak_rss_mb <= budgets.buffered_100_page;
                    if passed {
                        println!(
                            "PASS ({} MB, {} ms)",
                            measurement.peak_rss_mb, measurement.duration_ms
                        );
                    } else {
                        println!(
                            "FAIL ({} MB > {} MB)",
                            measurement.peak_rss_mb, budgets.buffered_100_page
                        );
                        all_passed = false;
                    }
                    all_results.push(MemoryTestResult {
                        file_name: file_name.clone(),
                        category: "buffered".to_string(),
                        peak_rss_mb: measurement.peak_rss_mb,
                        duration_ms: measurement.duration_ms,
                        budget_mb: budgets.buffered_100_page,
                        passed,
                        error_message: measurement.error_message,
                    });
                }
                Err(e) => {
                    println!("ERROR ({})", e);
                    all_passed = false;
                    all_results.push(MemoryTestResult {
                        file_name: file_name.clone(),
                        category: "buffered".to_string(),
                        peak_rss_mb: 0,
                        duration_ms: 0,
                        budget_mb: budgets.buffered_100_page,
                        passed: false,
                        error_message: Some(e.to_string()),
                    });
                }
            }
        }
    } else {
        println!("  (no perf directory)");
    }

    // Test 2: Perf corpus - streaming mode (256 MB budget)
    println!(
        "\n=== Testing perf corpus (streaming mode, budget: {} MB) ===",
        budgets.streaming_any
    );

    if perf_dir.exists() {
        for entry in fs::read_dir(&perf_dir)? {
            let entry = entry?;
            let path = entry.path();

            if path.extension().and_then(|s| s.to_str()) != Some("pdf") {
                continue;
            }

            let file_name = path.file_name().unwrap().to_string_lossy().to_string();
            print!("  [streaming] {} ... ", file_name);

            match measure_extraction(&binary_path, &path, &budgets, true) {
                Ok(measurement) => {
                    let passed = measurement.peak_rss_mb <= budgets.streaming_any;
                    if passed {
                        println!(
                            "PASS ({} MB, {} ms)",
                            measurement.peak_rss_mb, measurement.duration_ms
                        );
                    } else {
                        println!(
                            "FAIL ({} MB > {} MB)",
                            measurement.peak_rss_mb, budgets.streaming_any
                        );
                        all_passed = false;
                    }
                    all_results.push(MemoryTestResult {
                        file_name: file_name.clone(),
                        category: "streaming".to_string(),
                        peak_rss_mb: measurement.peak_rss_mb,
                        duration_ms: measurement.duration_ms,
                        budget_mb: budgets.streaming_any,
                        passed,
                        error_message: measurement.error_message,
                    });
                }
                Err(e) => {
                    println!("ERROR ({})", e);
                    all_passed = false;
                    all_results.push(MemoryTestResult {
                        file_name: file_name.clone(),
                        category: "streaming".to_string(),
                        peak_rss_mb: 0,
                        duration_ms: 0,
                        budget_mb: budgets.streaming_any,
                        passed: false,
                        error_message: Some(e.to_string()),
                    });
                }
            }
        }
    }

    // Test 3: Malformed corpus - adversarial hard cap (1 GB budget)
    println!(
        "\n=== Testing malformed corpus (adversarial hard cap: {} MB) ===",
        budgets.adversarial_hard_cap
    );

    if malformed_dir.exists() {
        for entry in fs::read_dir(&malformed_dir)? {
            let entry = entry?;
            let path = entry.path();

            if path.extension().and_then(|s| s.to_str()) != Some("pdf")
                && path.extension().and_then(|s| s.to_str()) != Some("bin")
            {
                continue;
            }

            let file_name = path.file_name().unwrap().to_string_lossy().to_string();
            print!("  [adversarial] {} ... ", file_name);

            match measure_extraction(&binary_path, &path, &budgets, false) {
                Ok(measurement) => {
                    let passed = measurement.peak_rss_mb <= budgets.adversarial_hard_cap;
                    if passed {
                        println!(
                            "PASS ({} MB, {} ms)",
                            measurement.peak_rss_mb, measurement.duration_ms
                        );
                    } else {
                        println!(
                            "FAIL ({} MB > {} MB)",
                            measurement.peak_rss_mb, budgets.adversarial_hard_cap
                        );
                        all_passed = false;
                    }
                    all_results.push(MemoryTestResult {
                        file_name: file_name.clone(),
                        category: "adversarial".to_string(),
                        peak_rss_mb: measurement.peak_rss_mb,
                        duration_ms: measurement.duration_ms,
                        budget_mb: budgets.adversarial_hard_cap,
                        passed,
                        error_message: measurement.error_message,
                    });
                }
                Err(e) => {
                    println!("ERROR ({})", e);
                    all_passed = false;
                    all_results.push(MemoryTestResult {
                        file_name: file_name.clone(),
                        category: "adversarial".to_string(),
                        peak_rss_mb: 0,
                        duration_ms: 0,
                        budget_mb: budgets.adversarial_hard_cap,
                        passed: false,
                        error_message: Some(e.to_string()),
                    });
                }
            }
        }
    } else {
        println!("  (no malformed directory)");
    }

    // Print summary
    println!("\n==========================================");
    println!("Memory Ceiling Summary");
    println!("==========================================");

    let passed_count = all_results.iter().filter(|r| r.passed).count();
    let total_count = all_results.len();

    println!("Passed: {}/{}", passed_count, total_count);

    if !all_passed {
        println!("\nFailed documents:");
        for result in &all_results {
            if !result.passed {
                if result.peak_rss_mb > 0 {
                    println!(
                        "  - [{}] {} ({} MB > {} MB)",
                        result.category, result.file_name, result.peak_rss_mb, result.budget_mb
                    );
                } else {
                    println!(
                        "  - [{}] {} (error: {})",
                        result.category,
                        result.file_name,
                        result.error_message.as_deref().unwrap_or("unknown")
                    );
                }
            }
        }
        println!("\nMemory ceiling gate FAILED!");
        return Err("Memory ceiling exceeded".into());
    }

    println!("\nMemory ceiling gate PASSED!");

    // Generate JSON report
    let report = MemoryReport {
        timestamp: format!(
            "{}",
            humantime::format_rfc3339_seconds(std::time::SystemTime::now())
        ),
        commit_sha: get_commit_sha()?,
        budgets: MemoryBudgetJson {
            buffered_100_page_mb: budgets.buffered_100_page,
            streaming_any_mb: budgets.streaming_any,
            adversarial_hard_cap_mb: budgets.adversarial_hard_cap,
        },
        results: all_results.clone(),
        summary: MemorySummary {
            total_tests: total_count,
            passed: passed_count,
            failed: total_count - passed_count,
            all_passed,
        },
    };

    let report_path = workspace_root.join("memory-report.json");
    fs::write(&report_path, serde_json::to_string_pretty(&report)?)?;
    println!("\nReport written to: {}", report_path.display());

    Ok(())
}

/// Returns the SHA of the current git `HEAD`, if git can report one.
///
/// Runs `git rev-parse HEAD` in the workspace root. Yields `Ok(None)` when git
/// exits unsuccessfully, otherwise the trimmed standard output.
///
/// # Errors
///
/// Returns an error if the `git` process cannot be started.
fn get_commit_sha() -> Result<Option<String>, Box<dyn std::error::Error>> {
    let rev_parse = Command::new("git")
        .arg("rev-parse")
        .arg("HEAD")
        .current_dir(find_workspace_root())
        .output()?;

    if !rev_parse.status.success() {
        return Ok(None);
    }

    let head = String::from_utf8_lossy(&rev_parse.stdout);
    Ok(Some(head.trim().to_string()))
}

/// Measure memory usage during extraction of a PDF file
///
/// On Linux the child's RSS is polled every 10 ms via `sample_rss` and the
/// largest observed value is reported. On other platforms only the duration
/// and exit status are measured and `peak_rss_mb` is reported as 0.
///
/// The child's stderr is drained concurrently while it runs. Reading it only
/// after exit would deadlock as soon as the child writes more than the OS pipe
/// buffer holds: the child blocks on the write and never exits.
///
/// # Arguments
/// * `binary_path` - Path to the pdftract binary
/// * `pdf_path` - Path to the PDF file to extract
/// * `budgets` - Memory budgets (unused but kept for compatibility)
/// * `streaming` - If true, use streaming/text mode for lower memory; otherwise buffered JSON mode
///
/// # Errors
/// Fails when the process cannot be spawned or waited on. A child that runs
/// but exits unsuccessfully is NOT an error: it is reported through
/// `succeeded == false` and `error_message`.
fn measure_extraction(
    binary_path: &Path,
    pdf_path: &Path,
    _budgets: &MemoryBudget,
    streaming: bool,
) -> Result<MemoryMeasurement, Box<dyn std::error::Error>> {
    // Prefer what the child printed on stderr; fall back to the exit code when
    // it printed nothing useful.
    fn failure_message(stderr_text: &str, exit_code: Option<i32>) -> String {
        let trimmed = stderr_text.trim();
        if trimmed.is_empty() {
            format!("exit code: {:?}", exit_code)
        } else {
            trimmed.to_string()
        }
    }

    let start = Instant::now();

    // Streaming mode uses --format text as a proxy for streaming memory
    // behavior (--format ndjson is not yet exposed in the CLI, Phase 6.2);
    // buffered mode uses --format json for full document buffering.
    let format = if streaming { "text" } else { "json" };

    let mut cmd = Command::new(binary_path);
    cmd.arg("extract")
        .arg("--format")
        .arg(format)
        .arg(pdf_path)
        .stdout(Stdio::null())
        .stderr(Stdio::piped());

    // Spawn the extraction process and measure its peak RSS
    #[cfg(target_os = "linux")]
    {
        use std::io::Read;
        use std::os::unix::process::CommandExt;

        cmd.process_group(0);

        let mut child = cmd.spawn()?;
        let pid = child.id();

        // Drain stderr on a helper thread so the pipe can never fill up and
        // stall the child while we are busy sampling.
        let stderr_reader = child.stderr.take().map(|mut stderr| {
            std::thread::spawn(move || {
                let mut raw = Vec::new();
                let _ = stderr.read_to_end(&mut raw);
                String::from_utf8_lossy(&raw).into_owned()
            })
        });

        // Sample RSS every 10ms while process runs
        let sample_interval = Duration::from_millis(10);
        let mut peak_rss_kb = 0usize;
        let status = loop {
            match child.try_wait() {
                Ok(Some(status)) => break status,
                Ok(None) => {
                    // Sampling is best-effort: the process may vanish between
                    // try_wait and the /proc read.
                    if let Ok(rss_kb) = sample_rss(pid) {
                        peak_rss_kb = peak_rss_kb.max(rss_kb);
                    }
                    std::thread::sleep(sample_interval);
                }
                Err(e) => {
                    return Err(format!("Failed to wait for process: {}", e).into());
                }
            }
        };
        let duration = start.elapsed();

        let stderr_output = stderr_reader
            .and_then(|reader| reader.join().ok())
            .unwrap_or_default();

        let error_message = if status.success() {
            None
        } else {
            Some(failure_message(&stderr_output, status.code()))
        };

        return Ok(MemoryMeasurement {
            peak_rss_mb: peak_rss_kb / 1024,
            duration_ms: duration.as_millis(),
            succeeded: status.success(),
            error_message,
        });
    }

    // Fallback for non-Linux platforms
    #[cfg(not(target_os = "linux"))]
    {
        // `output()` reads the piped stderr to completion, so no deadlock here.
        let output = cmd.output()?;
        let duration = start.elapsed();

        let error_message = if output.status.success() {
            None
        } else {
            Some(failure_message(
                &String::from_utf8_lossy(&output.stderr),
                output.status.code(),
            ))
        };

        Ok(MemoryMeasurement {
            peak_rss_mb: 0, // Cannot measure on this platform
            duration_ms: duration.as_millis(),
            succeeded: output.status.success(),
            error_message,
        })
    }
}

/// Read the current resident set size of process `pid`, in kB.
///
/// The value is taken from the `VmRSS:` line of `/proc/<pid>/status`
/// (format: `VmRSS:    12345 kB`).
///
/// # Errors
/// Fails when the status file cannot be read, when the `VmRSS` value is not a
/// valid unsigned integer, or when no usable `VmRSS` line is present.
#[cfg(target_os = "linux")]
fn sample_rss(pid: u32) -> Result<usize, Box<dyn std::error::Error>> {
    let contents = fs::read_to_string(format!("/proc/{}/status", pid))?;

    // Second whitespace-separated token of the first VmRSS line that has one.
    let rss_field = contents
        .lines()
        .filter(|line| line.starts_with("VmRSS:"))
        .find_map(|line| line.split_whitespace().nth(1));

    match rss_field {
        Some(value) => Ok(value.parse::<usize>()?),
        None => Err("VmRSS not found in /proc status".into()),
    }
}

/// Generate page classification test fixtures
///
/// Creates 4 fixture types for testing page classification:
/// - vector_pure: Pure text PDF (born-digital)
/// - scanned_single: Image-only PDF (scanned page)
/// - brokenvector_pdfa: Invisible text layer over scanned image
/// - hybrid_header_body: Text header + scanned body
fn generate_page_class_fixtures() -> Result<(), Box<dyn std::error::Error>> {

    println!("==========================================");
    println!("Generating Page Classification Fixtures");
    println!("==========================================");

    let workspace_root = find_workspace_root();
    let fixtures_dir = workspace_root.join("tests/fixtures/page_class");
    fs::create_dir_all(&fixtures_dir)?;

    // 1. Vector pure: Born-digital text PDF
    println!("\n1. Generating vector_pure fixture...");
    let vector_dir = fixtures_dir.join("vector_pure");
    fs::create_dir_all(&vector_dir)?;
    generate_vector_pure_pdf(&vector_dir)?;

    // 2. Scanned single: Image-only PDF
    println!("2. Generating scanned_single fixture...");
    let scanned_dir = fixtures_dir.join("scanned_single");
    fs::create_dir_all(&scanned_dir)?;
    generate_scanned_single_pdf(&scanned_dir)?;

    // 3. BrokenVector: Invisible text + image
    println!("3. Generating brokenvector_pdfa fixture...");
    let broken_dir = fixtures_dir.join("brokenvector_pdfa");
    fs::create_dir_all(&broken_dir)?;
    generate_brokenvector_pdf(&broken_dir)?;

    // 4. Hybrid: Text header + scanned body
    println!("4. Generating hybrid_header_body fixture...");
    let hybrid_dir = fixtures_dir.join("hybrid_header_body");
    fs::create_dir_all(&hybrid_dir)?;
    generate_hybrid_pdf(&hybrid_dir)?;

    println!("\n==========================================");
    println!("Page Classification Fixtures Generated");
    println!("==========================================");

    // Print sizes
    for fixture_name in &[
        "vector_pure",
        "scanned_single",
        "brokenvector_pdfa",
        "hybrid_header_body",
    ] {
        let fixture_dir = fixtures_dir.join(fixture_name);
        let pdf_path = fixture_dir.join("source.pdf");
        if let Ok(metadata) = fs::metadata(&pdf_path) {
            let size_kb = metadata.len() as f64 / 1024.0;
            println!("  - {}/source.pdf: {:.2} KB", fixture_name, size_kb);
        }
    }

    Ok(())
}

/// Write the `Vector` fixture: a single-page, born-digital PDF whose only
/// content is visible Helvetica text.
///
/// Produces `source.pdf` and `expected.json` inside `dir`.
///
/// # Errors
/// Fails when the PDF or the JSON file cannot be written.
fn generate_vector_pure_pdf(dir: &Path) -> Result<(), Box<dyn std::error::Error>> {
    use lopdf::{Dictionary, Document, Object, Stream};

    let mut doc = Document::with_version("1.5");

    // Font object (standard Type1 Helvetica, not embedded).
    let font_id = doc.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });

    // Page resources: just the font, exposed as /F1.
    let resources = dictionary! {
        "Font" => dictionary! { "F1" => font_id },
    };

    // Content stream: Multiple lines of text with high character count
    let content_text = r#"
        BT /F1 12 Tf 50 750 Td
        (This is a born-digital PDF with pure vector text.) Tj
        0 -15 Td (It contains multiple text operators and high character validity.) Tj
        0 -15 Td (The classification should detect this as a Vector page.) Tj
        0 -15 Td (Lorem ipsum dolor sit amet, consectetur adipiscing elit.) Tj
        0 -15 Td (Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.) Tj
        0 -15 Td (Ut enim ad minim veniam, quis nostrud exercitation ullamco.) Tj
        0 -15 Td (Duis aute irure dolor in reprehenderit in voluptate velit esse.) Tj
        0 -15 Td (Excepteur sint occaecat cupidatat non proident sunt in culpa.) Tj
        ET
    "#;

    let stream_bytes = content_text.as_bytes().to_vec();
    let stream_dict = dictionary! {
        "Length" => stream_bytes.len() as i32,
    };
    let content_id = doc.add_object(Stream::new(stream_dict, stream_bytes));

    // Page object (US Letter). /Parent is filled in once the page tree exists.
    let mut page = Dictionary::new();
    page.set("Type", "Page");
    page.set(
        "MediaBox",
        vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()],
    );
    page.set("Contents", content_id);
    page.set("Resources", resources);
    page.set(
        "CropBox",
        vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()],
    );
    let page_id = doc.add_object(page);

    // Page tree with the single page as its only kid.
    let pages_id = doc.add_object(dictionary! {
        "Type" => "Pages",
        "Count" => 1,
        "Kids" => vec![page_id.into()],
    });

    // Back-reference from the page to its parent node.
    let stored_page = doc.get_object(page_id)?;
    let mut page_with_parent = stored_page.as_dict()?.clone();
    page_with_parent.set("Parent", pages_id);
    doc.objects
        .insert(page_id, Object::Dictionary(page_with_parent));

    // Catalog, registered as the document root.
    let catalog_id = doc.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_id,
    });
    doc.trailer.set("Root", catalog_id);

    let pdf_path = dir.join("source.pdf");
    doc.save(&pdf_path)?;

    // Expected classification, stored next to the PDF.
    let expected = PageClassExpected {
        class: "Vector".to_string(),
        confidence_min: 0.90,
        hybrid_cells: None,
    };
    fs::write(
        dir.join("expected.json"),
        serde_json::to_string_pretty(&expected)?,
    )?;

    let fixture_label = dir.file_name().unwrap().to_string_lossy();
    let size_kb = fs::metadata(&pdf_path)?.len() as f64 / 1024.0;
    println!("  Created: {}/source.pdf ({:.2} KB)", fixture_label, size_kb);

    Ok(())
}

/// Generate an image-only scanned PDF
///
/// Writes the `Scanned` fixture into `dir`: a single US Letter page whose
/// content stream does nothing but paint one image XObject stretched over the
/// whole page. Produces `source.pdf` and `expected.json`.
///
/// # Errors
/// Fails when the PDF or the JSON file cannot be written.
fn generate_scanned_single_pdf(dir: &Path) -> Result<(), Box<dyn std::error::Error>> {
    use lopdf::{Dictionary, Document, Object, Stream};

    let mut doc = Document::with_version("1.5");

    // Minimal image: one 8-bit DeviceRGB pixel = exactly 3 bytes.
    // All components are 0, i.e. the pixel is black.
    let image_data = vec![0u8; 3];
    let image_stream = Stream::new(
        dictionary! {
            "Type" => "XObject",
            "Subtype" => "Image",
            "Width" => 1,
            "Height" => 1,
            "BitsPerComponent" => 8,
            "ColorSpace" => "DeviceRGB",
            "Length" => image_data.len() as i32,
        },
        image_data,
    );
    let image_id = doc.add_object(image_stream);

    // Resources with image
    let mut resources = Dictionary::new();
    let mut xobject = Dictionary::new();
    xobject.set("Im1", image_id);
    resources.set("XObject", xobject);

    // Content stream: paint the image over the full page.
    // An image XObject is drawn into the unit square, so the CTM must be
    // scaled with `cm` (a b c d e f). `scale` is PostScript, not a PDF
    // content-stream operator, and would leave the image at 1x1 pt.
    let content_text = r#"
        q 612 0 0 792 0 0 cm
        /Im1 Do
        Q
    "#;

    let content_bytes = content_text.as_bytes();
    let mut content_dict = Dictionary::new();
    content_dict.set("Length", content_bytes.len() as i32);
    let content_stream = Stream::new(content_dict, content_bytes.to_vec());
    let content_id = doc.add_object(content_stream);

    // Page dictionary
    let page_dict = dictionary! {
        "Type" => "Page",
        "MediaBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()],
        "Contents" => content_id,
        "Resources" => resources,
    };
    let page_id = doc.add_object(page_dict);

    // Pages tree
    let pages_id = doc.add_object(dictionary! {
        "Type" => "Pages",
        "Count" => 1,
        "Kids" => vec![page_id.into()],
    });

    // Update page with parent reference (the tree id is only known now)
    let mut page_obj = doc.get_object(page_id)?.as_dict().cloned()?;
    page_obj.set("Parent", pages_id);
    doc.objects.insert(page_id, Object::Dictionary(page_obj));

    // Catalog
    let catalog_id = doc.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_id,
    });
    doc.trailer.set("Root", catalog_id);

    // Save PDF
    let pdf_path = dir.join("source.pdf");
    doc.save(&pdf_path)?;

    // Generate expected.json
    let expected = PageClassExpected {
        class: "Scanned".to_string(),
        confidence_min: 0.90,
        hybrid_cells: None,
    };
    let json_path = dir.join("expected.json");
    fs::write(&json_path, serde_json::to_string_pretty(&expected)?)?;

    println!(
        "  Created: {}/source.pdf ({:.2} KB)",
        dir.file_name().unwrap().to_string_lossy(),
        fs::metadata(&pdf_path)?.len() as f64 / 1024.0
    );

    Ok(())
}

/// Generate a BrokenVector PDF (invisible text + image)
///
/// Writes the `BrokenVector` fixture into `dir`: a single US Letter page with
/// text drawn in rendering mode 3 (invisible) followed by one image XObject
/// stretched over the whole page. Produces `source.pdf` and `expected.json`.
///
/// # Errors
/// Fails when the PDF or the JSON file cannot be written.
fn generate_brokenvector_pdf(dir: &Path) -> Result<(), Box<dyn std::error::Error>> {
    use lopdf::{Dictionary, Document, Object, Stream};

    let mut doc = Document::with_version("1.5");

    // Create font
    let mut font_dict = Dictionary::new();
    font_dict.set("Type", "Font");
    font_dict.set("Subtype", "Type1");
    font_dict.set("BaseFont", "Helvetica");
    let font_id = doc.add_object(font_dict);

    // Create a 1x1 white pixel image: one 8-bit DeviceRGB pixel = exactly 3 bytes
    let image_data = vec![255u8; 3];
    let image_stream = Stream::new(
        dictionary! {
            "Type" => "XObject",
            "Subtype" => "Image",
            "Width" => 1,
            "Height" => 1,
            "BitsPerComponent" => 8,
            "ColorSpace" => "DeviceRGB",
            "Length" => image_data.len() as i32,
        },
        image_data,
    );
    let image_id = doc.add_object(image_stream);

    // Resources
    let mut resources = Dictionary::new();
    let mut font_resources = Dictionary::new();
    font_resources.set("F1", font_id);
    resources.set("Font", font_resources);
    let mut xobject = Dictionary::new();
    xobject.set("Im1", image_id);
    resources.set("XObject", xobject);

    // Content stream: Invisible text (Tr=3) + full-page image
    // The text is there but invisible, simulating a bad OCR overlay.
    // The image is drawn into the unit square, so the CTM is scaled to the
    // page with `cm`. `scale` is PostScript, not a PDF content-stream
    // operator, and would leave the image at 1x1 pt.
    let content_text = r#"
        BT /F1 12 Tf 50 750 Td 3 Tr
        (This text is invisible Tr=3 overlay over scanned image.) Tj
        0 -15 Td (It represents a broken vector PDF with bad OCR layer.) Tj
        0 -15 Td (Classification should detect this as BrokenVector.) Tj
        ET
        q 612 0 0 792 0 0 cm
        /Im1 Do
        Q
    "#;

    let content_bytes = content_text.as_bytes();
    let mut content_dict = Dictionary::new();
    content_dict.set("Length", content_bytes.len() as i32);
    let content_stream = Stream::new(content_dict, content_bytes.to_vec());
    let content_id = doc.add_object(content_stream);

    // Page dictionary
    let page_dict = dictionary! {
        "Type" => "Page",
        "MediaBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()],
        "Contents" => content_id,
        "Resources" => resources,
    };
    let page_id = doc.add_object(page_dict);

    // Pages tree
    let pages_id = doc.add_object(dictionary! {
        "Type" => "Pages",
        "Count" => 1,
        "Kids" => vec![page_id.into()],
    });

    // Update page with parent reference (the tree id is only known now)
    let mut page_obj = doc.get_object(page_id)?.as_dict().cloned()?;
    page_obj.set("Parent", pages_id);
    doc.objects.insert(page_id, Object::Dictionary(page_obj));

    // Catalog
    let catalog_id = doc.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_id,
    });
    doc.trailer.set("Root", catalog_id);

    // Save PDF
    let pdf_path = dir.join("source.pdf");
    doc.save(&pdf_path)?;

    // Generate expected.json
    let expected = PageClassExpected {
        class: "BrokenVector".to_string(),
        confidence_min: 0.90,
        hybrid_cells: None,
    };
    let json_path = dir.join("expected.json");
    fs::write(&json_path, serde_json::to_string_pretty(&expected)?)?;

    println!(
        "  Created: {}/source.pdf ({:.2} KB)",
        dir.file_name().unwrap().to_string_lossy(),
        fs::metadata(&pdf_path)?.len() as f64 / 1024.0
    );

    Ok(())
}

/// Generate a Hybrid PDF (text header + scanned body)
///
/// Writes the `Hybrid` fixture into `dir`: a single US Letter page with three
/// lines of visible text near the top and one image XObject, scaled to the
/// page and clipped to the lower part of it. Produces `source.pdf` and
/// `expected.json`.
///
/// # Errors
/// Fails when the PDF or the JSON file cannot be written.
fn generate_hybrid_pdf(dir: &Path) -> Result<(), Box<dyn std::error::Error>> {
    use lopdf::{Dictionary, Document, Object, Stream};

    let mut doc = Document::with_version("1.5");

    // Create font
    let mut font_dict = Dictionary::new();
    font_dict.set("Type", "Font");
    font_dict.set("Subtype", "Type1");
    font_dict.set("BaseFont", "Helvetica");
    let font_id = doc.add_object(font_dict);

    // Create a 1x1 white pixel image for the body:
    // one 8-bit DeviceRGB pixel = exactly 3 bytes
    let image_data = vec![255u8; 3];
    let image_stream = Stream::new(
        dictionary! {
            "Type" => "XObject",
            "Subtype" => "Image",
            "Width" => 1,
            "Height" => 1,
            "BitsPerComponent" => 8,
            "ColorSpace" => "DeviceRGB",
            "Length" => image_data.len() as i32,
        },
        image_data,
    );
    let image_id = doc.add_object(image_stream);

    // Resources
    let mut resources = Dictionary::new();
    let mut font_resources = Dictionary::new();
    font_resources.set("F1", font_id);
    resources.set("Font", font_resources);
    let mut xobject = Dictionary::new();
    xobject.set("Im1", image_id);
    resources.set("XObject", xobject);

    // Content stream: text header at the top + image body below it.
    // Header: visible text in the top portion.
    // Body: the image is scaled to the page with `cm` and clipped to the
    // rectangle 0 0 612 560, i.e. the bottom 560 pt of the 792 pt page.
    // `scale` is PostScript, not a PDF content-stream operator, and would
    // leave the image at 1x1 pt.
    let content_text = r#"
        BT /F1 14 Tf 50 750 Td
        (This is a HYBRID document with vector text header) Tj
        0 -20 Td (The header contains selectable text) Tj
        0 -20 Td (Below this header is a scanned image body) Tj
        ET
        q
        0 0 612 560 re  W n
        612 0 0 792 0 0 cm
        /Im1 Do
        Q
    "#;

    let content_bytes = content_text.as_bytes();
    let mut content_dict = Dictionary::new();
    content_dict.set("Length", content_bytes.len() as i32);
    let content_stream = Stream::new(content_dict, content_bytes.to_vec());
    let content_id = doc.add_object(content_stream);

    // Page dictionary
    let page_dict = dictionary! {
        "Type" => "Page",
        "MediaBox" => vec![0.0.into(), 0.0.into(), 612.0.into(), 792.0.into()],
        "Contents" => content_id,
        "Resources" => resources,
    };
    let page_id = doc.add_object(page_dict);

    // Pages tree
    let pages_id = doc.add_object(dictionary! {
        "Type" => "Pages",
        "Count" => 1,
        "Kids" => vec![page_id.into()],
    });

    // Update page with parent reference (the tree id is only known now)
    let mut page_obj = doc.get_object(page_id)?.as_dict().cloned()?;
    page_obj.set("Parent", pages_id);
    doc.objects.insert(page_id, Object::Dictionary(page_obj));

    // Catalog
    let catalog_id = doc.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_id,
    });
    doc.trailer.set("Root", catalog_id);

    // Save PDF
    let pdf_path = dir.join("source.pdf");
    doc.save(&pdf_path)?;

    // Generate expected.json
    // For hybrid, we expect specific hybrid_cells (bottom rows of the 8x8 grid):
    // rows 2-7, i.e. 6 rows = 48 cells, the bottom 75% of the page.
    // NOTE(review): the clip rectangle above is 560 pt tall (~71% of the page),
    // so row 2 is only partially covered by the image - confirm this matches
    // how the classifier decides a cell is image-covered.
    let hybrid_cells: Vec<usize> = (16..64).collect(); // rows 2-7

    let expected = PageClassExpected {
        class: "Hybrid".to_string(),
        confidence_min: 0.15,
        hybrid_cells: Some(hybrid_cells),
    };
    let json_path = dir.join("expected.json");
    fs::write(&json_path, serde_json::to_string_pretty(&expected)?)?;

    println!(
        "  Created: {}/source.pdf ({:.2} KB)",
        dir.file_name().unwrap().to_string_lossy(),
        fs::metadata(&pdf_path)?.len() as f64 / 1024.0
    );

    Ok(())
}

/// Generate glyph shape database from font files.
///
/// This function walks a directory of font files (TrueType/OpenType),
/// rasterizes every mapped glyph at 32x32 via fontdue, computes pHash
/// for each, and writes the result as build/glyph-shapes.json.
///
/// # Arguments
///
/// * `fonts_dir` - Path to directory containing .ttf/.otf font files
/// * `output_path` - Path where glyph-shapes.json will be written
///
/// # Output format
///
/// JSON array of entries:
/// ```json
/// {
///   "phash_hex": "0123456789abcdef",
///   "char": "A",
///   "source_font": "LiberationSans-Regular.ttf",
///   "frequency_rank": 1
/// }
/// ```
fn gen_shape_db(fonts_dir: &str, output_path: &str) -> Result<(), Box<dyn std::error::Error>> {
    println!("==========================================");
    println!("Generating Glyph Shape Database");
    println!("==========================================");

    let workspace_root = find_workspace_root();
    let fonts_path = workspace_root.join(fonts_dir);
    let output_file = workspace_root.join(output_path);

    if !fonts_path.exists() {
        return Err(format!("Fonts directory not found: {}", fonts_path.display()).into());
    }

    // Create output directory
    if let Some(parent) = output_file.parent() {
        fs::create_dir_all(parent)?;
    }

    // Load character frequency data
    let frequency_data = load_frequency_data(&workspace_root)?;

    // Find all font files
    let font_files = find_font_files(&fonts_path)?;
    println!("\nFound {} font files:", font_files.len());
    for font_file in &font_files {
        println!("  - {}", font_file.file_name().unwrap().to_string_lossy());
    }

    // Process each font and collect glyphs
    let mut all_glyphs: Vec<GlyphEntry> = Vec::new();
    let mut seen_hashes: HashMap<(u64, char), String> = HashMap::new();
    let mut collisions: Vec<(String, String, u64)> = Vec::new();

    for font_file in &font_files {
        println!(
            "\nProcessing: {}",
            font_file.file_name().unwrap().to_string_lossy()
        );

        // Load the font
        let font_bytes = fs::read(font_file)?;
        let font = Font::from_bytes(font_bytes.as_slice(), fontdue::FontSettings::default())
            .map_err(|e| format!("Failed to load font: {}", e))?;

        let font_name = font_file.file_name().unwrap().to_string_lossy().to_string();
        let mut glyph_count = 0;

        // Rasterize glyphs for all Unicode codepoints
        // We'll iterate over common Unicode ranges
        for codepoint in 0..0x10000 {
            let ch = match std::char::from_u32(codepoint) {
                Some(c) if !c.is_control() && c != '\u{FFFD}' => c,
                _ => continue,
            };

            // Skip characters that are unlikely to be in fonts
            if should_skip_char(ch) {
                continue;
            }

            // Check if the font has this glyph
            if !has_glyph(&font, ch) {
                continue;
            }

            // Rasterize at 32px (scales to 32x32 bitmap)
            let (metrics, bitmap) = font.rasterize(ch, 32.0);

            // Skip empty glyphs (zero width/height)
            if bitmap.is_empty() || metrics.width == 0 || metrics.height == 0 {
                continue;
            }

            // Convert to centered 32x32 bitmap
            let centered = center_bitmap_32x32(&bitmap, metrics.width, metrics.height);

            // Compute pHash using pdftract-core's phash_glyph
            let phash = compute_phash(&centered);

            // Get frequency rank
            let freq_rank = frequency_data.get(&ch).copied().unwrap_or(0);

            // Check for collisions
            let key = (phash, ch);
            if let Some(_other_font) = seen_hashes.get(&key) {
                // Same (phash, char) pair from different font - keep first
                continue;
            }

            // Check for cross-character collisions (same hash, different char)
            let mut collision_replacement = None;
            let mut skip_new = false;

            // Collect collision info first (without modifying seen_hashes)
            for (&(existing_hash, existing_ch), other_font_name) in seen_hashes.iter() {
                if existing_hash == phash && existing_ch != ch {
                    // Different chars with same hash - keep higher frequency
                    let freq_existing = frequency_data.get(&existing_ch).copied().unwrap_or(0);
                    let freq_new = freq_rank;

                    if freq_new > freq_existing {
                        // New char has higher frequency, replace old
                        collision_replacement =
                            Some((existing_hash, existing_ch, other_font_name.clone()));
                    } else {
                        // Keep old, skip new
                        skip_new = true;
                        collisions.push((font_name.clone(), other_font_name.clone(), phash));
                    }
                }
            }

            // Handle collision replacement if needed
            if let Some((existing_hash, existing_ch, _)) = collision_replacement {
                all_glyphs.retain(|g| !(g.phash == existing_hash && g.ch == existing_ch));
                seen_hashes.remove(&(existing_hash, existing_ch));
            }

            if skip_new {
                continue;
            }

            seen_hashes.insert(key, font_name.clone());
            all_glyphs.push(GlyphEntry {
                phash_hex: format!("{:016x}", phash),
                phash,
                ch,
                source_font: font_name.clone(),
                frequency_rank: freq_rank,
            });

            glyph_count += 1;
        }

        println!("  Rasterized {} glyphs", glyph_count);
    }

    // Sort by pHash ascending
    all_glyphs.sort_by(|a, b| a.phash_hex.cmp(&b.phash_hex));

    // Write output
    let json_output = serde_json::to_string_pretty(&all_glyphs)?;
    fs::write(&output_file, json_output)?;

    println!("\n==========================================");
    println!("Shape Database Generation Complete");
    println!("==========================================");
    println!("\nOutput: {}", output_file.display());
    println!("Total glyphs: {}", all_glyphs.len());
    if !collisions.is_empty() {
        println!("Hash collisions: {}", collisions.len());
        for (font1, font2, hash) in collisions.iter().take(10) {
            println!("  - {} vs {} (hash: {:016x})", font1, font2, hash);
        }
    }

    Ok(())
}

/// Entry in the glyph shape database.
///
/// One entry describes a single rasterized glyph. The serialized form
/// contains `phash_hex`, `char`, `source_font` and `frequency_rank`; the
/// numeric `phash` is excluded via `#[serde(skip)]`.
#[derive(Debug, Serialize, Deserialize)]
struct GlyphEntry {
    /// Perceptual hash as hexadecimal string
    /// (`gen_shape_db` writes it as 16 zero-padded lowercase hex digits)
    phash_hex: String,
    /// Perceptual hash as u64 for comparison; never written to or read from JSON
    #[serde(skip)]
    phash: u64,
    /// Unicode character, stored under the JSON key `char`
    #[serde(rename = "char")]
    ch: char,
    /// Source font filename
    source_font: String,
    /// Unicode frequency rank (higher = more common);
    /// `gen_shape_db` uses 0 for characters without frequency data
    frequency_rank: u32,
}

/// Report whether `font` maps `ch` to an actual glyph.
///
/// A glyph index of 0 is treated as "character not covered by this font".
fn has_glyph(font: &Font, ch: char) -> bool {
    font.lookup_glyph_index(ch) != 0
}

/// Skip characters that are unlikely to be in fonts or are control characters.
fn should_skip_char(ch: char) -> bool {
    // Skip control characters, private use, surrogates
    if ch.is_control() {
        return true;
    }

    let cp = ch as u32;

    // Private Use Areas
    if (0xE000..=0xF8FF).contains(&cp)
        || (0xF0000..=0xFFFFD).contains(&cp)
        || (0x100000..=0x10FFFD).contains(&cp)
    {
        return true;
    }

    // Surrogates
    if (0xD800..=0xDFFF).contains(&cp) {
        return true;
    }

    // Very high Unicode planes are unlikely to be in fonts
    if cp > 0x2FFFF {
        return true;
    }

    false
}

/// Center a glyph bitmap into a 32x32 canvas.
///
/// `bitmap` is read as `height` rows of `width` bytes. A glyph smaller than
/// the canvas is centered both horizontally and vertically with zero padding.
/// A glyph larger than the canvas in either dimension is center-cropped in
/// that dimension, so the middle 32 columns/rows are kept.
///
/// If `bitmap` is shorter than `width * height`, the missing pixels are left
/// at zero. An empty bitmap or a zero dimension yields an all-zero canvas.
fn center_bitmap_32x32(bitmap: &[u8], width: usize, height: usize) -> [u8; 1024] {
    const SIDE: usize = 32;

    let mut centered = [0u8; SIDE * SIDE];

    if width == 0 || height == 0 || bitmap.is_empty() {
        return centered;
    }

    // Size of the region actually copied, clamped to the canvas.
    let copy_w = width.min(SIDE);
    let copy_h = height.min(SIDE);

    // Both subtractions are safe because copy_* never exceeds either operand.
    // (The naive `(32 - width) / 2` underflows for glyphs wider than 32 px.)
    let dst_x0 = (SIDE - copy_w) / 2;
    let dst_y0 = (SIDE - copy_h) / 2;
    let src_x0 = (width - copy_w) / 2;
    let src_y0 = (height - copy_h) / 2;

    for row in 0..copy_h {
        let src_start = (src_y0 + row)
            .saturating_mul(width)
            .saturating_add(src_x0);

        // Tolerate a bitmap shorter than width * height: copy what exists.
        let available = bitmap.len().saturating_sub(src_start).min(copy_w);
        if available == 0 {
            // Every later row starts even further into the buffer.
            break;
        }

        let dst_start = (dst_y0 + row) * SIDE + dst_x0;
        centered[dst_start..dst_start + available]
            .copy_from_slice(&bitmap[src_start..src_start + available]);
    }

    centered
}

/// Compute pHash for a 32x32 grayscale bitmap.
///
/// Currently this delegates to the local `simple_phash` implementation; it
/// does NOT call pdftract-core's `phash_glyph`. The indirection exists so the
/// hashing backend can be swapped in one place later.
///
/// `bitmap` holds the 32x32 pixels as 1024 bytes; the return value is the
/// 64-bit hash.
fn compute_phash(bitmap: &[u8; 1024]) -> u64 {
    // For now, we'll compute a simple hash
    // In the future, we'd use pdftract-core::font::shape::phash_glyph
    // but that's not accessible from xtask due to dependency direction

    // Simple DCT-based pHash implementation
    // TODO: Integrate with pdftract-core's phash_glyph once accessible
    simple_phash(bitmap)
}

/// DCT-based perceptual hash of a 32x32 grayscale bitmap.
///
/// Stand-in used by xtask until pdftract-core's phash can be integrated.
///
/// Steps: map pixels to `[-1, 1]`, run a 2D DCT, take the magnitudes of the
/// 8x8 low-frequency corner (with the DC slot replaced by the coefficient at
/// index 8 of the first row), and set bit `i` of the result when magnitude `i`
/// is strictly above the median of the 64 magnitudes.
fn simple_phash(bitmap: &[u8; 1024]) -> u64 {
    // Pixel values 0..=255 mapped onto -1.0..=1.0.
    let mut samples = [0.0f32; 1024];
    for (sample, &pixel) in samples.iter_mut().zip(bitmap.iter()) {
        *sample = (pixel as f32) / 127.5 - 1.0;
    }

    let mut spectrum = [0.0f32; 1024];
    simple_dct_2d(&samples, &mut spectrum);

    // Row-major 8x8 block of coefficient magnitudes.
    let mut magnitudes = [0.0f32; 64];
    for (slot, magnitude) in magnitudes.iter_mut().enumerate() {
        let (row, col) = (slot / 8, slot % 8);
        // Skip DC, use [0,8] in its place.
        let source = if slot == 0 { 8 } else { row * 32 + col };
        *magnitude = spectrum[source].abs();
    }

    // Median of an even-sized set: mean of the two middle values.
    let mut ranked = magnitudes;
    ranked.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let median = (ranked[31] + ranked[32]) / 2.0;

    magnitudes
        .iter()
        .enumerate()
        .filter(|&(_, &magnitude)| magnitude > median)
        .fold(0u64, |hash, (bit, _)| hash | (1u64 << bit))
}

/// Orthonormal 2D DCT-II of a 32x32 block.
///
/// `input` and `output` are both row-major 32x32 grids. The transform is
/// applied along each row first and then along each column; `output[k * 32 + x]`
/// holds the coefficient with vertical frequency `k` and horizontal frequency `x`.
fn simple_dct_2d(input: &[f32; 1024], output: &mut [f32; 1024]) {
    const N: usize = 32;

    // cos_table[k][n] = cos(pi * k * (2n + 1) / 64)
    let mut cos_table = [[0.0f32; N]; N];
    for k in 0..N {
        for n in 0..N {
            cos_table[k][n] =
                (std::f32::consts::PI * k as f32 * (2 * n + 1) as f32 / 64.0).cos();
        }
    }

    // Normalization factor that makes the transform orthonormal.
    let norm = |k: usize| -> f32 {
        if k == 0 {
            (1.0_f32 / 32.0_f32).sqrt()
        } else {
            (2.0_f32 / 32.0_f32).sqrt()
        }
    };

    // Pass 1: transform every row.
    let mut row_pass = [0.0f32; N * N];
    for (y, row) in input.chunks(N).enumerate() {
        for (k, basis_row) in cos_table.iter().enumerate() {
            let acc = row
                .iter()
                .zip(basis_row.iter())
                .fold(0.0f32, |acc, (&sample, &c)| acc + sample * c);
            row_pass[y * N + k] = acc * norm(k);
        }
    }

    // Pass 2: transform every column of the row-transformed data.
    for x in 0..N {
        for (k, basis_row) in cos_table.iter().enumerate() {
            let acc = basis_row
                .iter()
                .enumerate()
                .fold(0.0f32, |acc, (n, &c)| acc + row_pass[n * N + x] * c);
            output[k * N + x] = acc * norm(k);
        }
    }
}

/// Load character frequency data.
///
/// Reads `<workspace_root>/build/frequency.json`. The file is expected to hold a single JSON
/// object whose keys are characters and whose values are non-negative integer ranks, e.g.
/// `{"A": 1, "B": 2}`. Any other top-level JSON value yields an empty map.
///
/// Returns a map from character to frequency rank (higher = more common).
///
/// Entries are handled as follows:
/// - values that are not unsigned integers are skipped;
/// - only the first character of each key is used, and empty keys are skipped;
/// - ranks larger than `u32::MAX` are clamped to `u32::MAX` rather than wrapped, so an
///   oversized rank still sorts as "most common".
///
/// If the file does not exist, a warning is printed and an empty map is returned.
///
/// # Errors
///
/// Returns an error if the file exists but cannot be read, or if it is not valid JSON.
fn load_frequency_data(
    workspace_root: &Path,
) -> Result<HashMap<char, u32>, Box<dyn std::error::Error>> {
    let frequency_path = workspace_root.join("build").join("frequency.json");

    // A missing file is not an error: callers fall back to an empty map.
    if !frequency_path.exists() {
        println!(
            "Warning: frequency.json not found at {}",
            frequency_path.display()
        );
        println!("Using zero frequency rank for all characters.");
        return Ok(HashMap::new());
    }

    let content = fs::read_to_string(&frequency_path)?;
    let data: serde_json::Value = serde_json::from_str(&content)?;

    let mut frequency = HashMap::new();

    // Expected format: {"A": 1, "B": 2, ...}
    if let Some(obj) = data.as_object() {
        for (key, value) in obj {
            let (Some(rank), Some(ch)) = (value.as_u64(), key.chars().next()) else {
                continue;
            };
            // Clamp before narrowing: a bare `as u32` would wrap large ranks to small ones.
            let rank = rank.min(u64::from(u32::MAX)) as u32;
            frequency.insert(ch, rank);
        }
    }

    println!("Loaded frequency data for {} characters", frequency.len());
    Ok(frequency)
}

/// Find all font files in a directory.
///
/// Walks `dir` and every directory beneath it, collecting each non-directory entry whose
/// extension is `ttf` or `otf`. The extension is compared ASCII case-insensitively, so
/// `FONT.TTF` and `Face.Otf` are found as well. Directories are detected with
/// [`Path::is_dir`], which follows symbolic links.
///
/// Returns the matching paths sorted in ascending order.
///
/// # Errors
///
/// Returns an error if `dir` or any directory beneath it cannot be read, or if any directory
/// entry cannot be retrieved.
fn find_font_files(dir: &Path) -> Result<Vec<PathBuf>, Box<dyn std::error::Error>> {
    let mut font_files = Vec::new();

    // Explicit work list instead of recursion, so the result is sorted once at the end
    // rather than once per directory level.
    let mut pending = vec![dir.to_path_buf()];

    while let Some(current) = pending.pop() {
        for entry in fs::read_dir(&current)? {
            let path = entry?.path();

            if path.is_dir() {
                pending.push(path);
                continue;
            }

            let is_font = path
                .extension()
                .and_then(|ext| ext.to_str())
                .map_or(false, |ext| {
                    ext.eq_ignore_ascii_case("ttf") || ext.eq_ignore_ascii_case("otf")
                });
            if is_font {
                font_files.push(path);
            }
        }
    }

    font_files.sort();
    Ok(font_files)
}

/// Expected page classification for a fixture.
///
/// Derives `Serialize` so an expectation can be written out through serde. No rename
/// attributes are applied, so the fields are emitted under their Rust names in declaration
/// order.
#[derive(Debug, Serialize)]
struct PageClassExpected {
    /// Expected class name (Vector, Scanned, Hybrid, BrokenVector).
    /// Stored as a free-form string; nothing in this struct validates the value.
    class: String,
    /// Minimum confidence threshold (actual confidence may vary slightly)
    confidence_min: f32,
    /// For Hybrid pages: expected scanned cell indexes.
    /// NOTE(review): presumably `None` for non-Hybrid pages — confirm against the code that
    /// builds these values.
    hybrid_cells: Option<Vec<usize>>,
}