use anyhow::{Context, Result}; use clap::{Parser, Subcommand, ArgAction}; use std::collections::HashMap; use std::fs; use std::io::Write; use std::path::PathBuf; mod cache_cmd; mod classify; mod codegen; mod doctor; mod grep; mod hash; mod header; mod inspect; mod mcp; mod migrate; mod middleware; mod output; mod pages; mod panic_hook; mod password; mod profiles_cmd; mod serve; mod url; mod validate; mod verify_receipt; use codegen::Language; use output::OutputConfig; use pdftract_core::atomic_file_writer::AtomicFileWriter; use pdftract_core::cache; use pdftract_core::extract::{extract_pdf, result_to_json}; use pdftract_core::markdown::{block_to_markdown, page_to_markdown, page_to_markdown_with_links, MarkdownOptions}; use pdftract_core::options::{ExtractionOptions, ReceiptsMode}; // Re-export diagnostics for the --list-diagnostics and --explain-diagnostic commands pub use pdftract_core::diagnostics::{DiagCode, DiagInfo, DIAGNOSTIC_CATALOG}; #[derive(Parser)] #[command(name = "pdftract")] #[command(about = "pdftract CLI - PDF extraction and conformance testing", long_about = None)] struct Cli { #[command(subcommand)] command: Commands, } #[derive(Subcommand)] enum Commands { /// List all diagnostic codes with their metadata ListDiagnostics, /// Explain a specific diagnostic code in detail ExplainDiagnostic { /// Diagnostic code to explain (e.g., STRUCT_MISSING_KEY, STREAM_BOMB) code: String, }, /// Compare actual results against expected values with tolerances (for conformance testing) Compare { /// Path to the actual results JSON actual: PathBuf, /// Path to the expected results JSON expected: PathBuf, /// Path to the tolerances JSON (optional) #[arg(short, long)] tolerances: Option, /// Output format (text, json) #[arg(short, long, default_value = "text")] format: String, }, /// Run SDK conformance test suite Conformance { /// Path to the conformance suite JSON #[arg(short, long, default_value = "tests/sdk-conformance/cases.json")] suite: PathBuf, /// SDK name 
#[arg(short, long, default_value = "pdftract")] sdk: String, /// SDK version #[arg(short, long, default_value = "0.1.0")] version: String, /// Output report path #[arg(short, long, default_value = "conformance-report.json")] output: PathBuf, }, /// SDK code generation commands Sdk { #[command(subcommand)] sdk_command: SdkCommands, }, /// Extract text and structure from a PDF file Extract { /// Path to the PDF file (use '-' for stdin) input: PathBuf, /// Read password from stdin (one line, terminated by newline) #[arg(long, conflicts_with = "password")] password_stdin: bool, /// PDF password (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_PASSWORD=1) #[arg(long, conflicts_with = "password_stdin")] password: Option, /// Custom HTTP headers for remote sources (repeatable; format: HEADER:VALUE) #[arg(long, value_name = "HEADER:VALUE", action = ArgAction::Append)] header: Vec, /// Page range to extract (1-based, comma-separated: 1-5,7,12-) #[arg(long, value_name = "RANGE")] pages: Option, /// Output JSON to PATH (use '-' for stdout) #[arg(long, value_name = "PATH")] json: Vec, /// Output Markdown to PATH (use '-' for stdout) #[arg(long, value_name = "PATH")] md: Vec, /// Output plain text to PATH (use '-' for stdout) #[arg(long, value_name = "PATH")] text: Vec, /// Output NDJSON to stdout (mutually exclusive with other formats) #[arg(long, conflicts_with_all = ["json", "md", "text", "format"])] ndjson: bool, /// Output formats (comma-separated: json,markdown,text,ndjson) #[arg(long, value_delimiter = ',', value_name = "FORMATS")] format: Vec, /// Base path for auto-named outputs (used with --format) #[arg(short, long, value_name = "BASE")] output: Option, /// Receipt mode: off (default), lite, or svg #[arg(long, value_name = "MODE", default_value = "off", value_parser = ["off", "lite", "svg"])] receipts: String, /// Enable OCR for scanned pages (requires 'ocr' feature) #[arg(long)] ocr: bool, /// OCR language codes (comma-separated, e.g., 'eng,fra,deu') #[arg(long, 
value_delimiter = ',')] ocr_language: Vec, /// Enable cache at this directory (creates if absent) #[arg(long, value_name = "DIR")] cache_dir: Option, /// Set cache size limit (default 1 GiB; accepts KiB, MiB, GiB suffixes) #[arg(long, value_name = "SIZE", default_value = "1 GiB")] cache_size: String, /// Disable cache for this extraction (even if --cache-dir is set) #[arg(long)] no_cache: bool, /// Emit HTML comment anchors before each block in Markdown output #[arg(long)] md_anchors: bool, /// Auto-detect document type and apply appropriate profile #[arg(long)] auto: bool, /// Force-apply a specific profile (by name or YAML file path) #[arg(long, value_name = "NAME|PATH")] profile: Option, /// Include header blocks in output #[arg(long)] include_headers: bool, /// Include footer blocks in output #[arg(long)] include_footers: bool, /// Include both header and footer blocks in output #[arg(long)] include_headers_footers: bool, /// Include invisible text spans in output (rendering_mode == 3) #[arg(long)] include_invisible_text: bool, /// Include hidden-layer text spans in output (OCG-controlled) #[arg(long)] include_hidden_layers: bool, /// Include watermark blocks in output (no-op until Phase 7) #[arg(long)] include_watermarks: bool, }, /// Classify document type (runs metadata + signal extraction, not full text extraction) Classify { /// Path to the PDF file input: PathBuf, /// Read password from stdin (one line, terminated by newline) #[arg(long, conflicts_with = "password")] password_stdin: bool, /// PDF password (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_PASSWORD=1) #[arg(long, conflicts_with = "password_stdin")] password: Option, /// Directory containing custom profile YAML files #[arg(long, value_name = "DIR")] profiles: Option, /// Pretty-print JSON output #[arg(long)] pretty: bool, /// Number of top reasons to include (default: all) #[arg(long, default_value = "0")] top_k: usize, /// Exit with code 1 if document type is unknown #[arg(long)] 
exit_on_unknown: bool, }, /// Search for text patterns in PDF files with bounding-box results #[cfg(feature = "grep")] Grep(grep::GrepArgs), /// Inspect a PDF file in a local web browser with debugging overlays Inspect(inspect::InspectArgs), /// Verify a receipt against a PDF file VerifyReceipt(verify_receipt::VerifyReceiptCommand), /// Compute the PDF structural fingerprint (hash) Hash { /// Path to the PDF file or URL input: String, /// PDF password (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_PASSWORD=1) #[arg(long)] password: Option, /// Custom HTTP headers for remote sources (repeatable; format: HEADER:VALUE) #[arg(long, value_name = "HEADER:VALUE", action = ArgAction::Append)] header: Vec, }, /// Manage the extraction cache Cache { #[command(subcommand)] cache_command: CacheCommands, }, /// Manage document type profiles Profiles { #[command(subcommand)] profiles_command: ProfilesCommands, }, /// Start the HTTP server for extraction /// /// ## Security Model /// /// **pdftract serve has no built-in authentication.** Deploy behind a reverse proxy /// (nginx, Traefik, Caddy) for production use. The server accepts PDFs via multipart /// upload only; no endpoint accepts file paths from server filesystem. /// /// ## Concurrency /// /// The server uses a two-level concurrency architecture: /// /// - **tokio**: Per-request concurrency via the async executor. Each HTTP request /// is handled asynchronously on tokio's multi-threaded runtime. /// - **rayon**: Per-document parallelism within each extraction. PDF pages are /// processed in parallel using rayon's work-stealing thread pool. /// /// The bridge between async (tokio) and sync (rayon) is `tokio::task::spawn_blocking`. /// Each POST handler wraps the synchronous extraction call in `spawn_blocking`, which /// runs the work on tokio's blocking thread pool (separate from the async reactor). 
/// /// This design ensures: /// - The async reactor is never blocked by extraction work /// - Multiple PDFs can be extracted concurrently (one per request) /// - Within each PDF, pages are processed in parallel (rayon) /// - Thread pools are sized appropriately (tokio: 512 blocking threads; rayon: num_cpus) /// /// ## Endpoints /// /// - `POST /extract` - Extract PDF and return JSON with metadata /// - `POST /extract/text` - Extract PDF and return plain text /// - `POST /extract/stream` - Extract PDF and return streaming NDJSON /// - `GET /health` - Health check (responds within 100ms even during concurrent extractions) /// /// ## Cache /// /// Cache is optional. When enabled, extracted results are stored on disk and reused /// for identical PDFs. Cache status is reported via the `X-Pdftract-Cache` response header. Serve { /// Bind address (e.g., "127.0.0.1:8080", "[::1]:9000", "0.0.0.0:3000") #[arg(short, long, default_value = "127.0.0.1:8080")] bind: String, /// Enable cache at this directory #[arg(long, value_name = "DIR")] cache_dir: Option, /// Set cache size limit (default 1 GiB; accepts KiB, MiB, GiB suffixes) #[arg(long, value_name = "SIZE", default_value = "1 GiB")] cache_size: String, /// Disable cache #[arg(long)] no_cache: bool, /// Maximum request body size in MB (default: 256, max: 4096) #[arg(long, default_value = "256")] max_upload_mb: usize, /// Maximum decompression size in GB (default: 1, overrides per-request max_decompress_gb) #[arg(long, value_name = "GB", default_value = "1")] max_decompress_gb: usize, /// Write per-request audit log to FILE (NDJSON; use "-" for stdout, "/dev/stderr" for stderr) /// /// Rotation: pdftract does NOT rotate logs; configure logrotate on the audit-log file. /// When FILE is "-", rotation is the responsibility of the supervisor (e.g., journald). 
#[arg(long, value_name = "FILE")] audit_log: Option, /// Trust X-Forwarded-For header for client IP detection (DANGER: enables IP spoofing if not behind a trusted proxy) #[arg(long)] trust_forwarded_for: bool, /// Directory containing custom profile YAML files (repeatable) #[arg(long, value_name = "DIR")] profile_dir: Option, /// Enable hot-reload for profiles (re-read directory on every request) #[arg(long)] profile_hot_reload: bool, }, /// Start the MCP (Model Context Protocol) server /// /// Per ADR-006: stdio and HTTP transports are mutually exclusive because they have /// opposite stdout discipline (stdio: JSON-RPC sink; HTTP: log channel). Exactly one /// transport must be selected per invocation. Mcp { /// Use stdio transport (for Claude Desktop, Claude Code, Continue, Cursor) /// /// This is the default transport mode if neither --stdio nor --bind is specified. #[arg(long, conflicts_with = "bind")] stdio: bool, /// Bind address for the MCP server (e.g., "127.0.0.1:8080", "[::1]:9000", "0.0.0.0:3000") /// /// Enables HTTP+SSE transport mode. Mutually exclusive with --stdio. #[arg(short, long, value_name = "ADDR", conflicts_with = "stdio")] bind: Option, /// Path to a file containing the bearer token (RECOMMENDED) #[arg(long, conflicts_with = "auth_token")] auth_token_file: Option, /// Bearer token for authentication (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_TOKEN=1) #[arg(long, conflicts_with = "auth_token_file")] auth_token: Option, /// Maximum request body size in MB (default: 256) #[arg(long, default_value = "256")] max_upload_mb: usize, /// Root directory for local filesystem access (enforces path-traversal protection) /// /// When set, all local-path tool arguments are resolved relative to DIR and any /// path that escapes DIR is rejected with JSON-RPC error code -32602. /// HTTPS URLs are not affected by this flag. Without --root, the server runs in /// trust-the-caller mode (no path-check applied). 
#[arg(long, value_name = "DIR")] root: Option, /// Write per-request audit log to FILE (NDJSON; use "-" for stdout, "/dev/stderr" for stderr) /// /// Rotation: pdftract does NOT rotate logs; configure logrotate on the audit-log file. /// When FILE is "-", rotation is the responsibility of the supervisor (e.g., journald). #[arg(long, value_name = "FILE")] audit_log: Option, }, /// Validate a JSON file against the pdftract schema Validate { /// Path to the JSON file to validate (use '-' for stdin) file: String, /// Path to a custom schema file (default: bundled v1.0 schema) #[arg(short, long, value_name = "PATH")] schema: Option, /// Quiet mode - suppress error output (only exit code matters) #[arg(short, long)] quiet: bool, }, /// Migrate JSON output between schema versions MigrateSchema { /// Source schema version (e.g., "1.0", "1.1") #[arg(long)] from: String, /// Target schema version (e.g., "1.0", "1.1") #[arg(long)] to: String, /// Input JSON file (use '-' for stdin) #[arg(default_value = "-")] input: String, /// Output JSON file (use '-' for stdout) #[arg(short, long, default_value = "-")] output: String, /// Pretty-print output JSON #[arg(short, long)] pretty: bool, }, /// Check environment health and dependencies /// /// Exit code policy: exits 0 if no checks FAIL (WARN does not affect exit code); /// exits 1 if any check FAILs; exits 2 on argument parse errors. Doctor { /// Print compiled features and exit #[arg(long)] features: bool, /// Output results as JSON #[arg(long)] json: bool, /// Disable colored output #[arg(long)] no_color: bool, /// Explicit form of the default policy (exit 1 if any check FAILs). /// /// This flag is the default behavior and is provided for CI script /// readability. WARN does not affect exit code regardless of this flag. 
#[arg(long)] exit_on_fail: bool, /// Verify the profile search path includes DIR #[arg(long, value_name = "DIR")] profile_dir: Option, /// Verify DIR is writable and has sufficient space #[arg(long, value_name = "DIR")] cache_dir: Option, /// Requested OCR languages (default: eng) #[arg(long, value_delimiter = ',')] lang: Vec, }, } #[derive(Subcommand)] enum SdkCommands { /// Generate SDK skeleton from templates Codegen { /// Target language #[arg(short, long)] lang: Language, /// Output directory #[arg(short, long)] out: PathBuf, /// Version string (defaults to current pdftract version) #[arg(short, long, default_value = "0.1.0")] version: String, }, /// Validate existing SDK against current generator output Validate { /// Target language #[arg(short, long)] lang: Language, /// Path to existing SDK directory #[arg(short, long)] sdk_dir: PathBuf, }, } #[derive(Subcommand)] enum CacheCommands { /// Show cache statistics Stats { /// Path to the cache directory dir: PathBuf, /// Output in JSON format #[arg(long)] json: bool, }, /// Clear all cache entries (preserves index.json and sentinel) Clear { /// Path to the cache directory dir: PathBuf, /// Skip confirmation prompt #[arg(short, long)] yes: bool, }, /// Purge old cache entries Purge { /// Path to the cache directory dir: PathBuf, /// Delete entries older than this duration (e.g., "30d", "7d", "1h") #[arg(long, value_name = "DURATION")] older_than: Option, /// Delete entries matching this version constraint (e.g., "<1.0.0") #[arg(long, value_name = "CONSTRAINT")] version: Option, }, } #[derive(Subcommand)] enum ProfilesCommands { /// List all available profiles List, /// Show a profile's YAML content Show { /// Profile name or path to YAML file name_or_path: String, }, /// Export a built-in profile to stdout Export { /// Name of the built-in profile to export name: String, }, /// Install a profile to the user config directory Install { /// Path to the profile YAML file to install path: PathBuf, }, /// Validate a 
profile file Validate { /// Path to the profile YAML file to validate path: PathBuf, }, } fn main() -> Result<()> { // Install panic hook for SecretString redaction in backtraces // This ensures credentials never leak in crash dumps panic_hook::install_panic_hook(); let cli = Cli::parse(); match cli.command { Commands::ListDiagnostics => { cmd_list_diagnostics()?; } Commands::ExplainDiagnostic { code } => { cmd_explain_diagnostic(&code)?; } Commands::Compare { actual, expected, tolerances, format, } => { cmd_compare(actual, expected, tolerances, &format)?; } Commands::Conformance { suite, sdk, version, output, } => { cmd_conformance(suite, &sdk, &version, output)?; } Commands::Sdk { sdk_command } => { cmd_sdk(sdk_command)?; } Commands::Extract { input, password_stdin, password, header, pages, json, md, text, ndjson, format, receipts, ocr, ocr_language, cache_dir, cache_size, no_cache, md_anchors, auto, profile, output, include_headers, include_footers, include_headers_footers, include_invisible_text, include_hidden_layers, include_watermarks, } => { if let Err(e) = cmd_extract( input, password_stdin, password, header, pages, json.into_iter().collect(), md.into_iter().collect(), text.into_iter().collect(), ndjson, format, output, &receipts, ocr, ocr_language, cache_dir, &cache_size, no_cache, md_anchors, auto, profile, include_headers, include_footers, include_headers_footers, include_invisible_text, include_hidden_layers, include_watermarks, ) { let error_msg = e.to_string(); eprintln!("Error: {}", error_msg); // Exit code 3 for encryption errors (per spec) if error_msg.contains("decryption failed") || error_msg.contains("PDF decryption failed") || error_msg.contains("Unsupported encryption") || error_msg.contains("Wrong password") { std::process::exit(3); } std::process::exit(1); } } Commands::Classify { input, password_stdin, password, profiles, pretty, top_k, exit_on_unknown, } => { if let Err(e) = cmd_classify( input, password_stdin, password, profiles, pretty, 
top_k, exit_on_unknown, ) { let error_msg = e.to_string(); eprintln!("Error: {}", error_msg); // Exit code 3 for encryption errors (per spec) if error_msg.contains("decryption failed") || error_msg.contains("PDF decryption failed") || error_msg.contains("Unsupported encryption") || error_msg.contains("Wrong password") { std::process::exit(3); } std::process::exit(1); } } #[cfg(feature = "grep")] Commands::Grep(args) => { if let Err(e) = grep::run_grep(args) { eprintln!("Error: {}", e); std::process::exit(1); } } Commands::Inspect(args) => { if let Err(e) = cmd_inspect(args) { eprintln!("Error: {}", e); std::process::exit(1); } } Commands::Cache { cache_command } => { if let Err(e) = cmd_cache(cache_command) { eprintln!("Error: {}", e); std::process::exit(1); } } Commands::Profiles { profiles_command } => { if let Err(e) = cmd_profiles(profiles_command) { eprintln!("Error: {}", e); std::process::exit(1); } } Commands::Serve { bind, cache_dir, cache_size, no_cache, max_upload_mb, max_decompress_gb, audit_log, trust_forwarded_for, profile_dir, profile_hot_reload, } => { if let Err(e) = cmd_serve( bind, cache_dir, &cache_size, no_cache, max_upload_mb, max_decompress_gb, audit_log, trust_forwarded_for, ) { eprintln!("Error: {}", e); std::process::exit(1); } } Commands::VerifyReceipt(cmd) => { if let Err(e) = verify_receipt::run_verify_receipt(cmd) { eprintln!("Error: {}", e); std::process::exit(1); } } Commands::Hash { input, password, header, } => { // Parse and validate custom HTTP headers let headers = if !header.is_empty() { match header::parse_headers(&header) { Ok(h) => { // Check if input is a URL (https:// or http://) if input.starts_with("http://") || input.starts_with("https://") { // Convert HashMap to Vec for HashArgs h.into_iter().collect() } else { // Local file: headers don't apply Vec::new() } } Err(e) => { eprintln!("Error: {}", e); std::process::exit(2); } } } else { Vec::new() }; let args = hash::HashArgs { input, password, headers, }; if let Err(e) = 
hash::run_hash(args) { let exit_code = hash::map_error_to_exit_code(&e); eprintln!("Error: {}", e); std::process::exit(exit_code); } } Commands::Mcp { stdio, bind, auth_token_file, auth_token, max_upload_mb, root, audit_log, } => { // Per ADR-006: exactly one transport must be selected. // If neither --stdio nor --bind is specified, default to stdio mode. let use_stdio = stdio || bind.is_none(); // Validate and canonicalize the root directory if provided let root_path = match root { Some(ref root_arg) => match mcp::canonicalize_root(root_arg) { Ok(canonical) => Some(canonical), Err(e) => { eprintln!("Error: {}", e); std::process::exit(1); } }, None => None, }; // Report root configuration if let Some(ref root) = root_path { eprintln!( "Root directory: {} (path-traversal protection enabled)", root.display() ); } else { eprintln!("No root directory (trust-the-caller mode)"); } if use_stdio { // stdio mode (default for Claude Desktop, Claude Code, etc.) if let Err(e) = mcp::run_stdio(root_path.as_deref(), audit_log.as_deref()) { eprintln!("Error: {}", e); std::process::exit(1); } } else { // HTTP mode (--bind was specified) let bind_addr = bind.expect("--bind is Some when use_stdio is false"); if let Err(e) = mcp::run( bind_addr, auth_token_file, auth_token, Some(max_upload_mb), root_path, audit_log, ) { eprintln!("Error: {}", e); std::process::exit(1); } } } Commands::Validate { file, schema, quiet, } => { if let Err(e) = validate::run_validate(validate::ValidateArgs { file, schema_path: schema, quiet, }) { // Validation failed - exit 1 (error already printed by run_validate unless quiet) if !quiet { eprintln!("Error: {}", e); } std::process::exit(1); } } Commands::MigrateSchema { from, to, input, output, pretty, } => { if let Err(e) = migrate::run_migration(&from, &to, &input, &output, pretty) { eprintln!("Error: {}", e); std::process::exit(1); } } Commands::Doctor { features, json, no_color, exit_on_fail, profile_dir, cache_dir, lang, } => { if let Err(e) = 
doctor::run(doctor::DoctorOptions {
                features,
                json,
                no_color,
                exit_on_fail,
                profile_dir,
                cache_dir,
                lang,
            }) {
                eprintln!("Error: {}", e);
                std::process::exit(1);
            }
        }
    }

    Ok(())
}

/// Runs the `extract` subcommand: validates the options, extracts the PDF (local
/// file or remote URL) and writes every requested output.
///
/// Usage errors (bad receipts mode, bad output configuration, unavailable feature,
/// bad header, bad URL, password resolution failure) print a message and terminate
/// the process directly; extraction and I/O failures are returned as `Err`.
///
/// All progress and diagnostic messages go to stderr.
// NOTE(review): the generic arguments of the `Option` / `Vec` parameter types are
// missing from this signature (e.g. `Option` with no `<T>`). They are left exactly
// as found; restore them from the authoritative source before compiling.
fn cmd_extract(
    input: PathBuf,
    password_stdin: bool,
    password: Option,
    header: Vec,
    pages: Option,
    json: Vec,
    md: Vec,
    text: Vec,
    ndjson: bool,
    format: Vec,
    output: Option,
    receipts: &str,
    ocr: bool,
    ocr_language: Vec,
    cache_dir: Option,
    cache_size: &str,
    no_cache: bool,
    md_anchors: bool,
    auto: bool,
    profile: Option,
    include_headers: bool,
    include_footers: bool,
    include_headers_footers: bool,
    include_invisible_text: bool,
    include_hidden_layers: bool,
    include_watermarks: bool,
) -> Result<()> {
    // Validate receipts mode
    let receipts_mode = match ReceiptsMode::from_str(receipts) {
        Ok(mode) => mode,
        Err(e) => {
            eprintln!("Error: {}", e);
            std::process::exit(2);
        }
    };

    // Validate output configuration. `format` and `output` are not used again in
    // this function, so they are moved rather than cloned.
    let output_config = OutputConfig {
        json,
        md,
        text,
        ndjson,
        format_list: format,
        output_base: output,
    };
    let output_specs = match output_config.build_specs() {
        Ok(specs) => specs,
        Err(e) => {
            eprintln!("Error: {}", e);
            std::process::exit(2);
        }
    };

    // Report what outputs will be produced
    if output_specs.len() > 1 {
        eprintln!("Producing {} outputs:", output_specs.len());
        for spec in &output_specs {
            let dest_name = match &spec.dest {
                output::Destination::Stdout => "stdout".to_string(),
                output::Destination::File(p) => p.display().to_string(),
            };
            eprintln!(" {} -> {}", spec.format.name(), dest_name);
        }
    }

    // Check if SVG mode is requested but feature is not available
    if receipts_mode == ReceiptsMode::SvgClip {
        #[cfg(not(feature = "receipts"))]
        {
            eprintln!("Error: --receipts=svg requires the 'receipts' feature to be enabled");
            eprintln!("Build pdftract with: --features receipts");
            std::process::exit(2);
        }
    }

    // Check if OCR is requested but feature is not available
    if ocr {
        #[cfg(not(feature = "ocr"))]
        {
            eprintln!("Error: --ocr requires the 'ocr' feature to be enabled");
            eprintln!("Build pdftract with: --features ocr");
            std::process::exit(2);
        }
    }

    // Resolve password using the priority order defined in TH-07
    let resolved_password = match password::resolve_password(password_stdin, password) {
        Ok(pwd) => pwd,
        Err(e) => {
            eprintln!("Error: {}", e);
            std::process::exit(password::EXIT_USAGE_ERROR as i32);
        }
    };

    // Report password status (never the value itself)
    if resolved_password.is_some() {
        eprintln!("Password provided via secure channel");
    }

    // Check if input is a URL
    let input_str = input.to_string_lossy().into_owned();
    let is_url = input_str.starts_with("http://") || input_str.starts_with("https://");

    // Parse and validate custom HTTP headers. They are always validated, but only
    // kept when the input is a URL.
    let custom_headers = if header.is_empty() {
        HashMap::new()
    } else {
        match header::parse_headers(&header) {
            Ok(parsed_headers) if is_url => {
                eprintln!("Custom HTTP headers: {}", parsed_headers.len());
                parsed_headers
            }
            // Local file: headers don't apply, but we don't error
            Ok(_) => HashMap::new(),
            Err(e) => {
                eprintln!("Error: {}", e);
                std::process::exit(2);
            }
        }
    };

    // Parse URL credentials if present
    let (url_for_source, parsed_url) = if is_url {
        match url::parse_url(&input_str) {
            Ok(parsed) => {
                if parsed.has_credentials {
                    eprintln!("Warning: URL contains credentials that are visible in shell history.");
                    eprintln!("Consider using --header 'Authorization: Bearer TOKEN' instead.");
                }
                (parsed.url.clone(), Some(parsed))
            }
            Err(e) => {
                eprintln!("Error parsing URL: {}", e);
                std::process::exit(2);
            }
        }
    } else {
        (input_str.clone(), None)
    };

    // Build extraction options
    let mut options = ExtractionOptions::with_receipts(receipts_mode);

    // Configure password
    options.password = resolved_password;

    // Configure page range
    options.pages = pages;

    // Configure output filtering options; --include-headers-footers implies both.
    options.output.include_headers = include_headers || include_headers_footers;
    options.output.include_footers = include_footers || include_headers_footers;
    options.output.include_invisible = include_invisible_text;
    options.output.include_hidden_layers = include_hidden_layers;
    options.output.include_watermarks = include_watermarks;

    // Handle --auto flag: run classifier first
    #[cfg(feature = "profiles")]
    if auto {
        eprintln!("Auto-detecting document type...");

        use pdftract_core::profiles::{
            apply_extraction_tuning, classify_and_select_profile, load_extraction_profiles,
        };

        // Load all extraction profiles
        let profiles = load_extraction_profiles(&[]).unwrap_or_default();

        if !profiles.is_empty() {
            // Extraction pass used only for classification. It runs with default
            // options, so the resolved password and page range are not applied.
            // NOTE(review): this also receives `input` unchanged when it is a URL —
            // confirm that is intended.
            let classify_options = ExtractionOptions::default();
            if let Ok(classify_result) = extract_pdf(&input, &classify_options) {
                let has_signature_field = !classify_result.signatures.is_empty();
                let has_form_field = !classify_result.form_fields.is_empty();

                let page_data: Vec<(Vec<_>, Vec<_>)> = classify_result
                    .pages
                    .iter()
                    .map(|p| (p.blocks.clone(), p.spans.clone()))
                    .collect();

                let selected_profile = classify_and_select_profile(
                    &profiles
                        .iter()
                        .map(|p| p.profile.clone())
                        .collect::<Vec<_>>(),
                    &page_data,
                    has_signature_field,
                    has_form_field,
                );

                if let Some((profile, match_result)) = selected_profile {
                    eprintln!(
                        "Document type: {} (confidence: {:.2})",
                        profile.name, match_result.confidence
                    );

                    // Apply profile extraction tuning
                    if let Some(ref tuning) = profile.extraction {
                        apply_extraction_tuning(tuning, &mut options);
                    }

                    // Profile fields are extracted after the main extraction; here
                    // only the first five match reasons are logged.
                    for reason in match_result.reasons.iter().take(5) {
                        eprintln!(" - {}", reason);
                    }
                } else {
                    eprintln!("Document type: unknown (confidence: below threshold)");
                    eprintln!("Proceeding with default extraction options.");
                }
            } else {
                eprintln!(
                    "Warning: Classification failed. Proceeding with default extraction options."
                );
            }
        } else {
            eprintln!(
                "Warning: No profiles available. Proceeding with default extraction options."
            );
        }
    }

    // Handle --profile flag: load and apply specific profile
    #[cfg(feature = "profiles")]
    if let Some(ref profile_name_or_path) = profile {
        use pdftract_core::profiles::{apply_extraction_tuning, load_extraction_profiles};

        eprintln!("Applying profile: {}", profile_name_or_path);

        let profiles = load_extraction_profiles(&[]).unwrap_or_default();

        // An existing filesystem path wins over a profile name.
        let requested = if std::path::PathBuf::from(profile_name_or_path).exists() {
            // Load from file path
            use pdftract_core::profiles::load_profile_file;
            match load_profile_file(&std::path::PathBuf::from(profile_name_or_path)) {
                Ok(p) => Some(p),
                Err(e) => {
                    eprintln!("Error loading profile: {}", e);
                    std::process::exit(1);
                }
            }
        } else {
            // Find by name
            profiles
                .iter()
                .find(|p| p.profile.name == *profile_name_or_path)
                .map(|p| p.profile.clone())
        };

        if let Some(p) = requested {
            eprintln!("Loaded profile: {}", p.name);
            if let Some(ref tuning) = p.extraction {
                apply_extraction_tuning(tuning, &mut options);
            }
        } else {
            eprintln!("Error: Profile '{}' not found", profile_name_or_path);
            std::process::exit(1);
        }
    }

    #[cfg(not(feature = "profiles"))]
    if auto {
        eprintln!("Warning: --auto flag requires the 'profiles' feature to be enabled.");
        eprintln!("Build pdftract with: --features profiles");
        eprintln!("Proceeding with default extraction options.");
    }

    #[cfg(not(feature = "profiles"))]
    if profile.is_some() {
        eprintln!("Warning: --profile flag requires the 'profiles' feature to be enabled.");
        eprintln!("Build pdftract with: --features profiles");
        eprintln!("Proceeding with default extraction options.");
    }

    // Set markdown anchors option
    options.markdown_anchors = md_anchors;
    if md_anchors {
        eprintln!("Markdown anchors enabled");
    }

    // Set OCR language if specified
    if !ocr_language.is_empty() {
        options.ocr_language = ocr_language;
        eprintln!("OCR languages: {}", options.ocr_language.join("+"));
    } else if ocr {
        // OCR enabled but no language specified, use default (eng)
        eprintln!("OCR enabled with default language: eng");
    }

    // Prepare the cache directory; --no-cache wins over --cache-dir.
    let cache_dir_ref = match cache_dir {
        Some(ref dir) if !no_cache => {
            if !dir.exists() {
                // `with_context` builds the message only when creation actually fails.
                fs::create_dir_all(dir).with_context(|| {
                    format!("Failed to create cache directory: {}", dir.display())
                })?;
            }

            // Initialize the cache index if it doesn't exist. This stays best-effort
            // (extraction continues), but a failure is reported instead of discarded.
            if !cache::layout::index_path(dir).exists() {
                if let Err(e) =
                    cache::layout::save_index(dir, &cache::layout::CacheIndex::default())
                {
                    eprintln!(
                        "Warning: failed to initialize cache index in {}: {}",
                        dir.display(),
                        e
                    );
                }
            }
            Some(dir.as_path())
        }
        _ => None,
    };

    // Parse cache size (only meaningful, and only validated, when the cache is in use)
    let cache_size_bytes = if cache_dir_ref.is_some() {
        Some(parse_size(cache_size)?)
    } else {
        None
    };

    // Perform extraction (with different paths for URLs vs local files)
    let (mut result, cache_status, cache_age) = if is_url {
        // Remote extraction path
        #[cfg(not(feature = "remote"))]
        {
            eprintln!("Error: Remote sources require the 'remote' feature to be enabled");
            eprintln!("Build pdftract with: --features remote");
            std::process::exit(2);
        }

        #[cfg(feature = "remote")]
        {
            use pdftract_core::source::HttpRangeSource;

            let headers_vec: Vec<(String, String)> = custom_headers.into_iter().collect();

            // NOTE(review): the original comments said both that this is the URL "with
            // credentials stripped" and that ureq derives basic auth from credentials
            // in the URL — these cannot both hold; confirm what `parsed.url` contains.
            let extraction_url = if let Some(ref parsed) = parsed_url {
                parsed.url.clone()
            } else {
                url_for_source.clone()
            };

            let source = HttpRangeSource::with_headers(&extraction_url, headers_vec)
                .context("Failed to open remote PDF source")?;

            use pdftract_core::extract::{extract_pdf_from_source, ExtractionSource};
            let extraction_source = ExtractionSource::Remote(Box::new(source));

            let result = extract_pdf_from_source(extraction_source, &options)
                .context("Failed to extract PDF from remote source")?;

            (result, "skipped".to_string(), None) // Cache not applicable for remote
        }
    } else {
        // Local file extraction path (with cache)
        cache::extract_with_cache(&input, &options, cache_dir_ref, no_cache, cache_size_bytes)
            .context("Failed to extract PDF")?
    };

    // Set cache status metadata
    result.metadata.cache_status = Some(cache_status);
    result.metadata.cache_age_seconds = cache_age;

    // Extract profile fields if --auto or --profile was used
    #[cfg(feature = "profiles")]
    {
        use pdftract_core::profiles::{apply_profile_to_metadata, load_extraction_profiles};

        // When both flags are given, --auto decides which profile's fields are reported.
        let profile_to_apply = if auto {
            // Re-run classification, this time on the final extraction result
            let profiles = load_extraction_profiles(&[]).unwrap_or_default();
            let page_data: Vec<(Vec<_>, Vec<_>)> = result
                .pages
                .iter()
                .map(|p| (p.blocks.clone(), p.spans.clone()))
                .collect();
            let has_signature_field = !result.signatures.is_empty();
            let has_form_field = !result.form_fields.is_empty();

            use pdftract_core::profiles::classify_and_select_profile;
            classify_and_select_profile(
                &profiles
                    .iter()
                    .map(|p| p.profile.clone())
                    .collect::<Vec<_>>(),
                &page_data,
                has_signature_field,
                has_form_field,
            )
            .map(|(p, _)| p)
        } else if let Some(profile_name_or_path) = profile.as_ref() {
            // Load the specified profile
            let profiles = load_extraction_profiles(&[]).unwrap_or_default();
            if std::path::PathBuf::from(profile_name_or_path).exists() {
                use pdftract_core::profiles::load_profile_file;
                // A load failure here yields no profile fields; the same file was
                // already loaded (with a hard error) before extraction.
                load_profile_file(&std::path::PathBuf::from(profile_name_or_path)).ok()
            } else {
                profiles
                    .iter()
                    .find(|p| p.profile.name == *profile_name_or_path)
                    .map(|p| p.profile.clone())
            }
        } else {
            None
        };

        // Apply profile to metadata
        if let Some(p) = profile_to_apply {
            let (name, version, fields) = apply_profile_to_metadata(&p, &result.pages);

            // Update the result's metadata with profile information
            result.metadata.profile_name = Some(name);
// NOTE: everything down to the first top-level `}` below is the tail of `cmd_extract`, whose
// head lies above this block. It is carried over unchanged.
            result.metadata.profile_version = Some(version);
            result.metadata.profile_fields = fields;
        }
    }

    // Write each output to its destination
    for spec in &output_specs {
        match spec.dest {
            output::Destination::Stdout => {
                // Write to stdout
                write_output(&result, &options, spec.format, &mut std::io::stdout())?;
            }
            output::Destination::File(ref path) => {
                // Create atomic file writer for file output
                let mut writer = AtomicFileWriter::create(path)
                    .context(format!("Failed to create output file writer: {}", path.display()))?;
                write_output(&result, &options, spec.format, &mut writer)?;
                writer.commit().context(format!("Failed to commit output file: {}", path.display()))?;
            }
        }
    }

    Ok(())
}

/// Write `result` to `writer` in the requested output `format`.
///
/// * `Json` - pretty-printed output of `result_to_json`, followed by a newline.
/// * `Text` - the text of every span, one per line, page by page.
/// * `Markdown` - each page rendered through `page_to_markdown_with_links`; a page break is
///   requested after every page except the last, and a "Signatures" section is appended when
///   the result carries signatures.
/// * `Ndjson` - one JSON object per block, holding the block's resolved spans.
///
/// # Errors
/// Fails when JSON serialization fails or when writing to `writer` fails.
fn write_output<W: std::io::Write>(
    result: &pdftract_core::ExtractionResult,
    options: &ExtractionOptions,
    format: output::Format,
    writer: &mut W,
) -> Result<()> {
    use std::io::Write;

    match format {
        output::Format::Json => {
            let json_output = result_to_json(result);
            let json_str = serde_json::to_string_pretty(&json_output)?;
            writeln!(writer, "{}", json_str)?;
        }
        output::Format::Text => {
            // Plain text output: concatenate all span texts
            for page in &result.pages {
                for span in &page.spans {
                    writeln!(writer, "{}", span.text)?;
                }
            }
        }
        output::Format::Markdown => {
            // Markdown output: simple conversion with optional anchors
            let include_anchors = options.markdown_anchors;
            let include_page_breaks = true; // Add --- between pages
            let page_count = result.pages.len();

            for (page_idx, page) in result.pages.iter().enumerate() {
                let is_last_page = page_idx + 1 == page_count;
                let include_break = include_page_breaks && !is_last_page;

                // Filter links to only those belonging to this page.
                // NOTE(review): links are matched against the position in `result.pages`, while
                // the renderer below receives `page.index`; confirm the two agree when only a
                // page sub-range was extracted.
                let page_links: Vec<_> = result
                    .links
                    .iter()
                    .filter(|link| link.page_index == page_idx)
                    .cloned()
                    .collect();

                // Use markdown module with inline link support (Phase 6.5.5b)
                let md_options = MarkdownOptions {
                    include_headers_footers: options.output.include_headers
                        || options.output.include_footers,
                    include_watermarks: options.output.include_watermarks,
                    include_page_breaks: include_break,
                };

                let md = page_to_markdown_with_links(
                    &page.blocks,
                    &page.spans,
                    &page.tables,
                    &page_links,
                    page.index,
                    include_anchors,
                    &md_options,
                );
                write!(writer, "{}", md)?;
            }

            // Emit signatures footer if any signatures exist
            if !result.signatures.is_empty() {
                writeln!(writer, "\n## Signatures\n")?;
                for sig in &result.signatures {
                    writeln!(writer, "- **{}**: {}", sig.field_name, sig.signer_name)?;
                    if let Some(date) = &sig.signing_date {
                        writeln!(writer, " - Date: {}", date)?;
                    }
                    if let Some(reason) = &sig.reason {
                        writeln!(writer, " - Reason: {}", reason)?;
                    }
                    if let Some(location) = &sig.location {
                        writeln!(writer, " - Location: {}", location)?;
                    }
                    if let Some(sub_filter) = &sig.sub_filter {
                        writeln!(writer, " - Format: {}", sub_filter)?;
                    }
                    writeln!(writer, " - Validation Status: {}", sig.validation_status)?;
                }
            }
        }
        output::Format::Ndjson => {
            // NDJSON output: emit one line per block with spans
            for page in &result.pages {
                for (block_idx, block) in page.blocks.iter().enumerate() {
                    let ndjson_record = serde_json::json!({
                        "page": page.index,
                        "block_index": block_idx,
                        "kind": block.kind,
                        "bbox": block.bbox,
                        // Span indices that do not resolve within the page are skipped.
                        "spans": block.spans.iter().filter_map(|&span_idx| {
                            page.spans.get(span_idx).map(|span| {
                                serde_json::json!({
                                    "text": span.text,
                                    "font": span.font,
                                    "size": span.size,
                                    "bbox": span.bbox,
                                })
                            })
                        }).collect::<Vec<_>>(),
                    });
                    writeln!(writer, "{}", ndjson_record)?;
                }
            }
        }
    }

    Ok(())
}

/// Run the `classify` subcommand and print its result as JSON on stdout.
///
/// The password is resolved first; a resolution failure prints the error and exits the
/// process with `password::EXIT_USAGE_ERROR`.
///
/// NOTE(review): the generic arguments of `password` and `profiles_dir` were lost in the copy
/// this edit was made from; `Option<String>` / `Option<PathBuf>` are assumed - confirm against
/// `password::resolve_password` and `classify::ClassifyArgs`.
fn cmd_classify(
    input: PathBuf,
    password_stdin: bool,
    password: Option<String>,
    profiles_dir: Option<PathBuf>,
    pretty: bool,
    top_k: usize,
    exit_on_unknown: bool,
) -> Result<()> {
    // Resolve password using the priority order defined in TH-07
    let resolved_password = match password::resolve_password(password_stdin, password) {
        Ok(pwd) => pwd,
        Err(e) => {
            eprintln!("Error: {}", e);
            std::process::exit(password::EXIT_USAGE_ERROR as i32);
        }
    };

    // Report password status (never the value itself)
    if resolved_password.is_some() {
        eprintln!("Password provided via secure channel");
    }
    // NOTE(review): the resolved password is only reported here; it is not passed on to
    // `ClassifyArgs`. Confirm whether classification of encrypted PDFs is meant to use it.

    // Run classification
    let args = classify::ClassifyArgs {
        input,
        profiles_dir,
        pretty,
        top_k,
        exit_on_unknown,
    };

    let output = classify::run_classify(args)?;

    // Print JSON output
    let json_str = classify::format_json(&output, pretty);
    println!("{}", json_str);

    Ok(())
}

/// Print the whole diagnostic catalog, grouped by category.
///
/// Categories are printed in a fixed preferred order. Any category present in
/// `DIAGNOSTIC_CATALOG` but absent from that list is printed afterwards in alphabetical
/// order, so the entries shown always add up to the reported total.
fn cmd_list_diagnostics() -> Result<()> {
    // Preferred display order for the known categories.
    const CATEGORY_ORDER: [&str; 12] = [
        "STRUCT",
        "XREF",
        "STREAM",
        "ENCRYPTION",
        "PAGE",
        "FONT",
        "OCR",
        "REMOTE",
        "GSTATE",
        "LAYOUT",
        "MCP",
        "CACHE",
    ];

    println!("pdftract Diagnostic Codes");
    println!();
    println!("This catalog lists all diagnostic codes emitted during PDF parsing and extraction.");
    println!("Each diagnostic includes a severity level, recoverable flag, phase origin, and suggested action.");
    println!();

    // Group by category
    let mut categories: std::collections::HashMap<&str, Vec<&DiagInfo>> =
        std::collections::HashMap::new();
    for info in DIAGNOSTIC_CATALOG {
        categories.entry(info.category).or_default().push(info);
    }

    // Categories the fixed list does not know about; sorted so the output is stable.
    let mut unlisted: Vec<&str> = categories
        .keys()
        .copied()
        .filter(|category| !CATEGORY_ORDER.contains(category))
        .collect();
    unlisted.sort_unstable();

    let print_category = |category: &str| {
        if let Some(infos) = categories.get(category) {
            println!("=== {}_* codes ===", category);
            println!();

            for info in infos {
                println!("{} ({})", info.code, info.severity);
                println!(" Phase: {}", info.phase);
                println!(
                    " Recoverable: {}",
                    if info.recoverable { "Yes" } else { "No" }
                );
                println!(" Action: {}", info.suggested_action);
                println!();
            }
        }
    };

    for &category in CATEGORY_ORDER.iter() {
        print_category(category);
    }
    for &category in unlisted.iter() {
        print_category(category);
    }

    println!("Total: {} diagnostic codes", DIAGNOSTIC_CATALOG.len());

    Ok(())
}

/// Print a detailed explanation of one diagnostic code.
///
/// `code` is matched case-insensitively (surrounding whitespace ignored) against
/// `DiagCode::name()` of every catalog entry.
///
/// # Errors
/// Returns an error when no catalog entry has that name.
fn cmd_explain_diagnostic(code: &str) -> Result<()> {
    // Normalize the input code (handle case-insensitivity and strip whitespace)
    let code_upper = code.trim().to_uppercase();

    // Try to find the diagnostic by name in the catalog
    let info = DIAGNOSTIC_CATALOG
        .iter()
        .find(|info| info.code.name() == code_upper)
        .ok_or_else(|| anyhow::anyhow!("Unknown diagnostic code: {}", code))?;

    println!("Diagnostic: {}", info.code);
    println!("Category: {}", info.category);
    println!("Severity: {}", info.severity);
    println!(
        "Recoverable: {}",
        if info.recoverable { "Yes" } else { "No" }
    );
    println!("Phase Origin: {}", info.phase);
    println!();
    println!("Description:");

    // Doc comments on `DiagCode` are not available at runtime, so the description text is
    // kept here: a summary line followed by a detail line for each known code.
    let description: &[&str] = match info.code {
        DiagCode::StructInvalidName => &[
            " Invalid name character or malformed name object",
            " Names containing invalid characters or exceeding the 127-byte limit are truncated.",
        ],
        DiagCode::StructInvalidHex => &[
            " Invalid hexadecimal character in hex string or name escape",
            " Non-hex characters in <...> strings or #XX escapes are skipped.",
        ],
        DiagCode::StructInvalidOctal => &[
            " Invalid octal escape sequence in literal string",
            " Invalid \\NNN escapes are passed through literally.",
        ],
        DiagCode::StructInvalidStreamHeader => &[
            " Invalid stream header",
            " The 'stream' keyword must be followed by CRLF or LF per PDF spec.",
        ],
        DiagCode::StructUnexpectedByte => &[
            " Unexpected byte during parsing",
            " A byte doesn't match expected token syntax; lexer resynchronizes.",
        ],
        DiagCode::StructUnexpectedEof => &[
            " Unexpected end of file",
            " The file ends mid-token; parsing continues with partial data.",
        ],
        DiagCode::StructUnterminatedString => &[
            " Unterminated literal string",
            " A literal string is missing a closing parenthesis.",
        ],
        DiagCode::StructMissingKey => &[
            " Missing required dictionary key",
            " A required key is absent from a dictionary.",
        ],
        DiagCode::StructCircularRef => &[
            " Circular reference detected",
            " An indirect reference forms a cycle (A → B → A).",
        ],
        DiagCode::StructXobjectCycle => &[
            " Form XObject cycle detected",
            " A form XObject invokes itself directly or indirectly.",
        ],
        DiagCode::StructDepthExceeded => &[
            " Dictionary nesting depth exceeds limit",
            " Structure is too deeply nested; truncated to prevent stack overflow.",
        ],
        DiagCode::StructInvalidDictValue => &[
            " Invalid dictionary value",
            " A dictionary key is not followed by a value.",
        ],
        DiagCode::StructInvalidDictKey => &[
            " Invalid dictionary key",
            " A dictionary key is not a name object.",
        ],
        DiagCode::StructInvalidIndirectHeader => &[
            " Invalid indirect object header",
            " The 'N G obj' header is malformed.",
        ],
        DiagCode::StructIntegerOverflow => &[
            " Integer overflow during parsing",
            " An integer would overflow i64; value is clamped.",
        ],
        DiagCode::StructInvalidObjstm => &[
            " Invalid object stream format",
            " An object stream has a malformed header or invalid data.",
        ],
        DiagCode::StructInvalidGeometry => &[
            " Invalid geometry value",
            " NaN or Inf in MediaBox/CropBox/Rotate; canonicalized to 0.",
        ],
        DiagCode::StructInvalidUtf16 => &[
            " Invalid UTF-16BE encoding",
            " A UTF-16BE string has odd length or invalid encoding.",
        ],
        DiagCode::StructUnresolvedDestination => &[
            " Unresolved named destination",
            " An outline references a named destination (not yet resolved).",
        ],
        DiagCode::StructNonGotoOutline => &[
            " Non-GoTo action in outline",
            " An outline has an action other than GoTo/URI.",
        ],
        DiagCode::StructInvalidPdfDocEncoding => &[
            " Invalid PDFDocEncoding",
            " A PDFDocEncoding string cannot be decoded to UTF-8.",
        ],
        DiagCode::StructHybridConflict => &[
            " Hybrid xref conflict",
            " Traditional xref and stream disagree on object state.",
        ],
        DiagCode::StructInvalidPrevOffset => &[
            " Invalid /Prev offset in xref chain",
            " A trailer's /Prev offset points to invalid data.",
        ],
        DiagCode::XrefInvalidHeader => &[
            " Invalid xref keyword or header",
            " The xref table doesn't start with the 'xref' keyword.",
        ],
        DiagCode::XrefInvalidEntry => &[
            " Malformed xref entry",
            " An xref entry doesn't match the 20-byte format.",
        ],
        DiagCode::XrefInvalidSubsectionHeader => &[
            " Invalid subsection header",
            " An xref subsection header is malformed.",
        ],
        DiagCode::XrefObjectZeroNotFree => &[
            " Object 0 is not free",
            " Object 0 is marked as in-use, violating PDF spec.",
        ],
        DiagCode::XrefTrailerNotFound => &[
            " Trailer dictionary not found",
            " The trailer dictionary couldn't be located or parsed.",
        ],
        DiagCode::XrefTruncated => &[
            " Truncated xref table",
            " The xref table ends unexpectedly.",
        ],
        DiagCode::XrefRepaired => &[
            " Xref was reconstructed",
            " Forward scan recovered xref entries after primary strategies failed.",
        ],
        DiagCode::XrefLinearizedNoForwardScan => &[
            " Forward scan disabled for linearized PDF",
            " Forward scan would incorrectly find the partial first-page xref.",
        ],
        DiagCode::XrefRemoteNoForwardScan => &[
            " Forward scan disabled for remote sources",
            " Forward scan would require fetching the entire file.",
        ],
        DiagCode::XrefInvalidStreamFormat => &[
            " Invalid xref stream format",
            " An xref stream has a malformed header or invalid /W array.",
        ],
        DiagCode::XrefInvalidStreamEntry => &[
            " Invalid xref stream entry",
            " An xref stream entry cannot be parsed due to invalid data.",
        ],
        DiagCode::StreamDecodeError => &[
            " Stream decompression failed",
            " A stream decoder encountered corrupt data mid-decompression.",
        ],
        DiagCode::StreamBomb => &[
            " Decompression bomb limit exceeded",
            " A stream's decompressed size would exceed the safety limit.",
        ],
        DiagCode::StreamUnknownFilter => &[
            " Unknown filter name",
            " A stream specifies an unsupported filter.",
        ],
        DiagCode::StreamInvalidParams => &[
            " Invalid filter parameters",
            " A stream's /DecodeParms dictionary is malformed.",
        ],
        DiagCode::EncryptionUnsupported => &[
            " Unsupported encryption or no password",
            " PDF is encrypted and no password was supplied or algorithm is unsupported.",
        ],
        DiagCode::EncryptionWrongPassword => &[
            " Password incorrect",
            " The supplied password doesn't match the PDF's encryption key.",
        ],
        DiagCode::PageOutOfRange => &[
            " Page number out of range",
            " --pages specifies a page number greater than the document's page count.",
        ],
        DiagCode::PageInvalidCount => &[
            " Invalid page count",
            " The /Count key in the /Pages tree is invalid.",
        ],
        DiagCode::PageInvalidRotate => &[
            " Invalid /Rotate value",
            " A page's /Rotate value is not a multiple of 90.",
        ],
        DiagCode::FontGlyphUnmapped => &[
            " Glyph could not be mapped to Unicode",
            " A glyph has no entry in /ToUnicode CMap, AGL, fingerprint, or shape match.",
        ],
        DiagCode::FontNotFound => &[
            " Font not found or couldn't be parsed",
            " A referenced font is missing from the PDF or couldn't be parsed.",
        ],
        DiagCode::FontInvalidCmap => &[
            " Invalid CMap format",
            " A CMap stream is malformed.",
        ],
        DiagCode::OcrJbig2Unsupported => &[
            " JBIG2 decoder not available",
            " Build with --features full-render to enable JBIG2 decoding.",
        ],
        DiagCode::OcrJpxUnsupported => &[
            " JPEG2000 decoder not available",
            " Build with --features full-render or install libopenjp2.",
        ],
        DiagCode::OcrCcittUnsupported => &[
            " CCITT fax decoder not available",
            " Install libtiff system library or build with --features full-render.",
        ],
        DiagCode::OcrTesseractFailed => &[
            " Tesseract OCR failed",
            " Tesseract crashed or returned an error.",
        ],
        DiagCode::OcrBrokenVectorUnavailable => &[
            " OCR unavailable on broken-vector page",
            " Build with --features ocr to enable OCR recovery.",
        ],
        DiagCode::RemoteFetchInterrupted => &[
            " HTTP fetch interrupted or failed",
            " Network error, timeout, or server error occurred.",
        ],
        DiagCode::RemoteNoRangeSupport => &[
            " Server does not support Range requests",
            " Falls back to downloading the entire file.",
        ],
        DiagCode::RemoteTlsFailed => &[
            " TLS handshake failed",
            " The TLS handshake failed; check the server's certificate.",
        ],
        DiagCode::RemoteDnsFailed => &[
            " DNS resolution failed",
            " The hostname could not be resolved.",
        ],
        DiagCode::GstateStackOverflow => &[
            " Graphics state stack overflow",
            " The graphics state stack exceeded the internal limit.",
        ],
        DiagCode::GstateStackUnderflow => &[
            " Graphics state stack underflow",
            " More Q operators than q operators in the content stream.",
        ],
        DiagCode::GstateBtEtMismatch => &[
            " Mismatched BT/ET pair",
            " The content stream has mismatched BT/ET operators.",
        ],
        DiagCode::CmArgCount => &[
            " Invalid argument count for cm operator",
            " The cm operator requires exactly 6 numeric arguments.",
        ],
        DiagCode::CmDegenerate => &[
            " Degenerate matrix",
            " The cm operator received a degenerate matrix (det=0 or NaN); clamped to identity.",
        ],
        DiagCode::LayoutTaggedPdfDeferred => &[
            " Tagged PDF StructTree deferred",
            " StructTree is ignored; XY-cut is used instead (Phase 7.1 pending).",
        ],
        DiagCode::LayoutReadingOrderAmbiguous => &[
            " Reading order may be incorrect",
            " The reading order algorithm detected ambiguity.",
        ],
        DiagCode::LayoutLowReadability => &[
            " Low readability score",
            " Page readability is below 0.85; may indicate mojibake.",
        ],
        DiagCode::McpToolInvalidParams => &[
            " MCP tool call has invalid parameters",
            " An MCP tool call doesn't match the tool's schema.",
        ],
        DiagCode::McpPathTraversal => &[
            " MCP path traversal attempt",
            " An MCP path escapes the --root directory.",
        ],
        DiagCode::CacheEntryCorrupt => &[
            " Cache entry is corrupted",
            " A cached entry failed to deserialize and was deleted.",
        ],
        DiagCode::CacheWriteFailed => &[
            " Cache write failed",
            " Writing to the cache failed (e.g., out of disk space).",
        ],
        DiagCode::StructInvalidType => &[
            " Invalid object type",
            " An object is not the expected type (e.g., expecting a stream but finding a dictionary).",
        ],
        DiagCode::StructIncompleteCoverage => &[
            " StructTree coverage below threshold",
            " StructTree coverage is below 80% with /Suspects true, triggering XY-cut fallback.",
        ],
        DiagCode::FontParseFailed => &[
            " Font parsing failed",
            " A font file could not be parsed.",
        ],
        DiagCode::FontUnsupported => &[
            " Unsupported font type",
            " A font uses an unsupported format or encoding.",
        ],
        DiagCode::FontCidtogidmapTruncated => &[
            " CIDToGIDMap truncated",
            " A CIDToGIDMap stream is incomplete.",
        ],
        // Codes without a dedicated description get a generic pointer.
        _ => &[" (See diagnostic code)"],
    };
    for line in description {
        println!("{}", line);
    }

    println!();
    println!("Suggested Action: {}", info.suggested_action);
    println!();
    // NOTE(review): "Phase Origin" is already printed in the header above; this second line
    // is kept so the command's output stays unchanged.
    println!("Phase Origin: {}", info.phase);

    Ok(())
}

/// Compare an "actual" JSON document against an "expected" one and report the outcome.
///
/// `tolerances` optionally names a JSON file of per-path numeric tolerances. With
/// `format == "json"` the result map is printed as pretty JSON; any other value selects the
/// text report of `print_compare_result`.
///
/// # Errors
/// Fails when a file cannot be read or is not valid JSON.
fn cmd_compare(
    actual: PathBuf,
    expected: PathBuf,
    tolerances: Option<PathBuf>,
    format: &str,
) -> Result<()> {
    let actual_json = fs::read_to_string(&actual)
        .with_context(|| format!("Failed to read actual results from {:?}", actual))?;
    let actual_val: serde_json::Value =
        serde_json::from_str(&actual_json).context("Failed to parse actual results as JSON")?;

    let expected_json = fs::read_to_string(&expected)
        .with_context(|| format!("Failed to read expected results from {:?}", expected))?;
    let expected_val: serde_json::Value =
        serde_json::from_str(&expected_json).context("Failed to parse expected results as JSON")?;

    let tolerances_val = if let Some(tol_path) = tolerances {
        let tol_json = fs::read_to_string(&tol_path)
            .with_context(|| format!("Failed to read tolerances from {:?}", tol_path))?;
        Some(
            serde_json::from_str::<serde_json::Value>(&tol_json)
                .context("Failed to parse tolerances as JSON")?,
        )
    } else {
        None
    };

    let result = compare_values(&actual_val, &expected_val, tolerances_val.as_ref())?;

    match format {
        "json" => {
            let output = serde_json::to_string_pretty(&result)?;
            println!("{}", output);
        }
        _ => {
            print_compare_result(&result);
        }
    }

    Ok(())
}

/// Dispatch the `sdk` subcommands.
///
/// `Codegen` generates an SDK from the `templates/sdk-skeleton` directory. `Validate`
/// compares an existing SDK directory with what the generator produces, lists every
/// difference, and exits the process with status 1 when any difference is found.
fn cmd_sdk(command: SdkCommands) -> Result<()> {
    match command {
        SdkCommands::Codegen { lang, out, version } => {
            let template_dir = PathBuf::from("templates/sdk-skeleton");
            let mut generator = codegen::CodeGenerator::new(&template_dir, version)?;
            generator.generate(lang, &out)?;
            println!("\nSDK generated successfully to: {}", out.display());
        }
        SdkCommands::Validate { lang, sdk_dir } => {
            let template_dir = PathBuf::from("templates/sdk-skeleton");
            let mut generator = codegen::CodeGenerator::new(&template_dir, "0.1.0".to_string())?;
            let result = generator.validate(lang, &sdk_dir)?;

            if result.differences.is_empty() {
                println!("SDK is up to date with current generator output.");
            } else {
                println!("Found {} differences:", result.differences.len());
                for diff in &result.differences {
                    match diff.kind {
                        codegen::DifferenceKind::MissingInSdk => {
                            println!(" MISSING: {}", diff.path);
                        }
                        codegen::DifferenceKind::ExtraInSdk => {
                            println!(" EXTRA: {}", diff.path);
                        }
                        codegen::DifferenceKind::ContentDiff => {
                            println!(" MODIFIED: {}", diff.path);
                        }
                    }
                }
                std::process::exit(1);
            }
        }
    }

    Ok(())
}

/// Load a conformance suite and write a report for it to `output`.
///
/// This is a stub: every case in the suite's `cases` array is recorded with status `skip`,
/// and the summary counts all cases as skipped.
///
/// # Errors
/// Fails when the suite cannot be read or parsed, has no `cases` array, or the report cannot
/// be written.
fn cmd_conformance(suite: PathBuf, sdk: &str, version: &str, output: PathBuf) -> Result<()> {
    println!("Running conformance suite: {:?}", suite);
    println!("SDK: {} v{}", sdk, version);
    println!("Output: {:?}", output);

    let suite_json = fs::read_to_string(&suite)
        .with_context(|| format!("Failed to read suite from {:?}", suite))?;
    let suite_val: serde_json::Value =
        serde_json::from_str(&suite_json).context("Failed to parse suite as JSON")?;

    let cases = suite_val
        .get("cases")
        .and_then(|v| v.as_array())
        .context("Suite missing 'cases' array")?;

    println!("\nFound {} test cases", cases.len());

    // This is a stub - actual implementation would invoke the SDK
    let results: Vec<serde_json::Value> = cases
        .iter()
        .map(|case| {
            serde_json::json!({
                "id": case.get("id").unwrap_or(&serde_json::json!("unknown")),
                "status": "skip",
                "error": "SDK conformance runner not yet implemented - use language-specific runner"
            })
        })
        .collect();

    let report = serde_json::json!({
        "sdk": sdk,
        "sdk_version": version,
        "suite_version": suite_val.get("version").unwrap_or(&serde_json::json!("unknown")),
        "timestamp": chrono::Utc::now().to_rfc3339(),
        "results": results,
        "summary": {
            "total": results.len(),
            "passed": 0,
            "failed": 0,
            "skipped": results.len(),
            "errors": 0
        }
    });

    fs::write(&output, serde_json::to_string_pretty(&report)?)
        .with_context(|| format!("Failed to write report to {:?}", output))?;

    println!("\nReport written to {:?}", output);

    Ok(())
}

/// Dispatch the `cache` subcommands (`stats`, `clear`, `purge`).
///
/// `purge` needs at least one of `--older-than` / `--version`; without either it prints a
/// usage hint and exits the process with status 2. When both are given, both purges run.
fn cmd_cache(command: CacheCommands) -> Result<()> {
    match command {
        CacheCommands::Stats { dir, json } => {
            let stats = cache_cmd::compute_stats(&dir)?;
            if json {
                cache_cmd::display_stats_json(&stats)?;
            } else {
                cache_cmd::display_stats(&stats);
            }
        }
        CacheCommands::Clear { dir, yes } => {
            cache_cmd::clear_cache(&dir, yes)?;
        }
        CacheCommands::Purge {
            dir,
            older_than,
            version,
        } => {
            if older_than.is_none() && version.is_none() {
                eprintln!("Error: --older-than or --version is required for purge");
                eprintln!("Usage: pdftract cache purge DIR --older-than 30d");
                eprintln!(" pdftract cache purge DIR --version '<1.0.0'");
                std::process::exit(2);
            }

            if let Some(duration) = older_than {
                cache_cmd::purge_cache_older_than(&dir, &duration)?;
            }

            if let Some(constraint) = version {
                cache_cmd::purge_cache_version(&dir, &constraint)?;
            }
        }
    }

    Ok(())
}

/// Translate the CLI-level `ProfilesCommands` into `profiles_cmd::ProfilesCommand` and run it.
fn cmd_profiles(command: ProfilesCommands) -> Result<()> {
    use profiles_cmd::{ProfilesArgs, ProfilesCommand};

    // Convert ProfilesCommands to profiles_cmd::ProfilesCommand
    let profiles_command = match command {
        ProfilesCommands::List => ProfilesCommand::List,
        ProfilesCommands::Show { name_or_path } => ProfilesCommand::Show { name_or_path },
        ProfilesCommands::Export { name } => ProfilesCommand::Export { name },
        ProfilesCommands::Install { path } => ProfilesCommand::Install { path },
        ProfilesCommands::Validate { path } => ProfilesCommand::Validate { path },
    };

    let args = ProfilesArgs {
        command: profiles_command,
    };

    profiles_cmd::run_profiles(args)
}

/// Validate the `serve` options, prepare the cache directory and run the HTTP server.
///
/// Prints a warning when `bind` targets all interfaces, rejects `max_upload_mb` above
/// 4096, parses `cache_size`, creates `cache_dir` when it is given and missing, then blocks
/// on `serve::run` inside a new tokio runtime.
///
/// NOTE(review): the generic arguments of `cache_dir` and `audit_log` were lost in the copy
/// this edit was made from; `Option<PathBuf>` is assumed for both - confirm against
/// `serve::run`.
///
/// # Errors
/// Fails on an over-limit upload size, an unparsable cache size, a cache directory that
/// cannot be created, runtime creation failure, or any error returned by `serve::run`.
fn cmd_serve(
    bind: String,
    cache_dir: Option<PathBuf>,
    cache_size: &str,
    no_cache: bool,
    max_upload_mb: usize,
    max_decompress_gb: usize,
    audit_log: Option<PathBuf>,
    trust_forwarded_for: bool,
) -> Result<()> {
    // Warn if binding to 0.0.0.0 (no auth, exposed to all interfaces)
    if bind.starts_with("0.0.0.0") || bind.starts_with("[::]") {
        eprintln!("*** WARNING: Binding to {} exposes pdftract serve on ALL interfaces.", bind);
        eprintln!("*** pdftract serve has NO BUILT-IN AUTHENTICATION.");
        eprintln!("*** Deploy behind a reverse proxy (nginx, Traefik, Caddy) for production use.");
        eprintln!();
    }

    // Validate hard cap for max_upload_mb (4 GiB)
    const MAX_UPLOAD_MB_HARD_CAP: usize = 4096;
    if max_upload_mb > MAX_UPLOAD_MB_HARD_CAP {
        anyhow::bail!(
            "--max-upload-mb value {} exceeds hard cap of {} MB (4 GiB). \
             This limit prevents integer overflow when computing the byte limit.",
            max_upload_mb,
            MAX_UPLOAD_MB_HARD_CAP
        );
    }

    // Parse cache size
    let cache_size_bytes = parse_size(cache_size)?;

    // Create cache directory if specified
    if let Some(ref dir) = cache_dir {
        if !dir.exists() {
            fs::create_dir_all(dir)
                .with_context(|| format!("Failed to create cache directory: {}", dir.display()))?;
        }
    }

    // Run the HTTP server
    tokio::runtime::Runtime::new()
        .context("Failed to create tokio runtime")?
        .block_on(serve::run(
            bind,
            cache_dir,
            cache_size_bytes,
            no_cache,
            max_upload_mb,
            max_decompress_gb,
            audit_log,
            trust_forwarded_for,
        ))
}

/// Wrapper for the inspect subcommand.
///
/// Creates a tokio runtime and runs the async inspect::run function.
fn cmd_inspect(args: inspect::InspectArgs) -> Result<()> { tokio::runtime::Runtime::new() .context("Failed to create tokio runtime")? .block_on(inspect::run(args)) } /// Parse a size string like "1 GiB", "500 MiB", "2 GiB" into bytes. fn parse_size(size_str: &str) -> Result { let s = size_str.trim().to_lowercase(); let multiplier = if s.ends_with("gib") || s.ends_with("gb") || s.ends_with("g") { 1024 * 1024 * 1024 } else if s.ends_with("mib") || s.ends_with("mb") || s.ends_with("m") { 1024 * 1024 } else if s.ends_with("kib") || s.ends_with("kb") || s.ends_with("k") { 1024 } else { 1 // bytes }; let num_str = s .trim_end_matches("gib") .trim_end_matches("gb") .trim_end_matches("g") .trim_end_matches("mib") .trim_end_matches("mb") .trim_end_matches("m") .trim_end_matches("kib") .trim_end_matches("kb") .trim_end_matches("k") .trim() .replace('_', ""); let num: f64 = num_str .parse() .context(format!("Invalid size value: {}", size_str))?; Ok((num * multiplier as f64) as u64) } #[derive(Debug, serde::Serialize)] enum CompareResult { Pass, Fail { reason: String }, Missing, } fn compare_values( actual: &serde_json::Value, expected: &serde_json::Value, tolerances: Option<&serde_json::Value>, ) -> Result> { let mut results = std::collections::HashMap::new(); compare_recursive(actual, expected, tolerances, "", &mut results); Ok(results) } fn compare_recursive( actual: &serde_json::Value, expected: &serde_json::Value, tolerances: Option<&serde_json::Value>, path: &str, results: &mut std::collections::HashMap, ) { match (actual, expected) { // Handle min/max constraints (serde_json::Value::Number(act), serde_json::Value::Object(exp)) => { if let Some(min) = exp.get("min").and_then(|v| v.as_i64()) { if act.as_i64().map_or(true, |v| v < min) { results.insert( path.to_string(), CompareResult::Fail { reason: format!("value {} is less than minimum {}", act, min), }, ); return; } } if let Some(max) = exp.get("max").and_then(|v| v.as_i64()) { if act.as_i64().map_or(true, |v| v > max) 
{ results.insert( path.to_string(), CompareResult::Fail { reason: format!("value {} is greater than maximum {}", act, max), }, ); return; } } if let Some(val) = exp.get("value") { let tol = find_tolerance(tolerances, path); let result = compare_with_tolerance(act, val, tol); results.insert(path.to_string(), result); } else { results.insert(path.to_string(), CompareResult::Pass); } } // String constraints (serde_json::Value::String(act), serde_json::Value::Object(exp)) => { if let Some(min_len) = exp .get("min_length") .and_then(|v| v.as_u64()) .map(|v| v as usize) { if act.len() < min_len { results.insert( path.to_string(), CompareResult::Fail { reason: format!( "string length {} is less than minimum {}", act.len(), min_len ), }, ); return; } } if let Some(containers) = exp.get("contains").and_then(|v| v.as_array()) { for substring in containers { if let Some(s) = substring.as_str() { if !act.contains(s) { results.insert( path.to_string(), CompareResult::Fail { reason: format!("string does not contain '{}'", s), }, ); return; } } } } results.insert(path.to_string(), CompareResult::Pass); } // Array length constraints (serde_json::Value::Array(act), serde_json::Value::Object(exp)) => { if let Some(min_len) = exp.get("min").and_then(|v| v.as_u64()).map(|v| v as usize) { if act.len() < min_len { results.insert( path.to_string(), CompareResult::Fail { reason: format!( "array length {} is less than minimum {}", act.len(), min_len ), }, ); return; } } if let Some(max_len) = exp.get("max").and_then(|v| v.as_u64()).map(|v| v as usize) { if act.len() > max_len { results.insert( path.to_string(), CompareResult::Fail { reason: format!( "array length {} is greater than maximum {}", act.len(), max_len ), }, ); return; } } results.insert(path.to_string(), CompareResult::Pass); } // Direct comparison (a, e) => { if a == e { results.insert(path.to_string(), CompareResult::Pass); } else { results.insert( path.to_string(), CompareResult::Fail { reason: format!("expected {:?}, got 
{:?}", e, a), }, ); } } } } fn compare_with_tolerance( actual: &serde_json::Number, expected: &serde_json::Value, tolerance: Option<&serde_json::Value>, ) -> CompareResult { let act_val = actual.as_f64().unwrap(); let exp_val = match expected { serde_json::Value::Number(n) => n.as_f64().unwrap(), _ => { return CompareResult::Fail { reason: "expected value is not a number".to_string(), } } }; if let Some(tol) = tolerance { if let Some(obj) = tol.as_object() { if let Some(abs_tol) = obj.get("abs").and_then(|v| v.as_f64()) { let diff = (act_val - exp_val).abs(); if diff <= abs_tol { return CompareResult::Pass; } } if let Some(rel_tol) = obj.get("rel").and_then(|v| v.as_f64()) { let diff = (act_val - exp_val).abs(); let avg = (act_val + exp_val) / 2.0; if avg > 0.0 && diff / avg <= rel_tol { return CompareResult::Pass; } } } } // Direct comparison if (act_val - exp_val).abs() < f64::EPSILON { CompareResult::Pass } else { CompareResult::Fail { reason: format!("numeric mismatch: {} vs {}", act_val, exp_val), } } } fn find_tolerance<'a>( tolerances: Option<&'a serde_json::Value>, path: &str, ) -> Option<&'a serde_json::Value> { let tol = tolerances?; if let Some(obj) = tol.as_object() { // Try exact path match if let Some(val) = obj.get(path) { return Some(val); } // Try wildcard patterns for (key, val) in obj { if key.contains('*') { let pattern = key.replace('*', ".*"); if let Ok(re) = regex::Regex::new(&pattern) { if re.is_match(path) { return Some(val); } } } } } None } fn print_compare_result(results: &std::collections::HashMap) { let mut passed = 0; let mut failed = 0; for (path, result) in results { match result { CompareResult::Pass => { passed += 1; } CompareResult::Fail { reason } => { failed += 1; eprintln!("FAIL [{}]: {}", path, reason); } CompareResult::Missing => { failed += 1; eprintln!("MISSING [{}]: value not found in actual", path); } } } println!("\nComparison complete:"); println!(" Passed: {}", passed); println!(" Failed: {}", failed); if failed > 0 { 
std::process::exit(1); } }