From ad29d9dadc3ee9ea939550f882fc6aae75c7a0b9 Mon Sep 17 00:00:00 2001 From: jedarden Date: Mon, 8 Jun 2026 16:00:28 -0400 Subject: [PATCH] fix(pdftract-1j0f8): prevent newline accumulation in CLI reference generator The gen-cli-reference binary was accumulating extra blank lines after the marker on each regeneration because it preserved all content after the marker (including leading whitespace) and then added its own newlines. Fix: Trim leading whitespace from hand-curated content before appending. Also regenerated cli-reference.md to remove accumulated blank lines. Additionally, the xtask gen_cli_reference binary now builds the reference via pdftract_cli::generate_cli_markdown() instead of a hand-maintained duplicate of the clap command tree (the local generate_cli_reference function is removed), and accepts an optional -o/--output path that is resolved against the workspace root when relative. Closes pdftract-1j0f8 --- .../src/bin/generate-cli-reference.rs | 3 +- docs/user-docs/src/cli-reference.md | 2 - xtask/src/bin/gen_cli_reference.rs | 1191 +---------------- 3 files changed, 47 insertions(+), 1149 deletions(-) diff --git a/crates/pdftract-cli/src/bin/generate-cli-reference.rs b/crates/pdftract-cli/src/bin/generate-cli-reference.rs index c1d686f..95f0392 100644 --- a/crates/pdftract-cli/src/bin/generate-cli-reference.rs +++ b/crates/pdftract-cli/src/bin/generate-cli-reference.rs @@ -54,7 +54,8 @@ fn main() -> Result<(), Box> { let hand_curated_content = if output_path.exists() { let existing = fs::read_to_string(&output_path)?; if let Some(idx) = existing.find(AUTOGEN_END_MARKER) { - Some(existing[idx + AUTOGEN_END_MARKER.len()..].to_string()) + // Trim leading whitespace from curated content to prevent newline accumulation + Some(existing[idx + AUTOGEN_END_MARKER.len()..].trim_start().to_string()) } else { None } diff --git a/docs/user-docs/src/cli-reference.md b/docs/user-docs/src/cli-reference.md index d537a32..847a539 100644 --- a/docs/user-docs/src/cli-reference.md +++ b/docs/user-docs/src/cli-reference.md @@ -612,8 +612,6 @@ Exit code policy: exits 0 if no checks FAIL (WARN does not affect exit code); ex - - ## Hand-Curated Content > **Note:** Any content added after this marker will be preserved diff --git a/xtask/src/bin/gen_cli_reference.rs b/xtask/src/bin/gen_cli_reference.rs 
index 217792f..5f1038f 100644 --- a/xtask/src/bin/gen_cli_reference.rs +++ b/xtask/src/bin/gen_cli_reference.rs @@ -11,14 +11,52 @@ use std::path::PathBuf; const AUTOGEN_END_MARKER: &str = ""; fn main() -> Result<(), Box> { + // Parse CLI arguments + let args: Vec = std::env::args().collect(); + let mut output_path: Option = None; + + let mut i = 1; + while i < args.len() { + match args[i].as_str() { + "--output" | "-o" => { + if i + 1 < args.len() { + output_path = Some(PathBuf::from(&args[i + 1])); + i += 2; + } else { + Err("--output requires a path argument")?; + } + } + "--help" | "-h" => { + println!("Usage: gen_cli_reference [OPTIONS]"); + println!(); + println!("Options:"); + println!(" -o, --output Output path for CLI reference (default: docs/user-docs/src/cli-reference.md)"); + println!(" -h, --help Print this help"); + return Ok(()); + } + _ => { + Err(format!("Unknown argument: {}", args[i]))?; + } + } + } + // Find the workspace root let workspace_root = find_workspace_root(); - // Generate the CLI reference markdown - let generated_markdown = generate_cli_reference(); + // Generate the CLI reference markdown using the actual CLI definition + let generated_markdown = pdftract_cli::generate_cli_markdown(); - // Write to docs/user-docs/src/cli-reference.md - let cli_ref_path = workspace_root.join("docs/user-docs/src/cli-reference.md"); + // Determine output path + let cli_ref_path = if let Some(path) = output_path { + // If path is relative, resolve it from workspace root + if path.is_absolute() { + path + } else { + workspace_root.join(&path) + } + } else { + workspace_root.join("docs/user-docs/src/cli-reference.md") + }; // Create the directory if it doesn't exist if let Some(parent) = cli_ref_path.parent() { @@ -43,7 +81,9 @@ fn main() -> Result<(), Box> { // Add autogen notice at the top final_output.push_str("> This page is auto-generated from the clap command tree.\n"); final_output.push_str("> Run `cargo run --manifest-path=xtask/Cargo.toml --bin 
gen_cli_reference` to regenerate.\n\n"); - final_output.push_str(generated_markdown.trim_end()); + + // Add the generated markdown + final_output.push_str(&generated_markdown); final_output.push_str("\n\n"); final_output.push_str(AUTOGEN_END_MARKER); final_output.push_str("\n\n"); @@ -105,1144 +145,3 @@ fn find_workspace_root() -> PathBuf { } } } - -/// Generate CLI reference markdown using clap-markdown. -/// -/// This function creates a minimal clap Command that matches the pdftract CLI -/// structure and generates comprehensive markdown documentation. -fn generate_cli_reference() -> String { - use clap::{Command, Arg, ArgAction, ValueHint}; - - let mut cmd = Command::new("pdftract") - .about("pdftract CLI - PDF extraction and conformance testing") - .long_about( - "pdftract is a command-line tool for extracting text and structure from PDF files.\n\ - It supports JSON, Markdown, plain text, and NDJSON output formats, with\n\ - advanced features like OCR, document classification, and conformance testing." - ) - .version(env!("CARGO_PKG_VERSION")) - .arg( - Arg::new("help") - .short('h') - .long("help") - .action(ArgAction::Help) - .global(true) - .help("Print help information") - ) - .arg( - Arg::new("version") - .short('V') - .long("version") - .action(ArgAction::Version) - .global(true) - .help("Print version information") - ); - - // extract subcommand - cmd = cmd.subcommand( - Command::new("extract") - .about("Extract text and structure from a PDF file") - .long_about( - "Extract content from PDF files in multiple formats.\n\ - Supports local files, remote URLs, and stdin input." 
- ) - .arg( - Arg::new("input") - .help("Path to the PDF file (use '-' for stdin)") - .value_hint(ValueHint::FilePath) - .required(true) - ) - .arg( - Arg::new("password_stdin") - .long("password-stdin") - .help("Read password from stdin (one line, terminated by newline)") - .conflicts_with("password") - ) - .arg( - Arg::new("password") - .long("password") - .value_name("PASSWORD") - .help("PDF password (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_PASSWORD=1)") - .conflicts_with("password_stdin") - ) - .arg( - Arg::new("header") - .long("header") - .value_name("HEADER:VALUE") - .action(ArgAction::Append) - .help("Custom HTTP headers for remote sources (repeatable; format: HEADER:VALUE)") - ) - .arg( - Arg::new("pages") - .long("pages") - .value_name("RANGE") - .help("Page range to extract (1-based, comma-separated: 1-5,7,12-)") - ) - .arg( - Arg::new("json") - .long("json") - .value_name("PATH") - .action(ArgAction::Append) - .help("Output JSON to PATH (use '-' for stdout)") - ) - .arg( - Arg::new("md") - .long("md") - .value_name("PATH") - .action(ArgAction::Append) - .help("Output Markdown to PATH (use '-' for stdout)") - ) - .arg( - Arg::new("text") - .long("text") - .value_name("PATH") - .action(ArgAction::Append) - .help("Output plain text to PATH (use '-' for stdout)") - ) - .arg( - Arg::new("ndjson") - .long("ndjson") - .action(ArgAction::SetTrue) - .help("Output NDJSON to stdout (mutually exclusive with other formats)") - .conflicts_with_all(["json", "md", "text", "format"]) - ) - .arg( - Arg::new("format") - .long("format") - .value_name("FORMATS") - .value_delimiter(',') - .action(ArgAction::Append) - .help("Output formats (comma-separated: json,markdown,text,ndjson)") - ) - .arg( - Arg::new("output") - .short('o') - .long("output") - .value_name("BASE") - .help("Base path for auto-named outputs (used with --format)") - ) - .arg( - Arg::new("receipts") - .long("receipts") - .value_name("MODE") - .default_value("off") - .value_parser(["off", "lite", 
"svg"]) - .help("Receipt mode: off (default), lite, or svg") - ) - .arg( - Arg::new("ocr") - .long("ocr") - .action(ArgAction::SetTrue) - .help("Enable OCR for scanned pages (requires 'ocr' feature)") - ) - .arg( - Arg::new("ocr_language") - .long("ocr-language") - .value_name("LANGS") - .value_delimiter(',') - .action(ArgAction::Append) - .help("OCR language codes (comma-separated, e.g., 'eng,fra,deu')") - ) - .arg( - Arg::new("cache_dir") - .long("cache-dir") - .value_name("DIR") - .value_hint(ValueHint::DirPath) - .help("Enable cache at this directory (creates if absent)") - ) - .arg( - Arg::new("cache_size") - .long("cache-size") - .value_name("SIZE") - .default_value("1 GiB") - .help("Set cache size limit (default 1 GiB; accepts KiB, MiB, GiB suffixes)") - ) - .arg( - Arg::new("no_cache") - .long("no-cache") - .action(ArgAction::SetTrue) - .help("Disable cache for this extraction (even if --cache-dir is set)") - ) - .arg( - Arg::new("md_anchors") - .long("md-anchors") - .action(ArgAction::SetTrue) - .help("Emit HTML comment anchors before each block in Markdown output") - ) - .arg( - Arg::new("md_no_page_breaks") - .long("md-no-page-breaks") - .action(ArgAction::SetTrue) - .help("Suppress page-break horizontal rules between pages") - ) - .arg( - Arg::new("auto") - .long("auto") - .action(ArgAction::SetTrue) - .help("Auto-detect document type and apply appropriate profile") - ) - .arg( - Arg::new("profile") - .long("profile") - .value_name("NAME|PATH") - .help("Force-apply a specific profile (by name or YAML file path)") - ) - .arg( - Arg::new("include_headers") - .long("include-headers") - .action(ArgAction::SetTrue) - .help("Include header blocks in output") - ) - .arg( - Arg::new("include_footers") - .long("include-footers") - .action(ArgAction::SetTrue) - .help("Include footer blocks in output") - ) - .arg( - Arg::new("include_headers_footers") - .long("include-headers-footers") - .action(ArgAction::SetTrue) - .help("Include both header and footer blocks in 
output") - ) - .arg( - Arg::new("include_invisible_text") - .long("include-invisible-text") - .action(ArgAction::SetTrue) - .help("Include invisible text spans in output (rendering_mode == 3)") - ) - .arg( - Arg::new("include_hidden_layers") - .long("include-hidden-layers") - .action(ArgAction::SetTrue) - .help("Include hidden-layer text spans in output (OCG-controlled)") - ) - .arg( - Arg::new("include_watermarks") - .long("include-watermarks") - .action(ArgAction::SetTrue) - .help("Include watermark blocks in output (no-op until Phase 7)") - ) - ); - - // classify subcommand - cmd = cmd.subcommand( - Command::new("classify") - .about("Classify document type") - .long_about( - "Runs metadata + signal extraction to classify document type.\n\ - Not full text extraction - suitable for quick categorization." - ) - .arg( - Arg::new("input") - .help("Path to the PDF file") - .value_hint(ValueHint::FilePath) - .required(true) - ) - .arg( - Arg::new("password_stdin") - .long("password-stdin") - .help("Read password from stdin (one line, terminated by newline)") - .conflicts_with("password") - ) - .arg( - Arg::new("password") - .long("password") - .value_name("PASSWORD") - .help("PDF password (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_PASSWORD=1)") - .conflicts_with("password_stdin") - ) - .arg( - Arg::new("profiles") - .long("profiles") - .value_name("DIR") - .value_hint(ValueHint::DirPath) - .help("Directory containing custom profile YAML files") - ) - .arg( - Arg::new("pretty") - .long("pretty") - .action(ArgAction::SetTrue) - .help("Pretty-print JSON output") - ) - .arg( - Arg::new("top_k") - .long("top-k") - .value_name("N") - .default_value("0") - .help("Number of top reasons to include (default: all)") - ) - .arg( - Arg::new("exit_on_unknown") - .long("exit-on-unknown") - .action(ArgAction::SetTrue) - .help("Exit with code 1 if document type is unknown") - ) - ); - - // grep subcommand - cmd = cmd.subcommand( - Command::new("grep") - .about("Search for text 
patterns in PDF files") - .long_about( - "Search for text patterns with bounding-box results.\n\ - Requires the 'grep' feature flag." - ) - .arg( - Arg::new("pattern") - .help("Regular expression pattern to search for") - .required(true) - ) - .arg( - Arg::new("paths") - .help("PDF files or directories to search") - .value_hint(ValueHint::FilePath) - .action(ArgAction::Append) - .required(true) - ) - .arg( - Arg::new("context") - .short('C') - .long("context") - .value_name("LINES") - .default_value("0") - .help("Number of context lines to show") - ) - .arg( - Arg::new("ignore_case") - .short('i') - .long("ignore-case") - .action(ArgAction::SetTrue) - .help("Case-insensitive search") - ) - .arg( - Arg::new("json") - .long("json") - .action(ArgAction::SetTrue) - .help("Output results as JSON") - ) - ); - - // inspect subcommand - cmd = cmd.subcommand( - Command::new("inspect") - .about("Inspect a PDF file in a local web browser") - .long_about( - "Launch a local web server with debugging overlays for PDF inspection.\n\ - Provides visual feedback on extraction accuracy and layout analysis.\n\ - Requires the 'inspect' feature flag." 
- ) - .arg( - Arg::new("file") - .value_name("FILE") - .value_hint(ValueHint::FilePath) - .required(true) - .help("Path to the PDF file to inspect") - ) - .arg( - Arg::new("port") - .short('p') - .long("port") - .value_name("PORT") - .default_value("7676") - .help("Port to bind the inspector server (default: 7676)") - ) - .arg( - Arg::new("bind") - .short('b') - .long("bind") - .value_name("ADDR") - .default_value("127.0.0.1") - .help("Bind address for the inspector server (default: 127.0.0.1)") - ) - .arg( - Arg::new("auth_token") - .long("auth-token") - .value_name("TOKEN") - .help("Authentication token for non-loopback binds") - ) - .arg( - Arg::new("no_open") - .long("no-open") - .action(ArgAction::SetTrue) - .help("Suppress automatic browser launch") - ) - .arg( - Arg::new("compare") - .long("compare") - .value_name("FILE") - .value_hint(ValueHint::FilePath) - .help("Optional second PDF file for comparative debugging") - ) - .arg( - Arg::new("audit_log") - .long("audit-log") - .value_name("FILE") - .value_hint(ValueHint::FilePath) - .help("Write per-request audit log to FILE (NDJSON; use \"-\" for stdout)") - ) - ); - - // serve subcommand - cmd = cmd.subcommand( - Command::new("serve") - .about("Start the HTTP server for extraction") - .long_about( - "Start an HTTP server for PDF extraction via REST API.\n\n\ - **Security Model:** pdftract serve has no built-in authentication. \ - Deploy behind a reverse proxy (nginx, Traefik, Caddy) for production use.\n\n\ - **Endpoints:**\n\ - - POST /extract - Extract PDF and return JSON with metadata\n\ - - POST /extract/text - Extract PDF and return plain text\n\ - - POST /extract/stream - Extract PDF and return streaming NDJSON\n\ - - GET /health - Health check\n\n\ - Requires the 'serve' feature flag." 
- ) - .arg( - Arg::new("bind") - .short('b') - .long("bind") - .value_name("ADDR") - .default_value("127.0.0.1:8080") - .help("Bind address (e.g., \"127.0.0.1:8080\", \"[::1]:9000\", \"0.0.0.0:3000\")") - ) - .arg( - Arg::new("cache_dir") - .long("cache-dir") - .value_name("DIR") - .value_hint(ValueHint::DirPath) - .help("Enable cache at this directory") - ) - .arg( - Arg::new("cache_size") - .long("cache-size") - .value_name("SIZE") - .default_value("1 GiB") - .help("Set cache size limit (default 1 GiB; accepts KiB, MiB, GiB suffixes)") - ) - .arg( - Arg::new("no_cache") - .long("no-cache") - .action(ArgAction::SetTrue) - .help("Disable cache") - ) - .arg( - Arg::new("max_upload_mb") - .long("max-upload-mb") - .value_name("MB") - .default_value("256") - .help("Maximum request body size in MB (default: 256, max: 4096)") - ) - .arg( - Arg::new("max_decompress_gb") - .long("max-decompress-gb") - .value_name("GB") - .default_value("1") - .help("Maximum decompression size in GB (default: 1)") - ) - .arg( - Arg::new("audit_log") - .long("audit-log") - .value_name("FILE") - .value_hint(ValueHint::FilePath) - .help("Write per-request audit log to FILE (NDJSON; use \"-\" for stdout)") - ) - .arg( - Arg::new("trust_forwarded_for") - .long("trust-forwarded-for") - .action(ArgAction::SetTrue) - .help("Trust X-Forwarded-For header for client IP detection (DANGER: enables IP spoofing if not behind a trusted proxy)") - ) - .arg( - Arg::new("profile_dir") - .long("profile-dir") - .value_name("DIR") - .value_hint(ValueHint::DirPath) - .help("Directory containing custom profile YAML files (repeatable)") - ) - .arg( - Arg::new("profile_hot_reload") - .long("profile-hot-reload") - .action(ArgAction::SetTrue) - .help("Enable hot-reload for profiles (re-read directory on every request)") - ) - ); - - // mcp subcommand - cmd = cmd.subcommand( - Command::new("mcp") - .about("Start the MCP (Model Context Protocol) server") - .long_about( - "Start an MCP server for AI assistant 
integration.\n\n\ - Per ADR-006: stdio and HTTP transports are mutually exclusive.\n\ - Exactly one transport must be selected per invocation.\n\n\ - Requires the 'mcp' feature flag." - ) - .arg( - Arg::new("stdio") - .long("stdio") - .action(ArgAction::SetTrue) - .help("Use stdio transport (for Claude Desktop, Claude Code, Continue, Cursor)") - .conflicts_with("bind") - ) - .arg( - Arg::new("bind") - .short('b') - .long("bind") - .value_name("ADDR") - .help("Bind address for the MCP server (enables HTTP+SSE transport)") - .conflicts_with("stdio") - ) - .arg( - Arg::new("auth_token_file") - .long("auth-token-file") - .value_name("PATH") - .value_hint(ValueHint::FilePath) - .help("Path to a file containing the bearer token (RECOMMENDED)") - .conflicts_with("auth_token") - ) - .arg( - Arg::new("auth_token") - .long("auth-token") - .value_name("TOKEN") - .help("Bearer token for authentication (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_TOKEN=1)") - .conflicts_with("auth_token_file") - ) - .arg( - Arg::new("max_upload_mb") - .long("max-upload-mb") - .value_name("MB") - .default_value("256") - .help("Maximum request body size in MB (default: 256)") - ) - .arg( - Arg::new("root") - .long("root") - .value_name("DIR") - .value_hint(ValueHint::DirPath) - .help("Root directory for local filesystem access (enforces path-traversal protection)") - ) - .arg( - Arg::new("audit_log") - .long("audit-log") - .value_name("FILE") - .value_hint(ValueHint::FilePath) - .help("Write per-request audit log to FILE (NDJSON; use \"-\" for stdout)") - ) - ); - - // cache subcommand - let mut cache_cmd = Command::new("cache") - .about("Manage the extraction cache") - .long_about( - "Manage the content-addressed extraction cache.\n\ - Cache entries are stored by PDF hash and version constraint.\n\ - Requires the 'cache' feature flag." 
- ); - - cache_cmd = cache_cmd.subcommand( - Command::new("stats") - .about("Show cache statistics") - .arg( - Arg::new("dir") - .value_name("DIR") - .value_hint(ValueHint::DirPath) - .required(true) - .help("Path to the cache directory") - ) - .arg( - Arg::new("json") - .long("json") - .action(ArgAction::SetTrue) - .help("Output in JSON format") - ) - ); - - cache_cmd = cache_cmd.subcommand( - Command::new("clear") - .about("Clear all cache entries") - .long_about("Clear all cache entries (preserves index.json and sentinel)") - .arg( - Arg::new("dir") - .value_name("DIR") - .value_hint(ValueHint::DirPath) - .required(true) - .help("Path to the cache directory") - ) - .arg( - Arg::new("yes") - .short('y') - .long("yes") - .action(ArgAction::SetTrue) - .help("Skip confirmation prompt") - ) - ); - - cache_cmd = cache_cmd.subcommand( - Command::new("purge") - .about("Purge old cache entries") - .arg( - Arg::new("dir") - .value_name("DIR") - .value_hint(ValueHint::DirPath) - .required(true) - .help("Path to the cache directory") - ) - .arg( - Arg::new("older_than") - .long("older-than") - .value_name("DURATION") - .help("Delete entries older than this duration (e.g., \"30d\", \"7d\", \"1h\")") - ) - .arg( - Arg::new("version") - .long("version") - .value_name("CONSTRAINT") - .help("Delete entries matching this version constraint (e.g., \"<1.0.0\")") - ) - ); - - cmd = cmd.subcommand(cache_cmd); - - // profiles subcommand - let mut profiles_cmd = Command::new("profiles") - .about("Manage document type profiles") - .long_about( - "Manage document type profiles for classification and extraction tuning.\n\ - Requires the 'profiles' feature flag." 
- ); - - profiles_cmd = profiles_cmd.subcommand( - Command::new("list") - .about("List all available profiles") - ); - - profiles_cmd = profiles_cmd.subcommand( - Command::new("show") - .about("Show a profile's YAML content") - .arg( - Arg::new("name_or_path") - .value_name("NAME|PATH") - .required(true) - .help("Profile name or path to YAML file") - ) - ); - - profiles_cmd = profiles_cmd.subcommand( - Command::new("export") - .about("Export a built-in profile to stdout") - .arg( - Arg::new("name") - .value_name("NAME") - .required(true) - .help("Name of the built-in profile to export") - ) - ); - - profiles_cmd = profiles_cmd.subcommand( - Command::new("install") - .about("Install a profile to the user config directory") - .arg( - Arg::new("path") - .value_name("PATH") - .value_hint(ValueHint::FilePath) - .required(true) - .help("Path to the profile YAML file to install") - ) - ); - - profiles_cmd = profiles_cmd.subcommand( - Command::new("validate") - .about("Validate a profile file") - .arg( - Arg::new("path") - .value_name("PATH") - .value_hint(ValueHint::FilePath) - .required(true) - .help("Path to the profile YAML file to validate") - ) - ); - - cmd = cmd.subcommand(profiles_cmd); - - // doctor subcommand - cmd = cmd.subcommand( - Command::new("doctor") - .about("Check environment health and dependencies") - .long_about( - "Run environment health checks for pdftract dependencies and configuration.\n\n\ - Exit code policy:\n\ - - Exits 0 if no checks FAIL (WARN does not affect exit code)\n\ - - Exits 1 if any check FAILs\n\ - - Exits 2 on argument parse errors" - ) - .arg( - Arg::new("features") - .long("features") - .action(ArgAction::SetTrue) - .help("Print compiled features and exit") - ) - .arg( - Arg::new("json") - .long("json") - .action(ArgAction::SetTrue) - .help("Output results as JSON") - ) - .arg( - Arg::new("no_color") - .long("no-color") - .action(ArgAction::SetTrue) - .help("Disable colored output") - ) - .arg( - Arg::new("exit_on_fail") - 
.long("exit-on-fail") - .action(ArgAction::SetTrue) - .help("Explicit form of the default policy (exit 1 if any check FAILs)") - ) - .arg( - Arg::new("profile_dir") - .long("profile-dir") - .value_name("DIR") - .value_hint(ValueHint::DirPath) - .help("Verify the profile search path includes DIR") - ) - .arg( - Arg::new("cache_dir") - .long("cache-dir") - .value_name("DIR") - .value_hint(ValueHint::DirPath) - .help("Verify DIR is writable and has sufficient space") - ) - .arg( - Arg::new("lang") - .long("lang") - .value_name("LANGS") - .value_delimiter(',') - .action(ArgAction::Append) - .help("Requested OCR languages (default: eng)") - ) - ); - - // hash subcommand - cmd = cmd.subcommand( - Command::new("hash") - .about("Compute the PDF structural fingerprint") - .long_about( - "Compute a structural hash/fingerprint of a PDF file.\n\ - This hash is based on the PDF's structure (xref, trailers, object\n\ - locations) rather than content, making it useful for identifying\n\ - identical documents with different metadata." - ) - .arg( - Arg::new("input") - .value_name("PATH|URL") - .required(true) - .help("Path to the PDF file or URL") - ) - .arg( - Arg::new("password") - .long("password") - .value_name("PASSWORD") - .help("PDF password (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_PASSWORD=1)") - ) - .arg( - Arg::new("header") - .long("header") - .value_name("HEADER:VALUE") - .action(ArgAction::Append) - .help("Custom HTTP headers for remote sources (repeatable; format: HEADER:VALUE)") - ) - ); - - // verify-receipt subcommand - cmd = cmd.subcommand( - Command::new("verify-receipt") - .about("Verify a receipt against a PDF file") - .long_about( - "Verify a visual citation receipt against the original PDF.\n\ - Checks fingerprint, bbox IoU, and content hash.\n\ - Requires the 'receipts' feature flag." 
- ) - .arg( - Arg::new("pdf_path") - .value_name("FILE.pdf") - .value_hint(ValueHint::FilePath) - .required(true) - .help("Path to the PDF file to verify against") - ) - .arg( - Arg::new("receipt_path") - .value_name("RECEIPT.json") - .value_hint(ValueHint::FilePath) - .required(true) - .help("Path to the receipt JSON file, or \"-\" for stdin") - ) - .arg( - Arg::new("stdin") - .long("stdin") - .action(ArgAction::SetTrue) - .help("Read receipt from stdin (alternative to \"-\")") - .conflicts_with("receipt_path") - ) - .arg( - Arg::new("inline") - .long("inline") - .value_name("JSON") - .help("Receipt JSON as inline string (alternative to file path)") - .conflicts_with("receipt_path") - .conflicts_with("stdin") - ) - .arg( - Arg::new("json") - .long("json") - .action(ArgAction::SetTrue) - .help("Output machine-readable JSON result") - ) - .arg( - Arg::new("quiet") - .long("quiet") - .action(ArgAction::SetTrue) - .help("Suppress human-readable output (exit code only)") - .conflicts_with("json") - ) - .arg( - Arg::new("password") - .long("password") - .value_name("PASSWORD") - .help("PDF password (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_PASSWORD=1)") - ) - .arg( - Arg::new("password_stdin") - .long("password-stdin") - .action(ArgAction::SetTrue) - .help("Read password from stdin (one line, terminated by newline)") - .conflicts_with("password") - ) - ); - - // conformance subcommand - cmd = cmd.subcommand( - Command::new("conformance") - .about("Run SDK conformance test suite") - .arg( - Arg::new("suite") - .short('s') - .long("suite") - .value_name("PATH") - .value_hint(ValueHint::FilePath) - .default_value("tests/sdk-conformance/cases.json") - .help("Path to the conformance suite JSON") - ) - .arg( - Arg::new("sdk") - .short('k') - .long("sdk") - .value_name("NAME") - .default_value("pdftract") - .help("SDK name") - ) - .arg( - Arg::new("version") - .short('v') - .long("version") - .value_name("VERSION") - .default_value("0.1.0") - .help("SDK version") - ) - 
.arg( - Arg::new("output") - .short('o') - .long("output") - .value_name("PATH") - .value_hint(ValueHint::FilePath) - .default_value("conformance-report.json") - .help("Output report path") - ) - ); - - // compare subcommand - cmd = cmd.subcommand( - Command::new("compare") - .about("Compare actual results against expected values") - .long_about( - "Compare actual extraction results against expected values with tolerances.\n\ - Used for conformance testing and validation." - ) - .arg( - Arg::new("actual") - .value_name("PATH") - .value_hint(ValueHint::FilePath) - .required(true) - .help("Path to the actual results JSON") - ) - .arg( - Arg::new("expected") - .value_name("PATH") - .value_hint(ValueHint::FilePath) - .required(true) - .help("Path to the expected results JSON") - ) - .arg( - Arg::new("tolerances") - .short('t') - .long("tolerances") - .value_name("PATH") - .value_hint(ValueHint::FilePath) - .help("Path to the tolerances JSON (optional)") - ) - .arg( - Arg::new("format") - .short('f') - .long("format") - .value_name("FORMAT") - .default_value("text") - .help("Output format (text, json)") - ) - ); - - // sdk subcommand - let mut sdk_cmd = Command::new("sdk") - .about("SDK code generation commands"); - - sdk_cmd = sdk_cmd.subcommand( - Command::new("codegen") - .about("Generate SDK skeleton from templates") - .arg( - Arg::new("lang") - .short('l') - .long("lang") - .value_name("LANG") - .required(true) - .help("Target language") - ) - .arg( - Arg::new("out") - .short('o') - .long("out") - .value_name("DIR") - .value_hint(ValueHint::DirPath) - .required(true) - .help("Output directory") - ) - .arg( - Arg::new("version") - .short('v') - .long("version") - .value_name("VERSION") - .default_value("0.1.0") - .help("Version string (defaults to current pdftract version)") - ) - ); - - sdk_cmd = sdk_cmd.subcommand( - Command::new("validate") - .about("Validate existing SDK against current generator output") - .arg( - Arg::new("lang") - .short('l') - .long("lang") 
- .value_name("LANG") - .required(true) - .help("Target language") - ) - .arg( - Arg::new("sdk_dir") - .short('d') - .long("sdk-dir") - .value_name("DIR") - .value_hint(ValueHint::DirPath) - .required(true) - .help("Path to existing SDK directory") - ) - ); - - cmd = cmd.subcommand(sdk_cmd); - - // migrate-schema subcommand - cmd = cmd.subcommand( - Command::new("migrate-schema") - .about("Migrate JSON output between schema versions") - .long_about( - "Migrate JSON output between schema versions.\n\ - Converts JSON from one schema version to another." - ) - .arg( - Arg::new("from") - .long("from") - .value_name("VERSION") - .required(true) - .help("Source schema version (e.g., \"1.0\", \"1.1\")") - ) - .arg( - Arg::new("to") - .long("to") - .value_name("VERSION") - .required(true) - .help("Target schema version (e.g., \"1.0\", \"1.1\")") - ) - .arg( - Arg::new("input") - .value_name("FILE") - .default_value("-") - .help("Input JSON file (use '-' for stdin)") - ) - .arg( - Arg::new("output") - .short('o') - .long("output") - .value_name("FILE") - .default_value("-") - .help("Output JSON file (use '-' for stdout)") - ) - .arg( - Arg::new("pretty") - .short('p') - .long("pretty") - .action(ArgAction::SetTrue) - .help("Pretty-print output JSON") - ) - ); - - // list-diagnostics subcommand - cmd = cmd.subcommand( - Command::new("list-diagnostics") - .about("List all diagnostic codes with their metadata") - .long_about( - "List all diagnostic codes emitted during PDF parsing and extraction.\n\ - Each diagnostic includes severity, recoverable flag, phase origin,\n\ - and suggested action." 
- ) - ); - - // explain-diagnostic subcommand - cmd = cmd.subcommand( - Command::new("explain-diagnostic") - .about("Explain a specific diagnostic code in detail") - .arg( - Arg::new("code") - .value_name("CODE") - .required(true) - .help("Diagnostic code to explain (e.g., STRUCT_MISSING_KEY, STREAM_BOMB)") - ) - ); - - // Generate markdown using clap-markdown - // clap-markdown 0.1 uses a CommandFactory trait, so we need to capture stdout - let mut buffer = String::new(); - buffer.push_str("# CLI Reference\n\n"); - buffer.push_str("This page provides comprehensive documentation for all pdftract CLI commands and flags.\n\n"); - buffer.push_str("## Usage\n\n"); - buffer.push_str("```bash\npdftract [OPTIONS] \n```\n\n"); - buffer.push_str("## Global Options\n\n"); - buffer.push_str("These options are available across all subcommands:\n\n"); - buffer.push_str("- `-h, --help` - Print help information\n"); - buffer.push_str("- `-V, --version` - Print version information\n\n"); - buffer.push_str("## Commands\n\n"); - - // Use clap-markdown's CommandFactory API - // Since the cmd we built implements Command, we need to convert it - // clap-markdown 0.1 expects to call .command() on a CommandFactory type - // We'll manually generate the markdown for our custom command - - fn command_to_markdown(cmd: &Command, depth: usize) -> String { - let mut result = String::new(); - let indent = " ".repeat(depth * 2); - - // Command name and description - if depth == 0 { - result.push_str(&format!("### `{}`\n\n", cmd.get_name())); - } else { - result.push_str(&format!("{}#### `{}`\n\n", indent, cmd.get_name())); - } - - // About - if let Some(about) = cmd.get_about() { - result.push_str(&format!("{}\n\n", about)); - } - - // Long about - if let Some(long_about) = cmd.get_long_about() { - if let Some(about) = cmd.get_about() { - if long_about != about { - result.push_str(&format!("{}\n\n", long_about)); - } - } else { - result.push_str(&format!("{}\n\n", long_about)); - } - } - - // 
Usage - let mut usage = String::new(); - usage.push_str(&cmd.get_name()); - if let Some(subcommand) = cmd.get_subcommands().find(|s| s.get_name() == "help") { - // Skip help subcommand - } - result.push_str(&format!("**Usage:**\n\n```bash\npdftract {}\n```\n\n", usage)); - - // Arguments - let positional_args: Vec<_> = cmd.get_positionals() - .filter(|a| !a.is_hide_set()) - .collect(); - - if !positional_args.is_empty() { - result.push_str("**Arguments:**\n\n"); - for arg in positional_args { - result.push_str(&format!("- `<{}>`", arg.get_id())); - if let Some(help) = arg.get_help() { - result.push_str(&format!(" - {}", help)); - } - if arg.is_required_set() { - result.push_str(" (required)"); - } - result.push_str("\n"); - } - result.push_str("\n"); - } - - // Options - let options: Vec<_> = cmd.get_opts() - .filter(|o| !o.is_hide_set()) - .collect(); - - if !options.is_empty() { - result.push_str("**Options:**\n\n"); - for opt in options { - let mut names = Vec::new(); - if let Some(short) = opt.get_short() { - names.push(format!("-{}", short)); - } - if let Some(long) = opt.get_long() { - names.push(format!("--{}", long)); - } - result.push_str(&format!("- `{}`", names.join(", "))); - if let Some(value_name) = opt.get_value_names() { - result.push_str(&format!(" <{}>", value_name.join(" "))); - } - if let Some(help) = opt.get_help() { - result.push_str(&format!(" - {}", help)); - } - if let Some(default) = opt.get_default_values().first() { - result.push_str(&format!(" (default: `{}`)", default.to_string_lossy())); - } - result.push_str("\n"); - } - result.push_str("\n"); - } - - // Subcommands - let subcommands: Vec<_> = cmd.get_subcommands() - .filter(|s| !s.is_hide_set()) - .collect(); - - for subcmd in subcommands { - result.push_str(&command_to_markdown(subcmd, depth + 1)); - } - - result - } - - buffer.push_str(&command_to_markdown(&cmd, 0)); - - buffer -}