- Add worked example to Glyph struct showing all 11 fields - Add worked example to Span struct showing all 10 fields - Examples use rust,no_run for internal dependencies - cargo doc passes with docs.rs feature set - Verification note added at notes/pdftract-3eohy.md Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1104 lines
40 KiB
Rust
1104 lines
40 KiB
Rust
//! Generate CLI Reference documentation using clap-markdown.
|
|
//!
|
|
//! This binary generates the canonical CLI Reference documentation for pdftract,
|
|
//! which is checked into the repository at docs/user-docs/src/cli-reference.md.
|
|
//!
|
|
//! Usage: cargo run --manifest-path=xtask/Cargo.toml --bin gen_cli_reference
|
|
|
|
use std::fs;
|
|
use std::path::PathBuf;
|
|
|
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
// Find the workspace root
|
|
let workspace_root = find_workspace_root();
|
|
|
|
// Generate the CLI reference markdown
|
|
let cli_reference_md = generate_cli_reference();
|
|
|
|
// Write to docs/user-docs/src/cli-reference.md
|
|
let cli_ref_path = workspace_root.join("docs/user-docs/src/cli-reference.md");
|
|
|
|
// Create the directory if it doesn't exist
|
|
if let Some(parent) = cli_ref_path.parent() {
|
|
fs::create_dir_all(parent)?;
|
|
}
|
|
|
|
fs::write(&cli_ref_path, cli_reference_md)?;
|
|
|
|
println!("Generated CLI reference at: {}", cli_ref_path.display());
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Locate the workspace root directory.
///
/// Starting from the current working directory (or from its parent when the
/// working directory is named `xtask`), walks upward and returns the first
/// directory whose `Cargo.toml` contains the text `[workspace]`.
///
/// # Panics
///
/// Panics if the current directory cannot be determined, or if no such
/// `Cargo.toml` is found before reaching the filesystem root.
fn find_workspace_root() -> PathBuf {
    let cwd = std::env::current_dir().unwrap();

    // When launched from inside `xtask`, begin the search one level up so the
    // xtask crate's own manifest is never considered.
    let start = if cwd.ends_with("xtask") {
        cwd.parent().unwrap().to_path_buf()
    } else {
        cwd
    };

    // `ancestors()` yields the starting directory, then each parent in turn.
    let root = start.ancestors().find(|dir| {
        let manifest = dir.join("Cargo.toml");
        // An unreadable manifest is treated like one without a workspace table.
        manifest.exists()
            && fs::read_to_string(&manifest)
                .unwrap_or_default()
                .contains("[workspace]")
    });

    match root {
        Some(dir) => dir.to_path_buf(),
        None => panic!("Could not find workspace root"),
    }
}
|
|
|
|
/// Generate CLI reference markdown using clap-markdown.
|
|
///
|
|
/// This function creates a minimal clap Command that matches the pdftract CLI
|
|
/// structure and generates comprehensive markdown documentation.
|
|
fn generate_cli_reference() -> String {
|
|
use clap::{Command, Arg, ArgAction, ValueHint};
|
|
|
|
let mut cmd = Command::new("pdftract")
|
|
.about("pdftract CLI - PDF extraction and conformance testing")
|
|
.long_about(
|
|
"pdftract is a command-line tool for extracting text and structure from PDF files.\n\
|
|
It supports JSON, Markdown, plain text, and NDJSON output formats, with\n\
|
|
advanced features like OCR, document classification, and conformance testing."
|
|
)
|
|
.version(env!("CARGO_PKG_VERSION"))
|
|
.arg(
|
|
Arg::new("help")
|
|
.short('h')
|
|
.long("help")
|
|
.action(ArgAction::Help)
|
|
.global(true)
|
|
.help("Print help information")
|
|
)
|
|
.arg(
|
|
Arg::new("version")
|
|
.short('V')
|
|
.long("version")
|
|
.action(ArgAction::Version)
|
|
.global(true)
|
|
.help("Print version information")
|
|
);
|
|
|
|
// extract subcommand
|
|
cmd = cmd.subcommand(
|
|
Command::new("extract")
|
|
.about("Extract text and structure from a PDF file")
|
|
.long_about(
|
|
"Extract content from PDF files in multiple formats.\n\
|
|
Supports local files, remote URLs, and stdin input."
|
|
)
|
|
.arg(
|
|
Arg::new("input")
|
|
.help("Path to the PDF file (use '-' for stdin)")
|
|
.value_hint(ValueHint::FilePath)
|
|
.required(true)
|
|
)
|
|
.arg(
|
|
Arg::new("password_stdin")
|
|
.long("password-stdin")
|
|
.help("Read password from stdin (one line, terminated by newline)")
|
|
.conflicts_with("password")
|
|
)
|
|
.arg(
|
|
Arg::new("password")
|
|
.long("password")
|
|
.value_name("PASSWORD")
|
|
.help("PDF password (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_PASSWORD=1)")
|
|
.conflicts_with("password_stdin")
|
|
)
|
|
.arg(
|
|
Arg::new("header")
|
|
.long("header")
|
|
.value_name("HEADER:VALUE")
|
|
.action(ArgAction::Append)
|
|
.help("Custom HTTP headers for remote sources (repeatable; format: HEADER:VALUE)")
|
|
)
|
|
.arg(
|
|
Arg::new("pages")
|
|
.long("pages")
|
|
.value_name("RANGE")
|
|
.help("Page range to extract (1-based, comma-separated: 1-5,7,12-)")
|
|
)
|
|
.arg(
|
|
Arg::new("json")
|
|
.long("json")
|
|
.value_name("PATH")
|
|
.action(ArgAction::Append)
|
|
.help("Output JSON to PATH (use '-' for stdout)")
|
|
)
|
|
.arg(
|
|
Arg::new("md")
|
|
.long("md")
|
|
.value_name("PATH")
|
|
.action(ArgAction::Append)
|
|
.help("Output Markdown to PATH (use '-' for stdout)")
|
|
)
|
|
.arg(
|
|
Arg::new("text")
|
|
.long("text")
|
|
.value_name("PATH")
|
|
.action(ArgAction::Append)
|
|
.help("Output plain text to PATH (use '-' for stdout)")
|
|
)
|
|
.arg(
|
|
Arg::new("ndjson")
|
|
.long("ndjson")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Output NDJSON to stdout (mutually exclusive with other formats)")
|
|
.conflicts_with_all(["json", "md", "text", "format"])
|
|
)
|
|
.arg(
|
|
Arg::new("format")
|
|
.long("format")
|
|
.value_name("FORMATS")
|
|
.value_delimiter(',')
|
|
.action(ArgAction::Append)
|
|
.help("Output formats (comma-separated: json,markdown,text,ndjson)")
|
|
)
|
|
.arg(
|
|
Arg::new("output")
|
|
.short('o')
|
|
.long("output")
|
|
.value_name("BASE")
|
|
.help("Base path for auto-named outputs (used with --format)")
|
|
)
|
|
.arg(
|
|
Arg::new("receipts")
|
|
.long("receipts")
|
|
.value_name("MODE")
|
|
.default_value("off")
|
|
.value_parser(["off", "lite", "svg"])
|
|
.help("Receipt mode: off (default), lite, or svg")
|
|
)
|
|
.arg(
|
|
Arg::new("ocr")
|
|
.long("ocr")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Enable OCR for scanned pages (requires 'ocr' feature)")
|
|
)
|
|
.arg(
|
|
Arg::new("ocr_language")
|
|
.long("ocr-language")
|
|
.value_name("LANGS")
|
|
.value_delimiter(',')
|
|
.action(ArgAction::Append)
|
|
.help("OCR language codes (comma-separated, e.g., 'eng,fra,deu')")
|
|
)
|
|
.arg(
|
|
Arg::new("cache_dir")
|
|
.long("cache-dir")
|
|
.value_name("DIR")
|
|
.value_hint(ValueHint::DirPath)
|
|
.help("Enable cache at this directory (creates if absent)")
|
|
)
|
|
.arg(
|
|
Arg::new("cache_size")
|
|
.long("cache-size")
|
|
.value_name("SIZE")
|
|
.default_value("1 GiB")
|
|
.help("Set cache size limit (default 1 GiB; accepts KiB, MiB, GiB suffixes)")
|
|
)
|
|
.arg(
|
|
Arg::new("no_cache")
|
|
.long("no-cache")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Disable cache for this extraction (even if --cache-dir is set)")
|
|
)
|
|
.arg(
|
|
Arg::new("md_anchors")
|
|
.long("md-anchors")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Emit HTML comment anchors before each block in Markdown output")
|
|
)
|
|
.arg(
|
|
Arg::new("auto")
|
|
.long("auto")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Auto-detect document type and apply appropriate profile")
|
|
)
|
|
.arg(
|
|
Arg::new("profile")
|
|
.long("profile")
|
|
.value_name("NAME|PATH")
|
|
.help("Force-apply a specific profile (by name or YAML file path)")
|
|
)
|
|
.arg(
|
|
Arg::new("include_headers")
|
|
.long("include-headers")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Include header blocks in output")
|
|
)
|
|
.arg(
|
|
Arg::new("include_footers")
|
|
.long("include-footers")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Include footer blocks in output")
|
|
)
|
|
.arg(
|
|
Arg::new("include_headers_footers")
|
|
.long("include-headers-footers")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Include both header and footer blocks in output")
|
|
)
|
|
.arg(
|
|
Arg::new("include_invisible_text")
|
|
.long("include-invisible-text")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Include invisible text spans in output (rendering_mode == 3)")
|
|
)
|
|
.arg(
|
|
Arg::new("include_hidden_layers")
|
|
.long("include-hidden-layers")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Include hidden-layer text spans in output (OCG-controlled)")
|
|
)
|
|
.arg(
|
|
Arg::new("include_watermarks")
|
|
.long("include-watermarks")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Include watermark blocks in output (no-op until Phase 7)")
|
|
)
|
|
);
|
|
|
|
// classify subcommand
|
|
cmd = cmd.subcommand(
|
|
Command::new("classify")
|
|
.about("Classify document type")
|
|
.long_about(
|
|
"Runs metadata + signal extraction to classify document type.\n\
|
|
Not full text extraction - suitable for quick categorization."
|
|
)
|
|
.arg(
|
|
Arg::new("input")
|
|
.help("Path to the PDF file")
|
|
.value_hint(ValueHint::FilePath)
|
|
.required(true)
|
|
)
|
|
.arg(
|
|
Arg::new("password_stdin")
|
|
.long("password-stdin")
|
|
.help("Read password from stdin (one line, terminated by newline)")
|
|
.conflicts_with("password")
|
|
)
|
|
.arg(
|
|
Arg::new("password")
|
|
.long("password")
|
|
.value_name("PASSWORD")
|
|
.help("PDF password (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_PASSWORD=1)")
|
|
.conflicts_with("password_stdin")
|
|
)
|
|
.arg(
|
|
Arg::new("profiles")
|
|
.long("profiles")
|
|
.value_name("DIR")
|
|
.value_hint(ValueHint::DirPath)
|
|
.help("Directory containing custom profile YAML files")
|
|
)
|
|
.arg(
|
|
Arg::new("pretty")
|
|
.long("pretty")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Pretty-print JSON output")
|
|
)
|
|
.arg(
|
|
Arg::new("top_k")
|
|
.long("top-k")
|
|
.value_name("N")
|
|
.default_value("0")
|
|
.help("Number of top reasons to include (default: all)")
|
|
)
|
|
.arg(
|
|
Arg::new("exit_on_unknown")
|
|
.long("exit-on-unknown")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Exit with code 1 if document type is unknown")
|
|
)
|
|
);
|
|
|
|
// grep subcommand
|
|
cmd = cmd.subcommand(
|
|
Command::new("grep")
|
|
.about("Search for text patterns in PDF files")
|
|
.long_about(
|
|
"Search for text patterns with bounding-box results.\n\
|
|
Requires the 'grep' feature flag."
|
|
)
|
|
.arg(
|
|
Arg::new("pattern")
|
|
.help("Regular expression pattern to search for")
|
|
.required(true)
|
|
)
|
|
.arg(
|
|
Arg::new("paths")
|
|
.help("PDF files or directories to search")
|
|
.value_hint(ValueHint::FilePath)
|
|
.action(ArgAction::Append)
|
|
.required(true)
|
|
)
|
|
.arg(
|
|
Arg::new("context")
|
|
.short('C')
|
|
.long("context")
|
|
.value_name("LINES")
|
|
.default_value("0")
|
|
.help("Number of context lines to show")
|
|
)
|
|
.arg(
|
|
Arg::new("ignore_case")
|
|
.short('i')
|
|
.long("ignore-case")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Case-insensitive search")
|
|
)
|
|
.arg(
|
|
Arg::new("json")
|
|
.long("json")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Output results as JSON")
|
|
)
|
|
);
|
|
|
|
// inspect subcommand
|
|
cmd = cmd.subcommand(
|
|
Command::new("inspect")
|
|
.about("Inspect a PDF file in a local web browser")
|
|
.long_about(
|
|
"Launch a local web server with debugging overlays for PDF inspection.\n\
|
|
Provides visual feedback on extraction accuracy and layout analysis.\n\
|
|
Requires the 'inspect' feature flag."
|
|
)
|
|
.arg(
|
|
Arg::new("input")
|
|
.help("Path to the PDF file")
|
|
.value_hint(ValueHint::FilePath)
|
|
.required(true)
|
|
)
|
|
.arg(
|
|
Arg::new("bind")
|
|
.short('b')
|
|
.long("bind")
|
|
.value_name("ADDR")
|
|
.default_value("127.0.0.1:0")
|
|
.help("Bind address for the inspector server (use 0.0.0.0:0 for accessibility from other devices)")
|
|
)
|
|
.arg(
|
|
Arg::new("password")
|
|
.long("password")
|
|
.value_name("PASSWORD")
|
|
.help("PDF password (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_PASSWORD=1)")
|
|
)
|
|
.arg(
|
|
Arg::new("ocr")
|
|
.long("ocr")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Enable OCR for scanned pages (requires 'ocr' feature)")
|
|
)
|
|
.arg(
|
|
Arg::new("no_browser")
|
|
.long("no-browser")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Don't automatically open browser")
|
|
)
|
|
);
|
|
|
|
// serve subcommand
|
|
cmd = cmd.subcommand(
|
|
Command::new("serve")
|
|
.about("Start the HTTP server for extraction")
|
|
.long_about(
|
|
"Start an HTTP server for PDF extraction via REST API.\n\n\
|
|
**Security Model:** pdftract serve has no built-in authentication. \
|
|
Deploy behind a reverse proxy (nginx, Traefik, Caddy) for production use.\n\n\
|
|
**Endpoints:**\n\
|
|
- POST /extract - Extract PDF and return JSON with metadata\n\
|
|
- POST /extract/text - Extract PDF and return plain text\n\
|
|
- POST /extract/stream - Extract PDF and return streaming NDJSON\n\
|
|
- GET /health - Health check\n\n\
|
|
Requires the 'serve' feature flag."
|
|
)
|
|
.arg(
|
|
Arg::new("bind")
|
|
.short('b')
|
|
.long("bind")
|
|
.value_name("ADDR")
|
|
.default_value("127.0.0.1:8080")
|
|
.help("Bind address (e.g., \"127.0.0.1:8080\", \"[::1]:9000\", \"0.0.0.0:3000\")")
|
|
)
|
|
.arg(
|
|
Arg::new("cache_dir")
|
|
.long("cache-dir")
|
|
.value_name("DIR")
|
|
.value_hint(ValueHint::DirPath)
|
|
.help("Enable cache at this directory")
|
|
)
|
|
.arg(
|
|
Arg::new("cache_size")
|
|
.long("cache-size")
|
|
.value_name("SIZE")
|
|
.default_value("1 GiB")
|
|
.help("Set cache size limit (default 1 GiB; accepts KiB, MiB, GiB suffixes)")
|
|
)
|
|
.arg(
|
|
Arg::new("no_cache")
|
|
.long("no-cache")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Disable cache")
|
|
)
|
|
.arg(
|
|
Arg::new("max_upload_mb")
|
|
.long("max-upload-mb")
|
|
.value_name("MB")
|
|
.default_value("256")
|
|
.help("Maximum request body size in MB (default: 256, max: 4096)")
|
|
)
|
|
.arg(
|
|
Arg::new("max_decompress_gb")
|
|
.long("max-decompress-gb")
|
|
.value_name("GB")
|
|
.default_value("1")
|
|
.help("Maximum decompression size in GB (default: 1)")
|
|
)
|
|
.arg(
|
|
Arg::new("audit_log")
|
|
.long("audit-log")
|
|
.value_name("FILE")
|
|
.value_hint(ValueHint::FilePath)
|
|
.help("Write per-request audit log to FILE (NDJSON; use \"-\" for stdout)")
|
|
)
|
|
.arg(
|
|
Arg::new("trust_forwarded_for")
|
|
.long("trust-forwarded-for")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Trust X-Forwarded-For header for client IP detection (DANGER: enables IP spoofing if not behind a trusted proxy)")
|
|
)
|
|
.arg(
|
|
Arg::new("profile_dir")
|
|
.long("profile-dir")
|
|
.value_name("DIR")
|
|
.value_hint(ValueHint::DirPath)
|
|
.help("Directory containing custom profile YAML files (repeatable)")
|
|
)
|
|
.arg(
|
|
Arg::new("profile_hot_reload")
|
|
.long("profile-hot-reload")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Enable hot-reload for profiles (re-read directory on every request)")
|
|
)
|
|
);
|
|
|
|
// mcp subcommand
|
|
cmd = cmd.subcommand(
|
|
Command::new("mcp")
|
|
.about("Start the MCP (Model Context Protocol) server")
|
|
.long_about(
|
|
"Start an MCP server for AI assistant integration.\n\n\
|
|
Per ADR-006: stdio and HTTP transports are mutually exclusive.\n\
|
|
Exactly one transport must be selected per invocation.\n\n\
|
|
Requires the 'mcp' feature flag."
|
|
)
|
|
.arg(
|
|
Arg::new("stdio")
|
|
.long("stdio")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Use stdio transport (for Claude Desktop, Claude Code, Continue, Cursor)")
|
|
.conflicts_with("bind")
|
|
)
|
|
.arg(
|
|
Arg::new("bind")
|
|
.short('b')
|
|
.long("bind")
|
|
.value_name("ADDR")
|
|
.help("Bind address for the MCP server (enables HTTP+SSE transport)")
|
|
.conflicts_with("stdio")
|
|
)
|
|
.arg(
|
|
Arg::new("auth_token_file")
|
|
.long("auth-token-file")
|
|
.value_name("PATH")
|
|
.value_hint(ValueHint::FilePath)
|
|
.help("Path to a file containing the bearer token (RECOMMENDED)")
|
|
.conflicts_with("auth_token")
|
|
)
|
|
.arg(
|
|
Arg::new("auth_token")
|
|
.long("auth-token")
|
|
.value_name("TOKEN")
|
|
.help("Bearer token for authentication (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_TOKEN=1)")
|
|
.conflicts_with("auth_token_file")
|
|
)
|
|
.arg(
|
|
Arg::new("max_upload_mb")
|
|
.long("max-upload-mb")
|
|
.value_name("MB")
|
|
.default_value("256")
|
|
.help("Maximum request body size in MB (default: 256)")
|
|
)
|
|
.arg(
|
|
Arg::new("root")
|
|
.long("root")
|
|
.value_name("DIR")
|
|
.value_hint(ValueHint::DirPath)
|
|
.help("Root directory for local filesystem access (enforces path-traversal protection)")
|
|
)
|
|
.arg(
|
|
Arg::new("audit_log")
|
|
.long("audit-log")
|
|
.value_name("FILE")
|
|
.value_hint(ValueHint::FilePath)
|
|
.help("Write per-request audit log to FILE (NDJSON; use \"-\" for stdout)")
|
|
)
|
|
);
|
|
|
|
// cache subcommand
|
|
let mut cache_cmd = Command::new("cache")
|
|
.about("Manage the extraction cache")
|
|
.long_about(
|
|
"Manage the content-addressed extraction cache.\n\
|
|
Cache entries are stored by PDF hash and version constraint.\n\
|
|
Requires the 'cache' feature flag."
|
|
);
|
|
|
|
cache_cmd = cache_cmd.subcommand(
|
|
Command::new("stats")
|
|
.about("Show cache statistics")
|
|
.arg(
|
|
Arg::new("dir")
|
|
.value_name("DIR")
|
|
.value_hint(ValueHint::DirPath)
|
|
.required(true)
|
|
.help("Path to the cache directory")
|
|
)
|
|
.arg(
|
|
Arg::new("json")
|
|
.long("json")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Output in JSON format")
|
|
)
|
|
);
|
|
|
|
cache_cmd = cache_cmd.subcommand(
|
|
Command::new("clear")
|
|
.about("Clear all cache entries")
|
|
.long_about("Clear all cache entries (preserves index.json and sentinel)")
|
|
.arg(
|
|
Arg::new("dir")
|
|
.value_name("DIR")
|
|
.value_hint(ValueHint::DirPath)
|
|
.required(true)
|
|
.help("Path to the cache directory")
|
|
)
|
|
.arg(
|
|
Arg::new("yes")
|
|
.short('y')
|
|
.long("yes")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Skip confirmation prompt")
|
|
)
|
|
);
|
|
|
|
cache_cmd = cache_cmd.subcommand(
|
|
Command::new("purge")
|
|
.about("Purge old cache entries")
|
|
.arg(
|
|
Arg::new("dir")
|
|
.value_name("DIR")
|
|
.value_hint(ValueHint::DirPath)
|
|
.required(true)
|
|
.help("Path to the cache directory")
|
|
)
|
|
.arg(
|
|
Arg::new("older_than")
|
|
.long("older-than")
|
|
.value_name("DURATION")
|
|
.help("Delete entries older than this duration (e.g., \"30d\", \"7d\", \"1h\")")
|
|
)
|
|
.arg(
|
|
Arg::new("version")
|
|
.long("version")
|
|
.value_name("CONSTRAINT")
|
|
.help("Delete entries matching this version constraint (e.g., \"<1.0.0\")")
|
|
)
|
|
);
|
|
|
|
cmd = cmd.subcommand(cache_cmd);
|
|
|
|
// profiles subcommand
|
|
let mut profiles_cmd = Command::new("profiles")
|
|
.about("Manage document type profiles")
|
|
.long_about(
|
|
"Manage document type profiles for classification and extraction tuning.\n\
|
|
Requires the 'profiles' feature flag."
|
|
);
|
|
|
|
profiles_cmd = profiles_cmd.subcommand(
|
|
Command::new("list")
|
|
.about("List all available profiles")
|
|
);
|
|
|
|
profiles_cmd = profiles_cmd.subcommand(
|
|
Command::new("show")
|
|
.about("Show a profile's YAML content")
|
|
.arg(
|
|
Arg::new("name_or_path")
|
|
.value_name("NAME|PATH")
|
|
.required(true)
|
|
.help("Profile name or path to YAML file")
|
|
)
|
|
);
|
|
|
|
profiles_cmd = profiles_cmd.subcommand(
|
|
Command::new("export")
|
|
.about("Export a built-in profile to stdout")
|
|
.arg(
|
|
Arg::new("name")
|
|
.value_name("NAME")
|
|
.required(true)
|
|
.help("Name of the built-in profile to export")
|
|
)
|
|
);
|
|
|
|
profiles_cmd = profiles_cmd.subcommand(
|
|
Command::new("install")
|
|
.about("Install a profile to the user config directory")
|
|
.arg(
|
|
Arg::new("path")
|
|
.value_name("PATH")
|
|
.value_hint(ValueHint::FilePath)
|
|
.required(true)
|
|
.help("Path to the profile YAML file to install")
|
|
)
|
|
);
|
|
|
|
profiles_cmd = profiles_cmd.subcommand(
|
|
Command::new("validate")
|
|
.about("Validate a profile file")
|
|
.arg(
|
|
Arg::new("path")
|
|
.value_name("PATH")
|
|
.value_hint(ValueHint::FilePath)
|
|
.required(true)
|
|
.help("Path to the profile YAML file to validate")
|
|
)
|
|
);
|
|
|
|
cmd = cmd.subcommand(profiles_cmd);
|
|
|
|
// doctor subcommand
|
|
cmd = cmd.subcommand(
|
|
Command::new("doctor")
|
|
.about("Check environment health and dependencies")
|
|
.long_about(
|
|
"Run environment health checks for pdftract dependencies and configuration.\n\n\
|
|
Exit code policy:\n\
|
|
- Exits 0 if no checks FAIL (WARN does not affect exit code)\n\
|
|
- Exits 1 if any check FAILs\n\
|
|
- Exits 2 on argument parse errors"
|
|
)
|
|
.arg(
|
|
Arg::new("features")
|
|
.long("features")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Print compiled features and exit")
|
|
)
|
|
.arg(
|
|
Arg::new("json")
|
|
.long("json")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Output results as JSON")
|
|
)
|
|
.arg(
|
|
Arg::new("no_color")
|
|
.long("no-color")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Disable colored output")
|
|
)
|
|
.arg(
|
|
Arg::new("exit_on_fail")
|
|
.long("exit-on-fail")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Explicit form of the default policy (exit 1 if any check FAILs)")
|
|
)
|
|
.arg(
|
|
Arg::new("profile_dir")
|
|
.long("profile-dir")
|
|
.value_name("DIR")
|
|
.value_hint(ValueHint::DirPath)
|
|
.help("Verify the profile search path includes DIR")
|
|
)
|
|
.arg(
|
|
Arg::new("cache_dir")
|
|
.long("cache-dir")
|
|
.value_name("DIR")
|
|
.value_hint(ValueHint::DirPath)
|
|
.help("Verify DIR is writable and has sufficient space")
|
|
)
|
|
.arg(
|
|
Arg::new("lang")
|
|
.long("lang")
|
|
.value_name("LANGS")
|
|
.value_delimiter(',')
|
|
.action(ArgAction::Append)
|
|
.help("Requested OCR languages (default: eng)")
|
|
)
|
|
);
|
|
|
|
// hash subcommand
|
|
cmd = cmd.subcommand(
|
|
Command::new("hash")
|
|
.about("Compute the PDF structural fingerprint")
|
|
.long_about(
|
|
"Compute a structural hash/fingerprint of a PDF file.\n\
|
|
This hash is based on the PDF's structure (xref, trailers, object\n\
|
|
locations) rather than content, making it useful for identifying\n\
|
|
identical documents with different metadata."
|
|
)
|
|
.arg(
|
|
Arg::new("input")
|
|
.value_name("PATH|URL")
|
|
.required(true)
|
|
.help("Path to the PDF file or URL")
|
|
)
|
|
.arg(
|
|
Arg::new("password")
|
|
.long("password")
|
|
.value_name("PASSWORD")
|
|
.help("PDF password (INSECURE: rejected unless PDFTRACT_INSECURE_CLI_PASSWORD=1)")
|
|
)
|
|
.arg(
|
|
Arg::new("header")
|
|
.long("header")
|
|
.value_name("HEADER:VALUE")
|
|
.action(ArgAction::Append)
|
|
.help("Custom HTTP headers for remote sources (repeatable; format: HEADER:VALUE)")
|
|
)
|
|
);
|
|
|
|
// verify-receipt subcommand
|
|
cmd = cmd.subcommand(
|
|
Command::new("verify-receipt")
|
|
.about("Verify a receipt against a PDF file")
|
|
.long_about(
|
|
"Verify a visual citation receipt against the original PDF.\n\
|
|
Checks that quoted text appears at the expected locations.\n\
|
|
Requires the 'receipts' feature flag."
|
|
)
|
|
.arg(
|
|
Arg::new("receipt")
|
|
.value_name("PATH")
|
|
.value_hint(ValueHint::FilePath)
|
|
.required(true)
|
|
.help("Path to the receipt JSON file")
|
|
)
|
|
.arg(
|
|
Arg::new("pdf")
|
|
.long("pdf")
|
|
.value_name("PATH")
|
|
.value_hint(ValueHint::FilePath)
|
|
.required(true)
|
|
.help("Path to the original PDF file")
|
|
)
|
|
.arg(
|
|
Arg::new("tolerance")
|
|
.long("tolerance")
|
|
.value_name("PIXELS")
|
|
.default_value("10")
|
|
.help("Tolerance for bounding box matching in pixels")
|
|
)
|
|
.arg(
|
|
Arg::new("json")
|
|
.long("json")
|
|
.action(ArgAction::SetTrue)
|
|
.help("Output results as JSON")
|
|
)
|
|
);
|
|
|
|
// conformance subcommand
|
|
cmd = cmd.subcommand(
|
|
Command::new("conformance")
|
|
.about("Run SDK conformance test suite")
|
|
.arg(
|
|
Arg::new("suite")
|
|
.short('s')
|
|
.long("suite")
|
|
.value_name("PATH")
|
|
.value_hint(ValueHint::FilePath)
|
|
.default_value("tests/sdk-conformance/cases.json")
|
|
.help("Path to the conformance suite JSON")
|
|
)
|
|
.arg(
|
|
Arg::new("sdk")
|
|
.short('k')
|
|
.long("sdk")
|
|
.value_name("NAME")
|
|
.default_value("pdftract")
|
|
.help("SDK name")
|
|
)
|
|
.arg(
|
|
Arg::new("version")
|
|
.short('v')
|
|
.long("version")
|
|
.value_name("VERSION")
|
|
.default_value("0.1.0")
|
|
.help("SDK version")
|
|
)
|
|
.arg(
|
|
Arg::new("output")
|
|
.short('o')
|
|
.long("output")
|
|
.value_name("PATH")
|
|
.value_hint(ValueHint::FilePath)
|
|
.default_value("conformance-report.json")
|
|
.help("Output report path")
|
|
)
|
|
);
|
|
|
|
// compare subcommand
|
|
cmd = cmd.subcommand(
|
|
Command::new("compare")
|
|
.about("Compare actual results against expected values")
|
|
.long_about(
|
|
"Compare actual extraction results against expected values with tolerances.\n\
|
|
Used for conformance testing and validation."
|
|
)
|
|
.arg(
|
|
Arg::new("actual")
|
|
.value_name("PATH")
|
|
.value_hint(ValueHint::FilePath)
|
|
.required(true)
|
|
.help("Path to the actual results JSON")
|
|
)
|
|
.arg(
|
|
Arg::new("expected")
|
|
.value_name("PATH")
|
|
.value_hint(ValueHint::FilePath)
|
|
.required(true)
|
|
.help("Path to the expected results JSON")
|
|
)
|
|
.arg(
|
|
Arg::new("tolerances")
|
|
.short('t')
|
|
.long("tolerances")
|
|
.value_name("PATH")
|
|
.value_hint(ValueHint::FilePath)
|
|
.help("Path to the tolerances JSON (optional)")
|
|
)
|
|
.arg(
|
|
Arg::new("format")
|
|
.short('f')
|
|
.long("format")
|
|
.value_name("FORMAT")
|
|
.default_value("text")
|
|
.help("Output format (text, json)")
|
|
)
|
|
);
|
|
|
|
// sdk subcommand
|
|
let mut sdk_cmd = Command::new("sdk")
|
|
.about("SDK code generation commands");
|
|
|
|
sdk_cmd = sdk_cmd.subcommand(
|
|
Command::new("codegen")
|
|
.about("Generate SDK skeleton from templates")
|
|
.arg(
|
|
Arg::new("lang")
|
|
.short('l')
|
|
.long("lang")
|
|
.value_name("LANG")
|
|
.required(true)
|
|
.help("Target language")
|
|
)
|
|
.arg(
|
|
Arg::new("out")
|
|
.short('o')
|
|
.long("out")
|
|
.value_name("DIR")
|
|
.value_hint(ValueHint::DirPath)
|
|
.required(true)
|
|
.help("Output directory")
|
|
)
|
|
.arg(
|
|
Arg::new("version")
|
|
.short('v')
|
|
.long("version")
|
|
.value_name("VERSION")
|
|
.default_value("0.1.0")
|
|
.help("Version string (defaults to current pdftract version)")
|
|
)
|
|
);
|
|
|
|
sdk_cmd = sdk_cmd.subcommand(
|
|
Command::new("validate")
|
|
.about("Validate existing SDK against current generator output")
|
|
.arg(
|
|
Arg::new("lang")
|
|
.short('l')
|
|
.long("lang")
|
|
.value_name("LANG")
|
|
.required(true)
|
|
.help("Target language")
|
|
)
|
|
.arg(
|
|
Arg::new("sdk_dir")
|
|
.short('d')
|
|
.long("sdk-dir")
|
|
.value_name("DIR")
|
|
.value_hint(ValueHint::DirPath)
|
|
.required(true)
|
|
.help("Path to existing SDK directory")
|
|
)
|
|
);
|
|
|
|
cmd = cmd.subcommand(sdk_cmd);
|
|
|
|
// list-diagnostics subcommand
|
|
cmd = cmd.subcommand(
|
|
Command::new("list-diagnostics")
|
|
.about("List all diagnostic codes with their metadata")
|
|
.long_about(
|
|
"List all diagnostic codes emitted during PDF parsing and extraction.\n\
|
|
Each diagnostic includes severity, recoverable flag, phase origin,\n\
|
|
and suggested action."
|
|
)
|
|
);
|
|
|
|
// explain-diagnostic subcommand
|
|
cmd = cmd.subcommand(
|
|
Command::new("explain-diagnostic")
|
|
.about("Explain a specific diagnostic code in detail")
|
|
.arg(
|
|
Arg::new("code")
|
|
.value_name("CODE")
|
|
.required(true)
|
|
.help("Diagnostic code to explain (e.g., STRUCT_MISSING_KEY, STREAM_BOMB)")
|
|
)
|
|
);
|
|
|
|
// Assemble the page: a hand-written preamble first, then the rendered command tree.
// The markdown is produced manually from the `Command` built above; nothing is captured from stdout.
|
|
let mut buffer = String::new();
|
|
buffer.push_str("# CLI Reference\n\n");
|
|
buffer.push_str("This page provides comprehensive documentation for all pdftract CLI commands and flags.\n\n");
|
|
buffer.push_str("## Usage\n\n");
|
|
buffer.push_str("```bash\npdftract [OPTIONS] <COMMAND>\n```\n\n");
|
|
buffer.push_str("## Global Options\n\n");
|
|
buffer.push_str("These options are available across all subcommands:\n\n");
|
|
buffer.push_str("- `-h, --help` - Print help information\n");
|
|
buffer.push_str("- `-V, --version` - Print version information\n\n");
|
|
buffer.push_str("## Commands\n\n");
|
|
|
|
// Use clap-markdown's CommandFactory API
|
|
// Since the cmd we built implements Command, we need to convert it
|
|
// clap-markdown 0.1 expects to call .command() on a CommandFactory type
|
|
// We'll manually generate the markdown for our custom command
|
|
|
|
fn command_to_markdown(cmd: &Command, depth: usize) -> String {
|
|
let mut result = String::new();
|
|
let indent = " ".repeat(depth * 2);
|
|
|
|
// Command name and description
|
|
if depth == 0 {
|
|
result.push_str(&format!("### `{}`\n\n", cmd.get_name()));
|
|
} else {
|
|
result.push_str(&format!("{}#### `{}`\n\n", indent, cmd.get_name()));
|
|
}
|
|
|
|
// About
|
|
if let Some(about) = cmd.get_about() {
|
|
result.push_str(&format!("{}\n\n", about));
|
|
}
|
|
|
|
// Long about
|
|
if let Some(long_about) = cmd.get_long_about() {
|
|
if let Some(about) = cmd.get_about() {
|
|
if long_about != about {
|
|
result.push_str(&format!("{}\n\n", long_about));
|
|
}
|
|
} else {
|
|
result.push_str(&format!("{}\n\n", long_about));
|
|
}
|
|
}
|
|
|
|
// Usage
|
|
let mut usage = String::new();
|
|
usage.push_str(&cmd.get_name());
|
|
if let Some(subcommand) = cmd.get_subcommands().find(|s| s.get_name() == "help") {
|
|
// Skip help subcommand
|
|
}
|
|
result.push_str(&format!("**Usage:**\n\n```bash\npdftract {}\n```\n\n", usage));
|
|
|
|
// Arguments
|
|
let positional_args: Vec<_> = cmd.get_positionals()
|
|
.filter(|a| !a.is_hide_set())
|
|
.collect();
|
|
|
|
if !positional_args.is_empty() {
|
|
result.push_str("**Arguments:**\n\n");
|
|
for arg in positional_args {
|
|
result.push_str(&format!("- `<{}>`", arg.get_id()));
|
|
if let Some(help) = arg.get_help() {
|
|
result.push_str(&format!(" - {}", help));
|
|
}
|
|
if arg.is_required_set() {
|
|
result.push_str(" (required)");
|
|
}
|
|
result.push_str("\n");
|
|
}
|
|
result.push_str("\n");
|
|
}
|
|
|
|
// Options
|
|
let options: Vec<_> = cmd.get_opts()
|
|
.filter(|o| !o.is_hide_set())
|
|
.collect();
|
|
|
|
if !options.is_empty() {
|
|
result.push_str("**Options:**\n\n");
|
|
for opt in options {
|
|
let mut names = Vec::new();
|
|
if let Some(short) = opt.get_short() {
|
|
names.push(format!("-{}", short));
|
|
}
|
|
if let Some(long) = opt.get_long() {
|
|
names.push(format!("--{}", long));
|
|
}
|
|
result.push_str(&format!("- `{}`", names.join(", ")));
|
|
if let Some(value_name) = opt.get_value_names() {
|
|
result.push_str(&format!(" <{}>", value_name.join(" ")));
|
|
}
|
|
if let Some(help) = opt.get_help() {
|
|
result.push_str(&format!(" - {}", help));
|
|
}
|
|
if let Some(default) = opt.get_default_values().first() {
|
|
result.push_str(&format!(" (default: `{}`)", default.to_string_lossy()));
|
|
}
|
|
result.push_str("\n");
|
|
}
|
|
result.push_str("\n");
|
|
}
|
|
|
|
// Subcommands
|
|
let subcommands: Vec<_> = cmd.get_subcommands()
|
|
.filter(|s| !s.is_hide_set())
|
|
.collect();
|
|
|
|
for subcmd in subcommands {
|
|
result.push_str(&command_to_markdown(subcmd, depth + 1));
|
|
}
|
|
|
|
result
|
|
}
|
|
|
|
buffer.push_str(&command_to_markdown(&cmd, 0));
|
|
|
|
buffer
|
|
}
|