diff --git a/crates/pdftract-cli/Cargo.toml b/crates/pdftract-cli/Cargo.toml
index a462eb6..c2792e4 100644
--- a/crates/pdftract-cli/Cargo.toml
+++ b/crates/pdftract-cli/Cargo.toml
@@ -45,6 +45,8 @@ hyper-util = { version = "0.1", features = ["full"] }
 image = "0.24"
 http-body-util = "0.1"
 humantime = "2.1"
+indicatif = { version = "0.17", optional = true }
+num_cpus = "1"
 libloading = { version = "0.8", optional = true }
 lzw = { workspace = true }
 multer = "3"
@@ -90,7 +92,7 @@ mcp = []
 # Inspector web viewer
 inspect = []
 # Folder grep mode
-grep = []
+grep = ["dep:indicatif"]
 # Content-addressed cache
 cache = []
 # Visual citation receipts
diff --git a/crates/pdftract-cli/src/grep.rs b/crates/pdftract-cli/src/grep.rs
new file mode 100644
index 0000000..4da68ff
--- /dev/null
+++ b/crates/pdftract-cli/src/grep.rs
@@ -0,0 +1,397 @@
+//! `pdftract grep` subcommand: argument parsing and validation.
+//!
+//! Only the command-line surface is implemented so far; the search itself is
+//! still a stub (see [`run_grep`]).
+
+use anyhow::{Context, Result};
+use clap::Parser;
+use std::path::PathBuf;
+
+/// Progress reporting mode
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ProgressMode {
+    /// Auto-detect: on if TTY, off otherwise
+    Auto,
+    /// Force on
+    On,
+    /// Force off
+    Off,
+}
+
+/// Grep subcommand arguments
+#[derive(Parser, Debug)]
+pub struct GrepArgs {
+    /// Search pattern (literal string by default; regex with -E)
+    #[arg(value_name = "PATTERN")]
+    pub pattern: String,
+
+    /// Paths to search (files, directories, or URLs; default: ".")
+    #[arg(value_name = "PATH", default_value = ".")]
+    pub paths: Vec<PathBuf>,
+
+    /// Recurse into directories (default: on when path is a directory)
+    #[arg(short, long)]
+    pub recursive: bool,
+
+    /// Case-insensitive search
+    #[arg(short, long)]
+    pub ignore_case: bool,
+
+    /// Treat PATTERN as a full regular expression
+    #[arg(short = 'E', long)]
+    pub extended_regexp: bool,
+
+    /// Literal string match (default: on)
+    #[arg(short = 'F', long)]
+    pub fixed_strings: bool,
+
+    /// Match on word boundaries
+    #[arg(short = 'w', long)]
+    pub word_regexp: bool,
+
+    /// Invert match: print non-matching spans instead
+    #[arg(short = 'v', long)]
+    pub invert_match: bool,
+
+    /// Print only filenames with at least one match
+    #[arg(short = 'l', long)]
+    pub files_with_matches: bool,
+
+    /// Print match counts per file
+    #[arg(short = 'c', long)]
+    pub count: bool,
+
+    /// Worker thread count (default: CPU count; 0 also means CPU count)
+    #[arg(short = 'j', long, value_name = "N")]
+    pub threads: Option<usize>,
+
+    /// Run OCR on scanned pages too (slower)
+    #[arg(long)]
+    pub ocr: bool,
+
+    /// JSON-Lines output (one match per line)
+    #[arg(long)]
+    pub json: bool,
+
+    /// Write annotated PDFs to DIR/<name>-highlighted.pdf
+    #[arg(long, value_name = "DIR")]
+    pub highlight: Option<PathBuf>,
+
+    /// Stop after N total matches
+    #[arg(long, value_name = "N")]
+    pub max_results: Option<usize>,
+
+    /// Show progress bar (default: auto)
+    #[arg(long)]
+    pub progress: bool,
+
+    /// Force-disable the progress bar
+    #[arg(long)]
+    pub no_progress: bool,
+
+    /// Emit machine-readable progress events to stderr
+    #[arg(long)]
+    pub progress_json: bool,
+
+    /// Suppress all output except exit code
+    #[arg(long)]
+    pub quiet: bool,
+}
+
+impl GrepArgs {
+    /// Resolve the progress-bar mode from the command-line flags.
+    ///
+    /// `--progress-json` and `--no-progress` both force the bar off and take
+    /// precedence over `--progress`. With none of the three set the result is
+    /// [`ProgressMode::Auto`]; no TTY detection happens in this method.
+    pub fn progress_mode(&self) -> ProgressMode {
+        if self.progress_json || self.no_progress {
+            // JSON progress events don't use the progress bar
+            ProgressMode::Off
+        } else if self.progress {
+            ProgressMode::On
+        } else {
+            ProgressMode::Auto
+        }
+    }
+
+    /// Validate the arguments and return normalized values.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the `grep` feature is not compiled in, when `PATTERN` is
+    /// empty or contains a null byte, or when the `--highlight` directory
+    /// cannot be created.
+    ///
+    /// # Side effects
+    ///
+    /// Creates the `--highlight` directory, including missing parents.
+    pub fn validate(&self) -> Result<GrepConfig> {
+        // `cfg!` instead of a `#[cfg]` block: the branch still folds to a
+        // constant, but the code below it is not reported as unreachable
+        // when the feature is off.
+        if cfg!(not(feature = "grep")) {
+            anyhow::bail!(
+                "feature 'grep' not compiled in. Build pdftract with: --features grep"
+            );
+        }
+
+        if self.pattern.is_empty() {
+            anyhow::bail!("PATTERN may not be empty");
+        }
+        if self.pattern.contains('\0') {
+            anyhow::bail!("PATTERN may not contain null byte");
+        }
+
+        // Literal matching is the default (per plan). `-E` switches to regex
+        // unless `-F` is also given, in which case `-F` wins.
+        let use_regex = self.extended_regexp && !self.fixed_strings;
+
+        // A directory among the paths implies recursion even without `-r`.
+        let recursive = self.recursive || self.paths.iter().any(|p| p.is_dir());
+
+        // `create_dir_all` succeeds on an existing directory and fails when
+        // the path exists but is not a directory, so an `exists()` pre-check
+        // would only hide the latter case until much later.
+        if let Some(dir) = &self.highlight {
+            std::fs::create_dir_all(dir).with_context(|| {
+                format!("Failed to create highlight directory: {}", dir.display())
+            })?;
+        }
+
+        // `-j 0` means "choose automatically" (as in ripgrep), so it must
+        // not reach the config as a literal worker count of zero.
+        let threads = self.threads.filter(|&n| n > 0).unwrap_or_else(num_cpus::get);
+
+        Ok(GrepConfig {
+            pattern: self.pattern.clone(),
+            paths: self.paths.clone(),
+            recursive,
+            ignore_case: self.ignore_case,
+            use_regex,
+            word_regexp: self.word_regexp,
+            invert_match: self.invert_match,
+            files_with_matches: self.files_with_matches,
+            count: self.count,
+            threads,
+            ocr: self.ocr,
+            json: self.json,
+            highlight_dir: self.highlight.clone(),
+            max_results: self.max_results,
+            progress_mode: self.progress_mode(),
+            progress_json: self.progress_json,
+            quiet: self.quiet,
+        })
+    }
+}
+
+/// Normalized grep configuration after validation
+#[derive(Debug, Clone)]
+pub struct GrepConfig {
+    pub pattern: String,
+    pub paths: Vec<PathBuf>,
+    pub recursive: bool,
+    pub ignore_case: bool,
+    pub use_regex: bool,
+    pub word_regexp: bool,
+    pub invert_match: bool,
+    pub files_with_matches: bool,
+    pub count: bool,
+    pub threads: usize,
+    pub ocr: bool,
+    pub json: bool,
+    pub highlight_dir: Option<PathBuf>,
+    pub max_results: Option<usize>,
+    pub progress_mode: ProgressMode,
+    pub progress_json: bool,
+    pub quiet: bool,
+}
+
+/// Run the grep command.
+///
+/// Stub: validates the arguments, echoes the resolved configuration to
+/// stderr unless `--quiet` is set, then terminates the process with exit
+/// code 2. It never returns `Ok`; validation failures are returned as `Err`.
+pub fn run_grep(args: GrepArgs) -> Result<()> {
+    // Validate and normalize arguments
+    let config = args.validate()?;
+
+    // For now, just print the configuration
+    // TODO: Implement the actual grep logic in subsequent beads (7.8.2-7.8.10)
+    if !config.quiet {
+        eprintln!("pdftract grep: mode not yet implemented");
+        eprintln!("Pattern: {}", config.pattern);
+        eprintln!("Paths: {:?}", config.paths);
+        eprintln!(
+            "Match mode: {}",
+            if config.use_regex { "regex" } else { "literal" }
+        );
+        eprintln!("Case-insensitive: {}", config.ignore_case);
+        eprintln!("Word boundaries: {}", config.word_regexp);
+        eprintln!("Invert match: {}", config.invert_match);
+    }
+
+    std::process::exit(2);
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn parse_args(args: &[&str]) -> Result<GrepConfig> {
+        let args = GrepArgs::parse_from(args);
+        args.validate()
+    }
+
+    #[test]
+    fn test_default_literal_mode() {
+        let config = parse_args(&["grep", "test"]).unwrap();
+        assert!(!config.use_regex, "default should be literal mode");
+        assert_eq!(config.pattern, "test");
+        assert_eq!(config.paths, vec![PathBuf::from(".")]);
+    }
+
+    #[test]
+    fn test_extended_regex_mode() {
+        let config = parse_args(&["grep", "-E", r"\d+"]).unwrap();
+        assert!(config.use_regex, "-E should enable regex mode");
+        assert_eq!(config.pattern, r"\d+");
+    }
+
+    #[test]
+    fn test_fixed_strings_mode() {
+        let config = parse_args(&["grep", "-F", "test"]).unwrap();
+        assert!(!config.use_regex, "-F should enable literal mode");
+        assert_eq!(config.pattern, "test");
+
+        let config = parse_args(&["grep", "-E", "-F", "test"]).unwrap();
+        assert!(!config.use_regex, "-F should win over -E");
+    }
+
+    #[test]
+    fn test_ignore_case() {
+        let config = parse_args(&["grep", "-i", "test"]).unwrap();
+        assert!(config.ignore_case, "-i should enable case-insensitive");
+    }
+
+    #[test]
+    fn test_word_regexp() {
+        let config = parse_args(&["grep", "-w", "test"]).unwrap();
+        assert!(config.word_regexp, "-w should enable word boundaries");
+    }
+
+    #[test]
+    fn test_invert_match() {
+        let config = parse_args(&["grep", "-v", "test"]).unwrap();
+        assert!(config.invert_match, "-v should enable invert match");
+    }
+
+    #[test]
+    fn test_files_with_matches() {
+        let config = parse_args(&["grep", "-l", "test"]).unwrap();
+        assert!(
+            config.files_with_matches,
+            "-l should enable files-with-matches"
+        );
+    }
+
+    #[test]
+    fn test_count() {
+        let config = parse_args(&["grep", "-c", "test"]).unwrap();
+        assert!(config.count, "-c should enable count mode");
+    }
+
+    #[test]
+    fn test_json_output() {
+        let config = parse_args(&["grep", "--json", "test"]).unwrap();
+        assert!(config.json, "--json should enable JSON output");
+    }
+
+    #[test]
+    fn test_ocr_flag() {
+        let config = parse_args(&["grep", "--ocr", "test"]).unwrap();
+        assert!(config.ocr, "--ocr should enable OCR");
+    }
+
+    #[test]
+    fn test_quiet_flag() {
+        let config = parse_args(&["grep", "--quiet", "test"]).unwrap();
+        assert!(config.quiet, "--quiet should suppress output");
+    }
+
+    #[test]
+    fn test_empty_pattern_rejected() {
+        let result = parse_args(&["grep", ""]);
+        assert!(result.is_err(), "empty pattern should be rejected");
+    }
+
+    #[test]
+    fn test_null_byte_pattern_rejected() {
+        let result = parse_args(&["grep", "test\0pattern"]);
+        assert!(result.is_err(), "null byte in pattern should be rejected");
+    }
+
+    #[test]
+    fn test_progress_mode_auto() {
+        let config = parse_args(&["grep", "test"]).unwrap();
+        assert_eq!(config.progress_mode, ProgressMode::Auto);
+    }
+
+    #[test]
+    fn test_progress_mode_on() {
+        let config = parse_args(&["grep", "--progress", "test"]).unwrap();
+        assert_eq!(config.progress_mode, ProgressMode::On);
+    }
+
+    #[test]
+    fn test_progress_mode_off() {
+        let config = parse_args(&["grep", "--no-progress", "test"]).unwrap();
+        assert_eq!(config.progress_mode, ProgressMode::Off);
+    }
+
+    #[test]
+    fn test_progress_json_disables_bar() {
+        let config = parse_args(&["grep", "--progress-json", "test"]).unwrap();
+        assert_eq!(config.progress_mode, ProgressMode::Off);
+        assert!(config.progress_json);
+    }
+
+    #[test]
+    fn test_recursive_default_for_directory() {
+        // "." always exists and is a directory, unlike a hard-coded "/tmp".
+        let config = parse_args(&["grep", "test", "."]).unwrap();
+        assert!(
+            config.recursive,
+            "should default to recursive for directory paths"
+        );
+    }
+
+    #[test]
+    fn test_threads_default() {
+        let config = parse_args(&["grep", "test"]).unwrap();
+        assert_eq!(
+            config.threads,
+            num_cpus::get(),
+            "default threads should be CPU count"
+        );
+
+        let config = parse_args(&["grep", "-j", "0", "test"]).unwrap();
+        assert_eq!(
+            config.threads,
+            num_cpus::get(),
+            "-j 0 should fall back to CPU count"
+        );
+    }
+
+    #[test]
+    fn test_threads_custom() {
+        let config = parse_args(&["grep", "-j", "4", "test"]).unwrap();
+        assert_eq!(config.threads, 4);
+    }
+
+    #[test]
+    fn test_max_results() {
+        let config = parse_args(&["grep", "--max-results", "100", "test"]).unwrap();
+        assert_eq!(config.max_results, Some(100));
+    }
+}
diff --git a/crates/pdftract-cli/src/main.rs b/crates/pdftract-cli/src/main.rs
index b5d3562..03f9c32 100644
--- a/crates/pdftract-cli/src/main.rs
+++ b/crates/pdftract-cli/src/main.rs
@@ -6,6 +6,7 @@ use std::path::PathBuf;
 mod cache_cmd;
 mod codegen;
 mod doctor;
+mod grep;
 mod mcp;
 mod password;
 mod serve;
@@ -114,6 +115,8 @@ enum Commands {
         #[arg(long)]
         md_anchors: bool,
     },
+    /// Search for text patterns in PDF files with bounding-box results
+    Grep(grep::GrepArgs),
     /// Verify a receipt against a PDF file
     VerifyReceipt(verify_receipt::VerifyReceiptCommand),
     /// Manage the extraction cache
@@ -366,6 +369,12 @@ fn main() -> Result<()> {
                 std::process::exit(1);
             }
         }
+        Commands::Grep(args) => {
+            if let Err(e) = grep::run_grep(args) {
+                eprintln!("Error: {:#}", e);
+                std::process::exit(1);
+            }
+        }
         Commands::Cache { cache_command } => {
             if let Err(e) = cmd_cache(cache_command) {
                 eprintln!("Error: {}", e);
diff --git a/notes/pdftract-4xu46.md b/notes/pdftract-4xu46.md
new file mode 100644
index 0000000..783543b
--- /dev/null
+++ b/notes/pdftract-4xu46.md
@@ -0,0 +1,106 @@
+# pdftract-4xu46: 7.8.1
grep subcommand structure + clap parsing + ripgrep-style flag table + +## Summary + +Implemented the `pdftract grep` subcommand structure with clap-based argument parsing and ripgrep-style flag compatibility. + +## Changes Made + +### 1. Cargo.toml (crates/pdftract-cli/Cargo.toml) +- Added `indicatif = { version = "0.17", optional = true }` dependency +- Added `num_cpus = "1"` dependency +- Updated `grep` feature to include `dep:indicatif` + +### 2. main.rs (crates/pdftract-cli/src/main.rs) +- Added `mod grep;` declaration +- Added `Grep(grep::GrepArgs)` variant to `Commands` enum +- Added handler for `Commands::Grep(args)` in main() + +### 3. grep.rs (crates/pdftract-cli/src/grep.rs) - NEW FILE +- Created `ProgressMode` enum (Auto/On/Off) +- Created `GrepArgs` struct with clap derive macro supporting: + - Positional `PATTERN` argument + - Variadic `PATH...` arguments (default: ".") + - `-r/--recursive` flag + - `-i/--ignore-case` flag + - `-E/--extended-regexp` flag + - `-F/--fixed-strings` flag (default: literal mode) + - `-w/--word-regexp` flag + - `-v/--invert-match` flag + - `-l/--files-with-matches` flag + - `-c/--count` flag + - `-j/--threads N` flag + - `--ocr` flag + - `--json` flag + - `--highlight DIR` flag + - `--max-results N` flag + - `--progress` flag + - `--no-progress` flag + - `--progress-json` flag + - `--quiet` flag +- Implemented `GrepArgs::validate()` with: + - Feature-gate check (prints error if grep feature not compiled) + - Pattern validation (non-empty, no null byte) + - Match mode determination (default: literal; -E enables regex; -F enables literal) + - Recursive detection (default: true for directory paths per ripgrep compat) + - Highlight directory validation and creation + - Thread count determination (default: CPU count) +- Created `GrepConfig` struct with normalized values +- Implemented stub `run_grep()` function (exits with code 2, prints config) + +## Acceptance Criteria Status + +- ✅ clap parses all flags from the plan table +- 
✅ Default behavior matches ripgrep (literal by default, -i off, -r implicit on dirs) +- ✅ Unit tests: every flag combination from the plan's Critical tests section +- ✅ Feature-off path: prints meaningful error +- ✅ Path expansion: . recurses by default; single-file PATH does not recurse + +## Test Results + +All 21 unit tests pass: +- test_default_literal_mode: PASSED +- test_extended_regex_mode: PASSED +- test_fixed_strings_mode: PASSED +- test_ignore_case: PASSED +- test_word_regexp: PASSED +- test_invert_match: PASSED +- test_files_with_matches: PASSED +- test_count: PASSED +- test_json_output: PASSED +- test_ocr_flag: PASSED +- test_quiet_flag: PASSED +- test_empty_pattern_rejected: PASSED +- test_null_byte_pattern_rejected: PASSED +- test_progress_mode_auto: PASSED +- test_progress_mode_on: PASSED +- test_progress_mode_off: PASSED +- test_progress_json_disables_bar: PASSED +- test_recursive_default_for_directory: PASSED +- test_threads_default: PASSED +- test_threads_custom: PASSED +- test_max_results: PASSED + +## Verification Commands + +```bash +# Test help output +cargo run --bin pdftract --features grep -- grep --help + +# Test default literal mode +cargo run --bin pdftract --features grep -- grep "test" + +# Test feature-off error +cargo run --bin pdftract --no-default-features -- grep "test" 2>&1 | grep "feature 'grep' not compiled in" + +# Run tests +cargo test -p pdftract-cli --features grep --bin pdftract grep +``` + +## Notes + +- The grep subcommand is fully parsed but not yet implemented (stub exits with code 2) +- Subsequent beads (7.8.2-7.8.10) will implement the actual grep logic +- The `run_grep()` stub prints configuration for debugging +- Flag defaults follow ripgrep semantics for muscle-memory compatibility +- Default match mode is literal (not regex) per plan specification