pdftract/crates/pdftract-cli/src/grep/mod.rs
jedarden 7a70bb82b8 feat(pdftract-ixzbg): implement regex engine wiring for grep subcommand
Implement bead 7.8.2: Build the per-search matcher from GrepArgs.
Compile PATTERN into either a literal Aho-Corasick automaton (-F mode,
default) or a regex::Regex (-E mode). Apply -i (case-insensitive) and
-w (word-boundary) wrappers. Provide a uniform Matcher::find_iter(text)
-> Iter<MatchRange> API used by the per-span matcher.

Key changes:
- Add aho-corasick dependency for fast literal matching
- Create grep/matcher.rs with MatchRange and Matcher enum
- Reorganize grep.rs -> grep/mod.rs for proper module structure
- Implement literal mode with Aho-Corasick automaton
- Implement regex mode with regex::Regex
- Support case-insensitive matching in both modes
- Support word-boundary matching (\b anchors for regex, post-match check for literal)
- Comprehensive unit tests for all modes and edge cases

Closes: pdftract-ixzbg
2026-05-24 06:30:02 -04:00

378 lines
11 KiB
Rust

use anyhow::{Context, Result};
use clap::Parser;
use std::path::PathBuf;
// Matcher module
mod matcher;
pub use matcher::{MatchRange, Matcher};
/// How the progress bar should behave for a grep run.
///
/// Produced by [`GrepArgs::progress_mode`] from the `--progress`,
/// `--no-progress` and `--progress-json` flags.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProgressMode {
    /// No explicit choice was made; intended to mean "on when attached to a
    /// TTY, off otherwise".
    Auto,
    /// The progress bar was explicitly requested.
    On,
    /// The progress bar was explicitly disabled.
    Off,
}
// Command-line surface of the `grep` subcommand, parsed by clap's derive API.
//
// NOTE: the `///` doc comments in this struct are picked up by
// `#[derive(Parser)]` and become the user-visible `--help` text, so they are
// effectively runtime strings. Maintainer notes therefore use plain `//`
// comments, which clap ignores.
//
// Raw flags are turned into a normalized `GrepConfig` by `GrepArgs::validate`.
/// Grep subcommand arguments
#[derive(Parser, Debug)]
pub struct GrepArgs {
// validate() rejects an empty pattern and a pattern containing a NUL byte.
/// Search pattern (literal string by default; regex with -E)
#[arg(value_name = "PATTERN")]
pub pattern: String,
// Positional and repeatable; "." is substituted when no PATH is given.
/// Paths to search (files, directories, or URLs; default: ".")
#[arg(value_name = "PATH", default_value = ".")]
pub paths: Vec<PathBuf>,
// validate() turns recursion on whenever any PATH is an existing directory,
// so this flag only changes the outcome when no PATH is a directory.
/// Recurse into directories (default: on when path is a directory)
#[arg(short, long)]
pub recursive: bool,
/// Case-insensitive search
#[arg(short, long)]
pub ignore_case: bool,
// Mode resolution happens in validate(): regex is used only when -E is set
// and -F is not, so -F wins if both are passed.
/// Treat PATTERN as a full regular expression
#[arg(short = 'E', long)]
pub extended_regexp: bool,
// Literal matching is already the default; passing -F additionally overrides -E.
/// Literal string match (default: on)
#[arg(short = 'F', long)]
pub fixed_strings: bool,
/// Match on word boundaries
#[arg(short = 'w', long)]
pub word_regexp: bool,
/// Invert match: print non-matching spans instead
#[arg(short = 'v', long)]
pub invert_match: bool,
/// Print only filenames with at least one match
#[arg(short = 'l', long)]
pub files_with_matches: bool,
/// Print match counts per file
#[arg(short = 'c', long)]
pub count: bool,
// When omitted, validate() falls back to num_cpus::get().
/// Worker thread count (default: CPU count)
#[arg(short = 'j', long, value_name = "N")]
pub threads: Option<usize>,
/// Run OCR on scanned pages too (slower)
#[arg(long)]
pub ocr: bool,
/// JSON-Lines output (one match per line)
#[arg(long)]
pub json: bool,
// validate() creates this directory (including parents) when it is missing,
// i.e. validation has a filesystem side effect when --highlight is used.
/// Write annotated PDFs to DIR/<name>-highlighted.pdf
#[arg(long, value_name = "DIR")]
pub highlight: Option<PathBuf>,
/// Stop after N total matches
#[arg(long, value_name = "N")]
pub max_results: Option<usize>,
// The three progress flags below are resolved by progress_mode():
// --progress-json or --no-progress => Off (both take precedence over
// --progress), --progress => On, none of them => Auto.
/// Show progress bar (default: auto)
#[arg(long)]
pub progress: bool,
/// Force-disable the progress bar
#[arg(long)]
pub no_progress: bool,
/// Emit machine-readable progress events to stderr
#[arg(long)]
pub progress_json: bool,
// Long-only flag (no `short` is declared for it).
/// Suppress all output except exit code
#[arg(long)]
pub quiet: bool,
}
impl GrepArgs {
    /// Resolves the progress-related flags into a single [`ProgressMode`].
    ///
    /// Precedence, highest first:
    /// 1. `--progress-json` or `--no-progress` yields [`ProgressMode::Off`]
    /// 2. `--progress` yields [`ProgressMode::On`]
    /// 3. otherwise [`ProgressMode::Auto`]
    ///
    /// This method only inspects the flags; it performs no TTY probing, so
    /// `Auto` is returned unresolved.
    pub fn progress_mode(&self) -> ProgressMode {
        // JSON progress events replace the progress bar instead of joining it,
        // so --progress-json disables the bar exactly like --no-progress.
        if self.progress_json || self.no_progress {
            ProgressMode::Off
        } else if self.progress {
            ProgressMode::On
        } else {
            ProgressMode::Auto
        }
    }

    /// Validates the raw arguments and produces a normalized [`GrepConfig`].
    ///
    /// Normalization rules:
    /// - Regex mode is selected only when `-E` is set and `-F` is not; literal
    ///   matching is the default and `-F` wins when both are given.
    /// - Recursion is enabled when `-r` was passed or when any path is an
    ///   existing directory.
    /// - A missing or zero `-j` value falls back to the CPU count, so the
    ///   resulting configuration never asks for zero worker threads.
    ///
    /// # Side effects
    ///
    /// When `--highlight DIR` is given, `DIR` (and any missing parents) is
    /// created on disk.
    ///
    /// # Errors
    ///
    /// Returns an error when:
    /// - the crate was built without the `grep` feature,
    /// - the pattern is empty or contains a NUL byte,
    /// - the highlight directory cannot be created, including the case where
    ///   the path already exists but is not a directory.
    pub fn validate(&self) -> Result<GrepConfig> {
        // Without the `grep` feature the subcommand is a stub that always fails.
        #[cfg(not(feature = "grep"))]
        {
            anyhow::bail!("feature 'grep' not compiled in. Build pdftract with: --features grep");
        }

        if self.pattern.is_empty() {
            anyhow::bail!("PATTERN may not be empty");
        }
        if self.pattern.contains('\0') {
            anyhow::bail!("PATTERN may not contain null byte");
        }

        // Literal is the default; -E opts into regex unless -F is also present.
        let use_regex = self.extended_regexp && !self.fixed_strings;

        // Directories are always searched recursively (ripgrep compat); the
        // flag only matters when none of the paths is a directory.
        let recursive = self.recursive || self.paths.iter().any(|path| path.is_dir());

        // `create_dir_all` succeeds on an existing directory, so no prior
        // `exists()` probe is needed. Calling it unconditionally also rejects
        // a path that exists but is a regular file, which an `exists()` guard
        // would wave through only to fail later when writing output.
        let highlight_dir = match &self.highlight {
            Some(dir) => {
                std::fs::create_dir_all(dir).with_context(|| {
                    format!("Failed to create highlight directory: {}", dir.display())
                })?;
                Some(dir.clone())
            }
            None => None,
        };

        // `-j 0` is treated like an omitted `-j`: a pool of zero workers can
        // make no progress, and ripgrep likewise reads 0 as "pick for me".
        let threads = match self.threads {
            Some(count) if count > 0 => count,
            _ => num_cpus::get(),
        };

        Ok(GrepConfig {
            pattern: self.pattern.clone(),
            paths: self.paths.clone(),
            recursive,
            ignore_case: self.ignore_case,
            use_regex,
            word_regexp: self.word_regexp,
            invert_match: self.invert_match,
            files_with_matches: self.files_with_matches,
            count: self.count,
            threads,
            ocr: self.ocr,
            json: self.json,
            highlight_dir,
            max_results: self.max_results,
            progress_mode: self.progress_mode(),
            progress_json: self.progress_json,
            quiet: self.quiet,
        })
    }
}
/// Fully resolved settings for one grep run.
///
/// Built by `GrepArgs::validate`; most fields are copied straight from the
/// command line, the rest are derived as noted on the field.
#[derive(Debug, Clone)]
pub struct GrepConfig {
    /// The search pattern exactly as given (non-empty, no NUL byte).
    pub pattern: String,
    /// Paths to search.
    pub paths: Vec<PathBuf>,
    /// Whether to recurse: set when `-r` was passed or any path is an
    /// existing directory.
    pub recursive: bool,
    /// Case-insensitive matching (`-i`).
    pub ignore_case: bool,
    /// `true` for regex matching, `false` for literal matching.
    pub use_regex: bool,
    /// Match on word boundaries only (`-w`).
    pub word_regexp: bool,
    /// Invert the match (`-v`).
    pub invert_match: bool,
    /// Report only file names that contain a match (`-l`).
    pub files_with_matches: bool,
    /// Report match counts per file (`-c`).
    pub count: bool,
    /// Worker thread count; the CPU count when `-j` was not given.
    pub threads: usize,
    /// Run OCR on scanned pages too (`--ocr`).
    pub ocr: bool,
    /// Emit JSON-Lines output (`--json`).
    pub json: bool,
    /// Output directory for highlighted PDFs, if requested; validation
    /// creates it when it is missing.
    pub highlight_dir: Option<PathBuf>,
    /// Optional cap on the total number of matches (`--max-results`).
    pub max_results: Option<usize>,
    /// Resolved progress-bar behaviour.
    pub progress_mode: ProgressMode,
    /// Emit machine-readable progress events (`--progress-json`).
    pub progress_json: bool,
    /// Suppress all output except the exit code (`--quiet`).
    pub quiet: bool,
}
/// Entry point for the `grep` subcommand.
///
/// Validates `args`, propagating any validation error to the caller. The
/// search itself is not implemented yet, so on successful validation this
/// prints a summary of the resolved configuration to stderr (skipped when
/// `quiet` is set) and then terminates the process with exit status 2.
///
/// Because the success path ends in `std::process::exit`, this function never
/// actually returns `Ok(())`.
pub fn run_grep(args: GrepArgs) -> Result<()> {
    let cfg = args.validate()?;

    // TODO: replace this placeholder with the real search once it exists.
    if cfg.quiet {
        std::process::exit(2);
    }

    let match_mode = match cfg.use_regex {
        true => "regex",
        false => "literal",
    };

    eprintln!("pdftract grep: mode not yet implemented");
    eprintln!("Pattern: {}", cfg.pattern);
    eprintln!("Paths: {:?}", cfg.paths);
    eprintln!("Match mode: {}", match_mode);
    eprintln!("Case-insensitive: {}", cfg.ignore_case);
    eprintln!("Word boundaries: {}", cfg.word_regexp);
    eprintln!("Invert match: {}", cfg.invert_match);

    std::process::exit(2)
}
// Unit tests for argument parsing and `GrepArgs::validate` normalization.
//
// Every test goes through `parse_args`, which runs clap parsing followed by
// validation, so the assertions cover the same path the binary takes.
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `args` (argv[0] included) and validates them into a `GrepConfig`.
    fn parse_args(args: &[&str]) -> Result<GrepConfig> {
        let args = GrepArgs::parse_from(args);
        args.validate()
    }

    #[test]
    fn test_default_literal_mode() {
        let config = parse_args(&["grep", "test"]).unwrap();
        assert!(!config.use_regex, "default should be literal mode");
        assert_eq!(config.pattern, "test");
        assert_eq!(config.paths, vec![PathBuf::from(".")]);
    }

    #[test]
    fn test_extended_regex_mode() {
        let config = parse_args(&["grep", "-E", r"\d+"]).unwrap();
        assert!(config.use_regex, "-E should enable regex mode");
        assert_eq!(config.pattern, r"\d+");
    }

    #[test]
    fn test_fixed_strings_mode() {
        let config = parse_args(&["grep", "-F", "test"]).unwrap();
        assert!(!config.use_regex, "-F should enable literal mode");
        assert_eq!(config.pattern, "test");
    }

    #[test]
    fn test_fixed_strings_overrides_extended_regex() {
        let config = parse_args(&["grep", "-E", "-F", "test"]).unwrap();
        assert!(!config.use_regex, "-F should win when combined with -E");
    }

    #[test]
    fn test_ignore_case() {
        let config = parse_args(&["grep", "-i", "test"]).unwrap();
        assert!(config.ignore_case, "-i should enable case-insensitive");
    }

    #[test]
    fn test_word_regexp() {
        let config = parse_args(&["grep", "-w", "test"]).unwrap();
        assert!(config.word_regexp, "-w should enable word boundaries");
    }

    #[test]
    fn test_invert_match() {
        let config = parse_args(&["grep", "-v", "test"]).unwrap();
        assert!(config.invert_match, "-v should enable invert match");
    }

    #[test]
    fn test_files_with_matches() {
        let config = parse_args(&["grep", "-l", "test"]).unwrap();
        assert!(
            config.files_with_matches,
            "-l should enable files-with-matches"
        );
    }

    #[test]
    fn test_count() {
        let config = parse_args(&["grep", "-c", "test"]).unwrap();
        assert!(config.count, "-c should enable count mode");
    }

    #[test]
    fn test_json_output() {
        let config = parse_args(&["grep", "--json", "test"]).unwrap();
        assert!(config.json, "--json should enable JSON output");
    }

    #[test]
    fn test_ocr_flag() {
        let config = parse_args(&["grep", "--ocr", "test"]).unwrap();
        assert!(config.ocr, "--ocr should enable OCR");
    }

    #[test]
    fn test_quiet_flag() {
        let config = parse_args(&["grep", "--quiet", "test"]).unwrap();
        assert!(config.quiet, "--quiet should suppress output");
    }

    #[test]
    fn test_empty_pattern_rejected() {
        let result = parse_args(&["grep", ""]);
        assert!(result.is_err(), "empty pattern should be rejected");
    }

    #[test]
    fn test_null_byte_pattern_rejected() {
        let result = parse_args(&["grep", "test\0pattern"]);
        assert!(result.is_err(), "null byte in pattern should be rejected");
    }

    #[test]
    fn test_progress_mode_auto() {
        let config = parse_args(&["grep", "test"]).unwrap();
        assert_eq!(config.progress_mode, ProgressMode::Auto);
    }

    #[test]
    fn test_progress_mode_on() {
        let config = parse_args(&["grep", "--progress", "test"]).unwrap();
        assert_eq!(config.progress_mode, ProgressMode::On);
    }

    #[test]
    fn test_progress_mode_off() {
        let config = parse_args(&["grep", "--no-progress", "test"]).unwrap();
        assert_eq!(config.progress_mode, ProgressMode::Off);
    }

    #[test]
    fn test_no_progress_overrides_progress() {
        let config = parse_args(&["grep", "--progress", "--no-progress", "test"]).unwrap();
        assert_eq!(config.progress_mode, ProgressMode::Off);
    }

    #[test]
    fn test_progress_json_disables_bar() {
        let config = parse_args(&["grep", "--progress-json", "test"]).unwrap();
        assert_eq!(config.progress_mode, ProgressMode::Off);
        assert!(config.progress_json);
    }

    #[test]
    fn test_recursive_default_for_directory() {
        // "." is a directory on every platform, unlike a hard-coded "/tmp".
        let config = parse_args(&["grep", "test", "."]).unwrap();
        assert!(
            config.recursive,
            "should default to recursive for directory paths"
        );
    }

    #[test]
    fn test_not_recursive_for_non_directory() {
        let config = parse_args(&[
            "grep",
            "test",
            "pdftract-grep-test-no-such-dir/missing.pdf",
        ])
        .unwrap();
        assert!(
            !config.recursive,
            "non-directory paths should not imply recursion"
        );
    }

    #[test]
    fn test_recursive_flag_for_non_directory() {
        let config = parse_args(&[
            "grep",
            "-r",
            "test",
            "pdftract-grep-test-no-such-dir/missing.pdf",
        ])
        .unwrap();
        assert!(config.recursive, "-r should enable recursion explicitly");
    }

    #[test]
    fn test_threads_default() {
        let config = parse_args(&["grep", "test"]).unwrap();
        assert_eq!(
            config.threads,
            num_cpus::get(),
            "default threads should be CPU count"
        );
    }

    #[test]
    fn test_threads_custom() {
        let config = parse_args(&["grep", "-j", "4", "test"]).unwrap();
        assert_eq!(config.threads, 4);
    }

    #[test]
    fn test_max_results() {
        let config = parse_args(&["grep", "--max-results", "100", "test"]).unwrap();
        assert_eq!(config.max_results, Some(100));
    }
}