pdftract/crates/pdftract-cli/src/cache_cmd.rs
jedarden e6bf3dd290 feat(pdftract-3s2i): implement Phase 5.5.2 validation filter
Implement per-word validation filter for assisted-OCR BrokenVector path.

Changes:
- Add SpanSource::OcrAssisted variant to hybrid.rs
- Add Span::ocr_assisted() helper method
- Implement validate_ocr_with_position_hints() in ocr.rs
  - 5pt distance threshold for position validation
  - 0.4 confidence cap for rejected words
  - Linear scan for nearest-neighbor lookup
- Add unit tests for validation filter

Closes: pdftract-3s2i

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-24 04:57:17 -04:00

694 lines
23 KiB
Rust

//! Cache subcommand for managing the pdftract content-addressed cache.
//!
//! This module implements the `pdftract cache` subcommand with:
//! - `stats DIR` - show cache statistics
//! - `clear DIR` - delete all cache entries
//! - `purge DIR --older-than DURATION` - delete entries older than duration
//! - `purge DIR --version CONSTRAINT` - delete entries matching version constraint
//!   (the constraint is validated, but version-based deletion is not yet implemented)
use anyhow::{bail, Context, Result};
use pdftract_core::cache::layout::{self, CacheIndex};
use std::fs;
use std::io::{self, Write};
use std::path::{Path, PathBuf};
use std::time::{SystemTime, UNIX_EPOCH};
/// Aggregate statistics describing one cache directory.
///
/// Produced by `compute_stats` and rendered by `display_stats` (human-readable)
/// or `display_stats_json` (machine-readable).
#[derive(Debug)]
pub struct CacheStats {
/// Number of cache entries found on disk, i.e. files whose name yields a
/// size via `layout::parse_size_from_filename`.
pub entry_count: u64,
/// Total compressed size in bytes, summed from the sizes encoded in the
/// entry file names.
pub total_compressed_bytes: u64,
/// Total uncompressed size in bytes. This is an estimate: `compute_stats`
/// derives it from the compressed total using a fixed 8.5x ratio rather
/// than measuring it.
pub total_uncompressed_bytes: u64,
/// Cache hits since last clear, as recorded in the cache index.
pub hits: u64,
/// Total accesses since last clear, as recorded in the cache index.
pub total_accesses: u64,
/// Age in seconds of the entry with the oldest modification time, or `None`
/// when no entry modification time could be read.
pub oldest_entry_age_seconds: Option<u64>,
/// Age in seconds of the entry with the newest modification time, or `None`
/// when no entry modification time could be read.
pub newest_entry_age_seconds: Option<u64>,
/// Distribution of entry ages across fixed time buckets.
pub age_histogram: AgeHistogram,
}
/// Distribution of cache entry ages over five fixed, non-overlapping buckets.
///
/// Despite the cumulative-sounding field names, each counter only covers the
/// interval between the previous bound and its own bound (for example,
/// `less_than_1d` counts ages in `[1h, 1d)`), so every recorded age lands in
/// exactly one bucket.
#[derive(Debug, Default)]
pub struct AgeHistogram {
    /// Ages below one hour.
    pub less_than_1h: u64,
    /// Ages from one hour up to (not including) one day.
    pub less_than_1d: u64,
    /// Ages from one day up to (not including) seven days.
    pub less_than_7d: u64,
    /// Ages from seven days up to (not including) thirty days.
    pub less_than_30d: u64,
    /// Ages of thirty days or more.
    pub greater_than_30d: u64,
}

impl AgeHistogram {
    /// Adds one entry with the given age (in seconds) to the matching bucket.
    pub fn record(&mut self, age_seconds: u64) {
        const HOUR: u64 = 3_600;
        const DAY: u64 = 24 * HOUR;
        const WEEK: u64 = 7 * DAY;
        const THIRTY_DAYS: u64 = 30 * DAY;

        // Pick the single bucket this age belongs to, then bump it.
        let bucket = match age_seconds {
            secs if secs < HOUR => &mut self.less_than_1h,
            secs if secs < DAY => &mut self.less_than_1d,
            secs if secs < WEEK => &mut self.less_than_7d,
            secs if secs < THIRTY_DAYS => &mut self.less_than_30d,
            _ => &mut self.greater_than_30d,
        };
        *bucket += 1;
    }

    /// Returns the number of entries recorded across all buckets.
    pub fn total(&self) -> u64 {
        [
            self.less_than_1h,
            self.less_than_1d,
            self.less_than_7d,
            self.less_than_30d,
            self.greater_than_30d,
        ]
        .iter()
        .sum()
    }

    /// Expresses `count` as a percentage of all recorded entries.
    ///
    /// Returns `0.0` when nothing has been recorded, avoiding a division by zero.
    pub fn percentage(&self, count: u64) -> f64 {
        match self.total() {
            0 => 0.0,
            all => (count as f64 / all as f64) * 100.0,
        }
    }
}
/// Compute cache statistics for a given cache directory.
///
/// Walks the on-disk layout `cache_dir/<xx>/<yy>/<dir>/<file>`, where `<xx>`
/// and `<yy>` are directories named with exactly two hex digits. A file counts
/// as an entry only when `layout::parse_size_from_filename` yields a size for
/// its name; that size is added to the compressed total. Entry ages are
/// derived from file modification times.
///
/// A missing `cache_dir` is not an error: all-zero statistics are returned.
/// If the system clock reports a time before the Unix epoch, "now" is treated
/// as 0 so that every age saturates to 0 instead of panicking.
///
/// # Errors
///
/// Returns an error if the cache index cannot be loaded or if a directory in
/// the walk cannot be opened. Unreadable individual directory entries are
/// skipped; entries whose modification time cannot be read are still counted
/// but contribute no age information.
pub fn compute_stats(cache_dir: &Path) -> Result<CacheStats> {
    // True when `entry` is a directory whose name is exactly two hex digits.
    // The cheap name check runs first so non-matching names cost no stat call.
    fn is_hex_prefix_dir(entry: &fs::DirEntry) -> bool {
        let name = entry.file_name();
        let name = name.to_string_lossy();
        name.len() == 2 && name.chars().all(|c| c.is_ascii_hexdigit()) && entry.path().is_dir()
    }

    let mut stats = CacheStats {
        entry_count: 0,
        total_compressed_bytes: 0,
        total_uncompressed_bytes: 0,
        hits: 0,
        total_accesses: 0,
        oldest_entry_age_seconds: None,
        newest_entry_age_seconds: None,
        age_histogram: AgeHistogram::default(),
    };

    // If cache directory doesn't exist, return zero stats
    if !cache_dir.exists() {
        return Ok(stats);
    }

    let index = layout::load_index(cache_dir)?.unwrap_or_default();

    // A clock set before the epoch must not abort a read-only stats query.
    let now = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|elapsed| elapsed.as_secs())
        .unwrap_or(0);

    let mut oldest_mtime: Option<u64> = None;
    let mut newest_mtime: Option<u64> = None;

    for prefix1_entry in fs::read_dir(cache_dir)?
        .filter_map(|e| e.ok())
        .filter(is_hex_prefix_dir)
    {
        for prefix2_entry in prefix1_entry
            .path()
            .read_dir()?
            .filter_map(|e| e.ok())
            .filter(is_hex_prefix_dir)
        {
            for fp_entry in prefix2_entry
                .path()
                .read_dir()?
                .filter_map(|e| e.ok())
                .filter(|e| e.path().is_dir())
            {
                for entry in fp_entry.path().read_dir()?.filter_map(|e| e.ok()) {
                    let path = entry.path();
                    if !path.is_file() {
                        continue;
                    }

                    // Only files whose name encodes a size are cache entries.
                    let size = match path
                        .file_name()
                        .and_then(|name| name.to_str())
                        .and_then(|name| layout::parse_size_from_filename(name))
                    {
                        Some(size) => size,
                        None => continue,
                    };
                    stats.entry_count += 1;
                    stats.total_compressed_bytes += size as u64;

                    // Get mtime for age tracking; failures leave the entry counted
                    // but without age information.
                    let mtime_secs = path
                        .metadata()
                        .and_then(|metadata| metadata.modified())
                        .ok()
                        .and_then(|modified| modified.duration_since(UNIX_EPOCH).ok())
                        .map(|since_epoch| since_epoch.as_secs());

                    if let Some(mtime) = mtime_secs {
                        oldest_mtime = Some(oldest_mtime.map_or(mtime, |old| old.min(mtime)));
                        newest_mtime = Some(newest_mtime.map_or(mtime, |new| new.max(mtime)));
                        stats.age_histogram.record(now.saturating_sub(mtime));
                    }
                }
            }
        }
    }

    // Compute age stats
    stats.oldest_entry_age_seconds = oldest_mtime.map(|mtime| now.saturating_sub(mtime));
    stats.newest_entry_age_seconds = newest_mtime.map(|mtime| now.saturating_sub(mtime));

    // Estimate uncompressed size (assuming 8.5x compression ratio based on typical text).
    // The multiplication is done in u128 so a huge compressed total cannot overflow.
    let estimated = u128::from(stats.total_compressed_bytes) * 85 / 10;
    stats.total_uncompressed_bytes = if estimated > u128::from(u64::MAX) {
        u64::MAX
    } else {
        estimated as u64
    };

    // Hit ratio from index (if available)
    stats.hits = index.hits;
    stats.total_accesses = index.total_accesses;

    Ok(stats)
}
/// Print cache statistics to stdout as a short human-readable report.
///
/// The report lists the entry count, compressed (MiB) and uncompressed (GiB)
/// sizes with their ratio, the hit ratio, the ages of the oldest and newest
/// entries, and the age histogram as percentages. Ratios fall back to `0.0`
/// when their denominator is zero.
pub fn display_stats(stats: &CacheStats) {
    const BYTES_PER_MIB: f64 = 1024.0 * 1024.0;

    let compressed_mib = stats.total_compressed_bytes as f64 / BYTES_PER_MIB;
    let uncompressed_mib = stats.total_uncompressed_bytes as f64 / BYTES_PER_MIB;

    let compression_ratio = match stats.total_compressed_bytes {
        0 => 0.0,
        compressed => stats.total_uncompressed_bytes as f64 / compressed as f64,
    };
    let hit_percent = match stats.total_accesses {
        0 => 0.0,
        accesses => (stats.hits as f64 / accesses as f64) * 100.0,
    };

    // Oldest entry is always shown in days and hours.
    let oldest_line = match stats.oldest_entry_age_seconds {
        Some(age) => format!(
            "Oldest entry: {}d {}h ago",
            age / 86400,
            (age % 86400) / 3600
        ),
        None => String::from("Oldest entry: (none)"),
    };

    // Newest entry picks the coarsest unit pair that still shows detail.
    let newest_line = match stats.newest_entry_age_seconds {
        None => String::from("Newest entry: (none)"),
        Some(age) if age < 60 => format!("Newest entry: {}s ago", age),
        Some(age) if age < 3600 => format!("Newest entry: {}m {}s ago", age / 60, age % 60),
        Some(age) => format!(
            "Newest entry: {}h {}m ago",
            age / 3600,
            (age % 3600) / 60
        ),
    };

    let histogram = &stats.age_histogram;
    let shares: Vec<f64> = [
        histogram.less_than_1h,
        histogram.less_than_1d,
        histogram.less_than_7d,
        histogram.less_than_30d,
        histogram.greater_than_30d,
    ]
    .iter()
    .map(|&bucket| histogram.percentage(bucket))
    .collect();

    println!("Entries: {}", stats.entry_count);
    println!(
        "Total size: {:.1} MiB compressed / {:.1} GiB uncompressed ({:.1}x ratio)",
        compressed_mib,
        uncompressed_mib / 1024.0,
        compression_ratio
    );
    println!(
        "Hit ratio (since last clear): {:.1}% ({} hits / {} total)",
        hit_percent, stats.hits, stats.total_accesses
    );
    println!("{}", oldest_line);
    println!("{}", newest_line);
    println!(
        "Age histogram: <1h: {:.1}%, <1d: {:.1}%, <7d: {:.1}%, <30d: {:.1}%, >30d: {:.1}%",
        shares[0], shares[1], shares[2], shares[3], shares[4]
    );
}
/// Print cache statistics to stdout as a pretty-printed JSON object.
///
/// In addition to the raw counters, the object carries two derived values,
/// `compression_ratio` and `hit_ratio_percent`; each is `0.0` when its
/// denominator is zero.
///
/// # Errors
///
/// Returns an error if the JSON value cannot be serialized.
pub fn display_stats_json(stats: &CacheStats) -> Result<()> {
    let compression_ratio = match stats.total_compressed_bytes {
        0 => 0.0,
        compressed => stats.total_uncompressed_bytes as f64 / compressed as f64,
    };
    let hit_ratio_percent = match stats.total_accesses {
        0 => 0.0,
        accesses => (stats.hits as f64 / accesses as f64) * 100.0,
    };
    let histogram = &stats.age_histogram;

    let document = serde_json::json!({
        "entry_count": stats.entry_count,
        "total_compressed_bytes": stats.total_compressed_bytes,
        "total_uncompressed_bytes": stats.total_uncompressed_bytes,
        "compression_ratio": compression_ratio,
        "hits": stats.hits,
        "total_accesses": stats.total_accesses,
        "hit_ratio_percent": hit_ratio_percent,
        "oldest_entry_age_seconds": stats.oldest_entry_age_seconds,
        "newest_entry_age_seconds": stats.newest_entry_age_seconds,
        "age_histogram": {
            "less_than_1h": histogram.less_than_1h,
            "less_than_1d": histogram.less_than_1d,
            "less_than_7d": histogram.less_than_7d,
            "less_than_30d": histogram.less_than_30d,
            "greater_than_30d": histogram.greater_than_30d,
        }
    });

    let rendered = serde_json::to_string_pretty(&document)?;
    println!("{}", rendered);
    Ok(())
}
/// Clear all cache entries from the directory.
///
/// Prompts for confirmation unless -y is specified.
pub fn clear_cache(cache_dir: &Path, yes: bool) -> Result<()> {
// Check if directory exists
if !cache_dir.exists() {
println!("Cache directory does not exist: {}", cache_dir.display());
return Ok(());
}
// Count entries first
let entry_count = count_entries(cache_dir)?;
if entry_count == 0 {
println!("Cache is empty (0 entries)");
return Ok(());
}
// Confirm unless -y
if !yes {
if !prompt_confirmation(&format!("Delete all {} cache entries?", entry_count))? {
println!("Cancelled");
return Ok(());
}
}
// Delete all entry files (preserve index.json and sentinel)
let mut deleted = 0;
for prefix1_entry in fs::read_dir(cache_dir)?.filter_map(|e| e.ok()).filter(|e| {
e.path().is_dir()
&& e.file_name().to_string_lossy().len() == 2
&& e.file_name()
.to_string_lossy()
.chars()
.all(|c| c.is_ascii_hexdigit())
}) {
let prefix1_dir = prefix1_entry.path();
for prefix2_entry in prefix1_dir.read_dir()?.filter_map(|e| e.ok()).filter(|e| {
e.path().is_dir()
&& e.file_name().to_string_lossy().len() == 2
&& e.file_name()
.to_string_lossy()
.chars()
.all(|c| c.is_ascii_hexdigit())
}) {
let prefix2_dir = prefix2_entry.path();
for fp_entry in prefix2_dir
.read_dir()?
.filter_map(|e| e.ok())
.filter(|e| e.path().is_dir())
{
let fp_dir = fp_entry.path();
// Delete all files in the fingerprint directory
for entry in fp_dir.read_dir()?.filter_map(|e| e.ok()) {
let path = entry.path();
if path.is_file() {
let _ = fs::remove_file(&path);
deleted += 1;
}
}
// Remove the empty fingerprint directory
let _ = fs::remove_dir(&fp_dir);
}
// Remove empty second-level prefix directory
if prefix2_dir.read_dir()?.next().is_none() {
let _ = fs::remove_dir(&prefix2_dir);
}
}
// Remove empty first-level prefix directory
if prefix1_dir.read_dir()?.next().is_none() {
let _ = fs::remove_dir(&prefix1_dir);
}
}
// Reset index.json entry count and hit statistics
let mut index = layout::load_index(cache_dir)?.unwrap_or_default();
index.entry_count = 0;
index.total_bytes = 0;
index.hits = 0;
index.total_accesses = 0;
layout::save_index(cache_dir, &index)?;
println!("Deleted {} cache entries", deleted);
Ok(())
}
/// Purge cache entries older than the specified duration.
pub fn purge_cache_older_than(cache_dir: &Path, duration_str: &str) -> Result<()> {
use humantime::parse_duration;
let duration = parse_duration(duration_str).context(format!(
"Invalid duration '{}'. Use formats like '30d', '7d', '1h'",
duration_str
))?;
let cutoff_secs = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_secs()
.saturating_sub(duration.as_secs());
let mut deleted = 0;
for prefix1_entry in fs::read_dir(cache_dir)?.filter_map(|e| e.ok()).filter(|e| {
e.path().is_dir()
&& e.file_name().to_string_lossy().len() == 2
&& e.file_name()
.to_string_lossy()
.chars()
.all(|c| c.is_ascii_hexdigit())
}) {
let prefix1_dir = prefix1_entry.path();
for prefix2_entry in prefix1_dir.read_dir()?.filter_map(|e| e.ok()).filter(|e| {
e.path().is_dir()
&& e.file_name().to_string_lossy().len() == 2
&& e.file_name()
.to_string_lossy()
.chars()
.all(|c| c.is_ascii_hexdigit())
}) {
let prefix2_dir = prefix2_entry.path();
for fp_entry in prefix2_dir
.read_dir()?
.filter_map(|e| e.ok())
.filter(|e| e.path().is_dir())
{
let fp_dir = fp_entry.path();
for entry in fp_dir.read_dir()?.filter_map(|e| e.ok()) {
let path = entry.path();
if path.is_file() {
// Check mtime
if let Ok(metadata) = path.metadata() {
if let Ok(modified) = metadata.modified() {
if let Ok(duration) = modified.duration_since(UNIX_EPOCH) {
let mtime_secs = duration.as_secs();
if mtime_secs < cutoff_secs {
let _ = fs::remove_file(&path);
deleted += 1;
}
}
}
}
}
}
// Remove empty fingerprint directory
if fp_dir.read_dir()?.next().is_none() {
let _ = fs::remove_dir(&fp_dir);
}
}
// Remove empty second-level prefix directory
if prefix2_dir.read_dir()?.next().is_none() {
let _ = fs::remove_dir(&prefix2_dir);
}
}
// Remove empty first-level prefix directory
if prefix1_dir.read_dir()?.next().is_none() {
let _ = fs::remove_dir(&prefix1_dir);
}
}
// Update index (preserve hit stats, update entry count and bytes)
let remaining = count_entries(cache_dir)?;
let mut index = layout::load_index(cache_dir)?.unwrap_or_default();
index.entry_count = remaining;
index.total_bytes = compute_stats(cache_dir)?.total_compressed_bytes;
// hits and total_accesses are preserved during purge
layout::save_index(cache_dir, &index)?;
println!("Deleted {} entries older than {}", deleted, duration_str);
Ok(())
}
/// Purge cache entries matching a version constraint (currently a stub).
///
/// Validates `version_constraint` as a semver requirement and then prints a
/// notice that version-based purging is not implemented. No cache files are
/// read or removed, and the cache directory argument is unused.
///
/// # Errors
///
/// Returns an error if `version_constraint` is not a valid version requirement.
pub fn purge_cache_version(_cache_dir: &Path, version_constraint: &str) -> Result<()> {
    use semver::VersionReq;

    // Parse only to reject malformed constraints early; the result is unused
    // until per-entry version matching exists.
    let parsed = VersionReq::parse(version_constraint);
    let _requirement =
        parsed.with_context(|| format!("Invalid version constraint '{}'", version_constraint))?;

    let notice = [
        "Version-based purge not yet implemented",
        "Entries are tagged with extraction_version in the cache, but version constraint matching is not yet available",
    ];
    for line in notice.iter() {
        println!("{}", line);
    }
    Ok(())
}
/// Count the cache entries stored under `cache_dir`.
///
/// Descends through `cache_dir/<xx>/<yy>/<dir>/`, where `<xx>` and `<yy>` are
/// directories named with exactly two hex digits, and counts each regular
/// file whose name yields a size via `layout::parse_size_from_filename`.
/// Directory entries that cannot be read are skipped.
///
/// # Errors
///
/// Returns an error if `cache_dir` or any directory visited during the walk
/// cannot be opened.
fn count_entries(cache_dir: &Path) -> Result<u64> {
    // Shard directories are directories named with exactly two hex digits.
    let is_shard = |candidate: &fs::DirEntry| -> bool {
        let name = candidate.file_name();
        let name = name.to_string_lossy();
        candidate.path().is_dir()
            && name.len() == 2
            && name.chars().all(|c| c.is_ascii_hexdigit())
    };

    let mut total: u64 = 0;

    for level1 in fs::read_dir(cache_dir)? {
        let level1 = match level1 {
            Ok(dir) if is_shard(&dir) => dir,
            _ => continue,
        };

        for level2 in level1.path().read_dir()? {
            let level2 = match level2 {
                Ok(dir) if is_shard(&dir) => dir,
                _ => continue,
            };

            for fingerprint in level2.path().read_dir()? {
                let fingerprint_dir = match fingerprint {
                    Ok(dir) => dir.path(),
                    Err(_) => continue,
                };
                if !fingerprint_dir.is_dir() {
                    continue;
                }

                for file in fingerprint_dir.read_dir()? {
                    let file_path = match file {
                        Ok(found) => found.path(),
                        Err(_) => continue,
                    };
                    if !file_path.is_file() {
                        continue;
                    }

                    // A file is an entry only if its name encodes a size.
                    let encoded_size = file_path
                        .file_name()
                        .and_then(|name| name.to_str())
                        .and_then(|name| layout::parse_size_from_filename(name));
                    if encoded_size.is_some() {
                        total += 1;
                    }
                }
            }
        }
    }

    Ok(total)
}
/// Ask the user a yes/no question on the terminal.
///
/// Prints `prompt` followed by ` [y/N] ` and reads one line from stdin.
/// Returns `true` only when the trimmed, lowercased answer is `y` or `yes`;
/// anything else, including an empty line, is a "no".
///
/// # Errors
///
/// Fails when stdin is not a TTY (there is nobody to ask), or when flushing
/// stdout or reading stdin fails.
fn prompt_confirmation(prompt: &str) -> Result<bool> {
    // Without an interactive stdin the caller must pass -y instead.
    let interactive = atty::is(atty::Stream::Stdin);
    if !interactive {
        bail!("Cannot confirm without -y flag (not a TTY)");
    }

    // The prompt has no trailing newline, so flush to make it visible
    // before blocking on input.
    print!("{} [y/N] ", prompt);
    io::stdout().flush()?;

    let mut line = String::new();
    io::stdin().read_line(&mut line)?;

    let answer = line.trim().to_lowercase();
    Ok(matches!(answer.as_str(), "y" | "yes"))
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Fingerprint directory name used by the fixtures (sharded under `e7/a1`).
    const FINGERPRINT: &str = "e7a1f3deadbeef00000000000000000000000000000000000000000000000000";
    /// Options-hash prefix used in fixture entry file names.
    const OPTIONS_HASH: &str = "9b21c0ffee000000000000000000000000000000000000000000000000000000";

    /// Writes a fake cache entry of `size` bytes whose file name encodes `size`.
    fn write_entry(cache_dir: &Path, size: usize) {
        let fp_dir = cache_dir.join("e7").join("a1").join(FINGERPRINT);
        fs::create_dir_all(&fp_dir).unwrap();
        let entry_path = fp_dir.join(format!("{}-{}.json.zst", OPTIONS_HASH, size));
        fs::write(&entry_path, b"x".repeat(size)).unwrap();
    }

    #[test]
    fn test_age_histogram() {
        // One sample just past each bucket's lower bound.
        let samples = [
            100,           // < 1h
            3600 + 100,    // < 1d
            86400 + 100,   // < 7d
            604800 + 100,  // < 30d
            2592000 + 100, // > 30d
        ];
        let mut h = AgeHistogram::default();
        for &age in samples.iter() {
            h.record(age);
        }

        assert_eq!(h.less_than_1h, 1);
        assert_eq!(h.less_than_1d, 1);
        assert_eq!(h.less_than_7d, 1);
        assert_eq!(h.less_than_30d, 1);
        assert_eq!(h.greater_than_30d, 1);
        assert_eq!(h.total(), 5);
        // Each should be 20%
        assert!((h.percentage(1) - 20.0).abs() < 0.01);
    }

    #[test]
    fn test_age_histogram_percentage() {
        let mut h = AgeHistogram::default();
        for &age in [100, 200].iter() {
            h.record(age);
        }

        assert_eq!(h.total(), 2);
        assert!((h.percentage(h.less_than_1h) - 100.0).abs() < 0.01);
        assert_eq!(h.percentage(h.less_than_1d), 0.0);
    }

    #[test]
    fn test_compute_stats_empty() {
        let temp_dir = TempDir::new().unwrap();
        let cache_dir = temp_dir.path();
        // Only an index exists; there are no entry files.
        layout::save_index(cache_dir, &CacheIndex::default()).unwrap();

        let stats = compute_stats(cache_dir).unwrap();

        assert_eq!(stats.entry_count, 0);
        assert_eq!(stats.total_compressed_bytes, 0);
        assert_eq!(stats.total_uncompressed_bytes, 0);
        assert!(stats.oldest_entry_age_seconds.is_none());
        assert!(stats.newest_entry_age_seconds.is_none());
    }

    #[test]
    fn test_compute_stats_with_entries() {
        let temp_dir = TempDir::new().unwrap();
        let cache_dir = temp_dir.path();
        write_entry(cache_dir, 1000);

        let stats = compute_stats(cache_dir).unwrap();

        assert_eq!(stats.entry_count, 1);
        assert_eq!(stats.total_compressed_bytes, 1000);
        assert!(stats.oldest_entry_age_seconds.is_some());
        assert!(stats.newest_entry_age_seconds.is_some());
        assert_eq!(stats.age_histogram.total(), 1);
    }

    #[test]
    fn test_clear_cache_empty() {
        let temp_dir = TempDir::new().unwrap();
        let cache_dir = temp_dir.path();
        layout::save_index(cache_dir, &CacheIndex::default()).unwrap();

        clear_cache(cache_dir, true).unwrap();

        // Index should still exist but with 0 entries and reset hit stats
        let loaded = layout::load_index(cache_dir).unwrap().unwrap();
        assert_eq!(loaded.entry_count, 0);
        assert_eq!(loaded.hits, 0);
        assert_eq!(loaded.total_accesses, 0);
    }

    #[test]
    fn test_count_entries() {
        let temp_dir = TempDir::new().unwrap();
        let cache_dir = temp_dir.path();
        // Two entries sharing a fingerprint directory but differing in size.
        write_entry(cache_dir, 1000);
        write_entry(cache_dir, 2000);

        assert_eq!(count_entries(cache_dir).unwrap(), 2);
    }

    #[test]
    fn test_count_entries_empty() {
        let temp_dir = TempDir::new().unwrap();

        assert_eq!(count_entries(temp_dir.path()).unwrap(), 0);
    }
}