pdftract/crates/pdftract-cli/Cargo.toml
jedarden e2c1e2817b feat(pdftract-2i6rt): implement cache CLI subcommand and HTTP integration
This commit implements Phase 6.9.6: surfacing the cache as user-visible
CLI and HTTP affordances.

## Changes

- Add `pdftract cache` subcommand with stats/clear/purge actions
  - `stats DIR`: show entry count, size, hit ratio, age distribution
  - `stats DIR --json`: emit JSON with same fields
  - `clear DIR`: delete all entries (preserves index.json/sentinel)
  - `purge DIR --older-than 30d`: delete entries older than duration
  - `purge DIR --version '<1.0.0'`: version constraint purge (stub)

- Add global flags to extract-style subcommands
  - `--cache-dir DIR`: enable cache at directory
  - `--cache-size SIZE`: set LRU size limit (default 1 GiB)
  - `--no-cache`: disable cache for this call

- Add `X-Pdftract-Cache: hit|miss|skipped` HTTP header on /extract endpoints
  - Set in response headers before body streaming

- Add JSON metadata fields
  - `metadata.cache_status`: "hit" | "miss" | "skipped"
  - `metadata.cache_age_seconds`: integer seconds (present only on hit)

## Acceptance Criteria

-  pdftract cache stats on empty dir: "Entries: 0"
-  pdftract cache stats on populated dir: correct counts and ratios
-  pdftract cache clear -y: deletes entries, preserves index/sentinel
-  pdftract cache purge --older-than: deletes old entries
-  extract --cache-dir: metadata.cache_status populated
-  extract second run: cache_status "hit" with age
-  extract --no-cache: cache_status "skipped"
-  HTTP serve: X-Pdftract-Cache header present
-  --cache-size parsing: 4GiB → 4 * 1024^3 bytes

## Modules

- crates/pdftract-cli/src/cache_cmd.rs: subcommand implementation
- crates/pdftract-cli/src/serve.rs: HTTP handler integration
- crates/pdftract-cli/src/main.rs: CLI flag definitions
- crates/pdftract-core/src/cache/mod.rs: extract_with_cache() integration
- crates/pdftract-core/src/extract.rs: cache_status metadata fields

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 06:33:43 -04:00

64 lines
1.7 KiB
TOML

[package]
name = "pdftract-cli"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
repository.workspace = true
publish = true
[[bin]]
name = "pdftract"
path = "src/main.rs"
test = true
[[bin]]
name = "generate_lzw_fixtures"
path = "../../tests/fixtures/generate_lzw_fixtures_main.rs"
[lib]
name = "pdftract_cli"
path = "src/lib.rs"
default-run = "pdftract"
[dependencies]
anyhow = { workspace = true }
atty = "0.2"
async-stream = "0.3"
axum = { version = "0.7", features = ["json", "multipart"] }
bytes = "1"
chrono = { version = "0.4", features = ["serde"] }
clap = { version = "4.5", features = ["derive"] }
hyper = { version = "1.0", features = ["full"] }
hyper-util = { version = "0.1", features = ["full"] }
http-body-util = "0.1"
humantime = "2.1"
lzw = { workspace = true }
multer = "3"
pdftract-core = { path = "../pdftract-core" }
regex = "1.10"
secrecy = { workspace = true }
semver = "1.0"
serde = { workspace = true, features = ["derive"] }
subtle = "2.6"
sha2 = "0.10"
serde_json = "1.0"
schemars = { version = "0.8", features = ["derive"] }
tempfile = "3"
tera = "1"
tokio = { version = "1", features = ["full"] }
tokio-stream = "0.1"
tower = { version = "0.5", features = ["full"] }
tower-http = { version = "0.5", features = ["cors", "trace", "limit", "compression-full"] }
tracing = { workspace = true }
uuid = { version = "1.0", features = ["v4", "serde"] }
walkdir = "2"
[target.'cfg(unix)'.dependencies]
libc = "0.2"
[dev-dependencies]
jsonschema = "0.18"
reqwest = { version = "0.12", features = ["blocking", "json", "rustls-tls"], default-features = false }
schemars = { version = "0.8", features = ["derive"] }