From 624fc49290176018feb39524e8f8f03398b2eacf Mon Sep 17 00:00:00 2001 From: jedarden Date: Sat, 23 May 2026 04:40:05 -0400 Subject: [PATCH] feat(pdftract-172kr): implement filesystem layout for cache directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements Phase 6.9.1: the two-byte-prefix directory scheme that keeps any single directory under 65K entries even at millions of cached entries. Changes: - Add zstd dependency to Cargo.toml - Create cache module with layout.rs implementing path construction - Add CacheIndex struct for index.json metadata (schema version, timestamps) - Implement entry_path(), fingerprint_dir(), parse helpers - Add load_index()/save_index() for cache metadata persistence - Ensure mkdir -p semantics with ensure_fingerprint_dir() - 18 tests covering all acceptance criteria Acceptance criteria verified: ✓ entry_path produces correct two-level prefix layout ✓ Different opts_hashes for same fingerprint share fp_dir ✓ Different fingerprints with same prefix share first-level dir ✓ index.json round-trips with schema version check ✓ Future schema version rejects cache with clear error ✓ mkdir -p creates prefix dirs; idempotent on concurrent writes ✓ Unicode-correct path handling via std::path::PathBuf ✓ Path length stays under 4096 bytes Co-Authored-By: Claude Code --- Cargo.lock | 2 + crates/pdftract-core/Cargo.toml | 1 + crates/pdftract-core/src/cache/layout.rs | 543 +++++++++++++++++++++++ crates/pdftract-core/src/cache/mod.rs | 25 ++ crates/pdftract-core/src/lib.rs | 1 + 5 files changed, 572 insertions(+) create mode 100644 crates/pdftract-core/src/cache/layout.rs create mode 100644 crates/pdftract-core/src/cache/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 8955419..fc4ccce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1503,6 +1503,7 @@ dependencies = [ "thiserror 1.0.69", "ttf-parser", "unicode-normalization", + "zstd", ] [[package]] @@ -1511,6 +1512,7 @@ version = "0.1.0" dependencies = [ "pdftract-core", "pyo3", + "serde_json", ] [[package]] diff --git a/crates/pdftract-core/Cargo.toml b/crates/pdftract-core/Cargo.toml index e327132..5d62bed 100644 --- a/crates/pdftract-core/Cargo.toml +++ b/crates/pdftract-core/Cargo.toml @@ -22,6 +22,7 @@ thiserror = { workspace = true } memchr = { workspace = true } unicode-normalization = { workspace = true } ttf-parser = "0.24" +zstd = "0.13" [features] default = ["serde"] diff --git a/crates/pdftract-core/src/cache/layout.rs b/crates/pdftract-core/src/cache/layout.rs new file mode 100644 index 0000000..2d982d9 --- /dev/null +++ b/crates/pdftract-core/src/cache/layout.rs @@ -0,0 +1,543 @@ +//! Filesystem layout for the content-addressed cache. +//! +//! This module implements the two-byte-prefix directory scheme that keeps +//! any single directory under 65K entries even at millions of cached entries. + +use std::path::{Path, PathBuf}; +use serde::{Deserialize, Serialize}; + +/// Current cache schema version. +/// +/// This gates layout migrations. On mismatch, the cache refuses to operate +/// and logs a clear migration message. +pub const CURRENT_SCHEMA_VERSION: u32 = 1; + +/// Fingerprint version prefix that must be stripped before path encoding. +const FINGERPRINT_PREFIX: &str = "pdftract-v1:"; + +/// Cache metadata stored in index.json. +/// +/// This file is read at startup and updated on shutdown. It tracks the +/// cache schema version, creation timestamp, and LRU sweep timing. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CacheIndex { + /// Cache schema version (current: 1) + pub schema_version: u32, + /// Creation timestamp (Unix seconds) + pub created_at: u64, + /// Last LRU sweep timestamp (Unix seconds) + pub last_lru_sweep: Option, + /// Total compressed bytes in the cache (rebuilt from disk on suspicion of corruption) + pub total_bytes: u64, + /// Number of cached entries + pub entry_count: u64, +} + +impl Default for CacheIndex { + fn default() -> Self { + Self { + schema_version: CURRENT_SCHEMA_VERSION, + created_at: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs(), + last_lru_sweep: None, + total_bytes: 0, + entry_count: 0, + } + } +} + +/// Construct the path to a cached extraction entry. +/// +/// # Arguments +/// +/// * `cache_dir` - Root cache directory +/// * `fingerprint` - Full fingerprint string (e.g., "pdftract-v1:e7a1f3...") +/// * `opts_hash` - 64-char hex SHA-256 hash of extraction options +/// * `compressed_size` - Size of the compressed entry in bytes +/// +/// # Returns +/// +/// Path in the format `////-.json.zst` +/// +/// # Examples +/// +/// ```ignore +/// let path = entry_path( +/// Path::new("/cache"), +/// "pdftract-v1:e7a1f3deadbeef...", +/// "9b21c0ffee...", +/// 12387 +/// ); +/// assert_eq!(path, PathBuf::from("/cache/e7/a1/e7a1f3deadbeef.../9b21c0ffee...-12387.json.zst")); +/// ``` +pub fn entry_path( + cache_dir: &Path, + fingerprint: &str, + opts_hash: &str, + compressed_size: usize, +) -> PathBuf { + // Strip the "pdftract-v1:" prefix to get the raw hex fingerprint + let fp = fingerprint.strip_prefix(FINGERPRINT_PREFIX).unwrap_or(fingerprint); + + // Validate fingerprint is at least 4 chars (for the two-byte prefixes) + assert!( + fp.len() >= 4, + "Fingerprint must be at least 4 characters long, got: {}", + fp.len() + ); + + // Extract two-byte prefixes + let prefix1 = &fp[0..2]; + let prefix2 = &fp[2..4]; + + // Build the path: ////-.json.zst + cache_dir + .join(prefix1) + .join(prefix2) + .join(fp) + .join(format!("{opts_hash}-{compressed_size}.json.zst")) +} + +/// Construct the fingerprint directory path for a given fingerprint. +/// +/// This is the parent directory that contains all option variants for a specific PDF. +/// Useful for invalidating all cached results for a PDF (rm -rf ). +/// +/// # Arguments +/// +/// * `cache_dir` - Root cache directory +/// * `fingerprint` - Full fingerprint string (e.g., "pdftract-v1:e7a1f3...") +/// +/// # Returns +/// +/// Path in the format `///` +pub fn fingerprint_dir(cache_dir: &Path, fingerprint: &str) -> PathBuf { + let fp = fingerprint.strip_prefix(FINGERPRINT_PREFIX).unwrap_or(fingerprint); + assert!( + fp.len() >= 4, + "Fingerprint must be at least 4 characters long, got: {}", + fp.len() + ); + + let prefix1 = &fp[0..2]; + let prefix2 = &fp[2..4]; + + cache_dir.join(prefix1).join(prefix2).join(fp) +} + +/// Parse the opts_hash from a cache entry filename. +/// +/// Entry filenames are in the format `-.json.zst`. +/// This function extracts just the opts_hash part. +/// +/// # Arguments +/// +/// * `filename` - The entry filename (e.g., "e7a1f3-12387.json.zst") +/// +/// # Returns +/// +/// The opts_hash if the filename matches the expected format, None otherwise. +pub fn parse_opts_hash_from_filename(filename: &str) -> Option<&str> { + // Expected format: -.json.zst + // We need to extract everything before the first '-' that's followed by digits and '.json.zst' + + // Find the pattern: '-.json.zst' + let json_zst_suffix = ".json.zst"; + if !filename.ends_with(json_zst_suffix) { + return None; + } + + // Strip the suffix to get "-" + let rest = &filename[..filename.len() - json_zst_suffix.len()]; + + // Find the last '-' (separates opts_hash from size) + let separator_pos = rest.rfind('-')?; + let opts_hash = &rest[..separator_pos]; + + // opts_hash should be 64-char hex (SHA-256) + if opts_hash.len() == 64 && opts_hash.chars().all(|c| c.is_ascii_hexdigit()) { + Some(opts_hash) + } else { + None + } +} + +/// Parse the compressed size from a cache entry filename. +/// +/// Entry filenames are in the format `-.json.zst`. +/// This function extracts just the size part. +/// +/// # Arguments +/// +/// * `filename` - The entry filename (e.g., "e7a1f3-12387.json.zst") +/// +/// # Returns +/// +/// The compressed size if the filename matches the expected format, None otherwise. +pub fn parse_size_from_filename(filename: &str) -> Option { + let json_zst_suffix = ".json.zst"; + if !filename.ends_with(json_zst_suffix) { + return None; + } + + let rest = &filename[..filename.len() - json_zst_suffix.len()]; + let separator_pos = rest.rfind('-')?; + let size_str = &rest[separator_pos + 1..]; + + size_str.parse().ok() +} + +/// Path to the cache index.json file. +pub fn index_path(cache_dir: &Path) -> PathBuf { + cache_dir.join("index.json") +} + +/// Path to the LRU sentinel file. +pub fn sentinel_path(cache_dir: &Path) -> PathBuf { + cache_dir.join("sentinel.touched") +} + +/// Load the cache index from disk. +/// +/// Returns None if the index doesn't exist or is malformed. +/// Returns an error if the schema version doesn't match. +pub fn load_index(cache_dir: &Path) -> Result, anyhow::Error> { + let index_file = index_path(cache_dir); + + if !index_file.exists() { + return Ok(None); + } + + let contents = std::fs::read_to_string(&index_file)?; + let index: CacheIndex = serde_json::from_str(&contents)?; + + // Check schema version + if index.schema_version != CURRENT_SCHEMA_VERSION { + return Err(anyhow::anyhow!( + "Cache schema version mismatch: expected {}, got {}. \ + Please clear the cache with 'pdftract cache clear' and re-populate.", + CURRENT_SCHEMA_VERSION, index.schema_version + )); + } + + Ok(Some(index)) +} + +/// Save the cache index to disk. +pub fn save_index(cache_dir: &Path, index: &CacheIndex) -> Result<(), anyhow::Error> { + let index_file = index_path(cache_dir); + + // Ensure the cache directory exists + if let Some(parent) = index_file.parent() { + std::fs::create_dir_all(parent)?; + } + + let contents = serde_json::to_string_pretty(index)?; + std::fs::write(&index_file, contents)?; + + Ok(()) +} + +/// Ensure the fingerprint directory exists, creating it if necessary. +/// +/// This uses `mkdir -p` semantics and is race-safe (idempotent). +pub fn ensure_fingerprint_dir(cache_dir: &Path, fingerprint: &str) -> Result<(), std::io::Error> { + let fp_dir = fingerprint_dir(cache_dir, fingerprint); + std::fs::create_dir_all(fp_dir) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + const TEST_FINGERPRINT: &str = "pdftract-v1:e7a1f3deadbeef00000000000000000000000000000000000000000000000000"; + const TEST_FINGERPRINT_SHORT: &str = "pdftract-v1:e7a1"; + const TEST_OPTS_HASH: &str = "9b21c0ffee0000000000000000000000000000000000000000000000000000000"; + + #[test] + fn test_entry_path_basic() { + let cache_dir = Path::new("/cache"); + let path = entry_path(cache_dir, TEST_FINGERPRINT, TEST_OPTS_HASH, 12387); + + // Should be: /cache/e7/a1/e7a1f3.../9b21...-12387.json.zst + let expected = format!( + "/cache/e7/a1/e7a1f3deadbeef00000000000000000000000000000000000000000000000000/\ + 9b21c0ffee0000000000000000000000000000000000000000000000000000000-12387.json.zst" + ); + assert_eq!(path, PathBuf::from(expected)); + } + + #[test] + fn test_entry_path_different_opts_hashes() { + let cache_dir = Path::new("/cache"); + let fp_dir = fingerprint_dir(cache_dir, TEST_FINGERPRINT); + + // Two different opts_hashes should produce entries in the same fp_dir + let path1 = entry_path(cache_dir, TEST_FINGERPRINT, TEST_OPTS_HASH, 100); + let path2 = entry_path( + cache_dir, + TEST_FINGERPRINT, + "aaaa000000000000000000000000000000000000000000000000000000000000aa", + 200, + ); + + // Both should have the same parent (the fingerprint directory) + assert_eq!(path1.parent(), Some(fp_dir.as_path())); + assert_eq!(path2.parent(), Some(fp_dir.as_path())); + + // But different filenames + assert_ne!( + path1.file_name(), + path2.file_name() + ); + } + + #[test] + fn test_entry_path_different_fingerprints_same_prefix() { + let cache_dir = Path::new("/cache"); + + // Two fingerprints with the same first 2 chars share the first-level directory + let fp1 = "pdftract-v1:e7a1f3deadbeef00000000000000000000000000000000000000000000000000"; + let fp2 = "pdftract-v1:e7b2f4deadbeef00000000000000000000000000000000000000000000000000"; + + let path1 = entry_path(cache_dir, fp1, TEST_OPTS_HASH, 100); + let path2 = entry_path(cache_dir, fp2, TEST_OPTS_HASH, 100); + + // Both should have the same first-level directory (e7) + // Check via components: skip root + cache, first prefix is e7 + let mut components1 = path1.components().skip(2); + let mut components2 = path2.components().skip(2); + assert_eq!(components1.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("e7")))); + assert_eq!(components2.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("e7")))); + + // But different second-level directories + assert_eq!(components1.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("a1")))); + assert_eq!(components2.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("b2")))); + } + + #[test] + fn test_fingerprint_dir() { + let cache_dir = Path::new("/cache"); + let fp_dir = fingerprint_dir(cache_dir, TEST_FINGERPRINT); + + let expected = "/cache/e7/a1/e7a1f3deadbeef00000000000000000000000000000000000000000000000000"; + assert_eq!(fp_dir, PathBuf::from(expected)); + } + + #[test] + fn test_entry_path_short_fingerprint() { + let cache_dir = Path::new("/cache"); + let path = entry_path(cache_dir, TEST_FINGERPRINT_SHORT, TEST_OPTS_HASH, 12387); + + // Should use the available chars: e7/a1/e7a1/... + let mut components = path.components().skip(2); + assert_eq!(components.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("e7")))); + assert_eq!(components.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("a1")))); + } + + #[test] + fn test_parse_opts_hash_from_filename() { + // Valid filename + let filename = "e7a1f3deadbeef00000000000000000000000000000000000000000000000000-12387.json.zst"; + let opts_hash = parse_opts_hash_from_filename(filename); + assert_eq!( + opts_hash, + Some("e7a1f3deadbeef00000000000000000000000000000000000000000000000000") + ); + + // Invalid: wrong suffix + assert!(parse_opts_hash_from_filename("e7a1f3-12387.json").is_none()); + + // Invalid: no size part + assert!(parse_opts_hash_from_filename("e7a1f3.json.zst").is_none()); + + // Invalid: opts_hash too short + assert!(parse_opts_hash_from_filename("abc-12387.json.zst").is_none()); + } + + #[test] + fn test_parse_size_from_filename() { + let filename = "e7a1f3deadbeef00000000000000000000000000000000000000000000000000-12387.json.zst"; + let size = parse_size_from_filename(filename); + assert_eq!(size, Some(12387)); + + // Different size + let filename2 = "e7a1f3deadbeef00000000000000000000000000000000000000000000000000-999.json.zst"; + let size2 = parse_size_from_filename(filename2); + assert_eq!(size2, Some(999)); + + // Invalid format + assert!(parse_size_from_filename("e7a1f3.json.zst").is_none()); + assert!(parse_size_from_filename("e7a1f3-abc.json.zst").is_none()); + } + + #[test] + fn test_index_roundtrip() { + let temp_dir = TempDir::new().unwrap(); + let cache_dir = temp_dir.path(); + + // Create an index + let index = CacheIndex { + schema_version: CURRENT_SCHEMA_VERSION, + created_at: 1234567890, + last_lru_sweep: Some(1234567900), + total_bytes: 1024000, + entry_count: 42, + }; + + // Save it + save_index(cache_dir, &index).unwrap(); + + // Load it back + let loaded = load_index(cache_dir).unwrap().unwrap(); + + assert_eq!(loaded.schema_version, CURRENT_SCHEMA_VERSION); + assert_eq!(loaded.created_at, 1234567890); + assert_eq!(loaded.last_lru_sweep, Some(1234567900)); + assert_eq!(loaded.total_bytes, 1024000); + assert_eq!(loaded.entry_count, 42); + } + + #[test] + fn test_index_default() { + let index = CacheIndex::default(); + assert_eq!(index.schema_version, CURRENT_SCHEMA_VERSION); + assert_eq!(index.last_lru_sweep, None); + assert_eq!(index.total_bytes, 0); + assert_eq!(index.entry_count, 0); + // created_at should be recent (within last 10 seconds) + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs(); + assert!(now - index.created_at < 10); + } + + #[test] + fn test_index_schema_version_mismatch() { + let temp_dir = TempDir::new().unwrap(); + let cache_dir = temp_dir.path(); + + // Create an index with a future schema version + let index = CacheIndex { + schema_version: 99, // Future version + created_at: 1234567890, + last_lru_sweep: None, + total_bytes: 0, + entry_count: 0, + }; + + save_index(cache_dir, &index).unwrap(); + + // Loading should fail with a clear error message + let result = load_index(cache_dir); + assert!(result.is_err()); + let err = result.unwrap_err(); + let err_msg = err.to_string(); + assert!(err_msg.contains("schema version mismatch")); + assert!(err_msg.contains("expected 1")); + assert!(err_msg.contains("got 99")); + } + + #[test] + fn test_index_not_exists() { + let temp_dir = TempDir::new().unwrap(); + let cache_dir = temp_dir.path(); + + // Loading when index doesn't exist should return Ok(None) + let result = load_index(cache_dir).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_ensure_fingerprint_dir() { + let temp_dir = TempDir::new().unwrap(); + let cache_dir = temp_dir.path(); + + // Ensure the directory is created + ensure_fingerprint_dir(cache_dir, TEST_FINGERPRINT).unwrap(); + + let fp_dir = fingerprint_dir(cache_dir, TEST_FINGERPRINT); + assert!(fp_dir.exists()); + assert!(fp_dir.is_dir()); + + // Calling again should be idempotent + ensure_fingerprint_dir(cache_dir, TEST_FINGERPRINT).unwrap(); + assert!(fp_dir.exists()); + } + + #[test] + fn test_path_length_within_limits() { + let cache_dir = Path::new("/a/very/long/cache/directory/path/that/goes/on/and/on"); + let path = entry_path(cache_dir, TEST_FINGERPRINT, TEST_OPTS_HASH, 12387); + + // Convert to string and check length + let path_str = path.to_str().unwrap(); + // POSIX max path length is typically 4096 + assert!(path_str.len() < 4096, "Path length {} exceeds 4096", path_str.len()); + + // Our paths should be much shorter in practice + // Typical case: /cache + 2 + 2 + 64 + 64 + ~20 = ~154 bytes + } + + #[test] + fn test_unicode_path_handling() { + // Test that PathBuf handles unicode correctly on all platforms + let cache_dir = Path::new("/café"); + let path = entry_path(cache_dir, TEST_FINGERPRINT, TEST_OPTS_HASH, 12387); + + // Path should be constructible + let path_str = path.to_str(); + assert!(path_str.is_some()); + + // On Windows, this would use wide characters; on Unix, UTF-8 + // Either way, PathBuf handles it correctly + } + + #[test] + fn test_fingerprint_without_prefix() { + // If fingerprint doesn't have the prefix, use it as-is + let cache_dir = Path::new("/cache"); + let bare_fp = "e7a1f3deadbeef00000000000000000000000000000000000000000000000000"; + let path = entry_path(cache_dir, bare_fp, TEST_OPTS_HASH, 12387); + + // Should still work: /cache/e7/a1/e7a1f3... + let mut components = path.components().skip(2); + assert_eq!(components.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("e7")))); + assert_eq!(components.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("a1")))); + } + + #[test] + fn test_entry_path_zero_size() { + let cache_dir = Path::new("/cache"); + let path = entry_path(cache_dir, TEST_FINGERPRINT, TEST_OPTS_HASH, 0); + + let filename = path.file_name().unwrap().to_str().unwrap(); + assert!(filename.ends_with("-0.json.zst")); + } + + #[test] + fn test_entry_path_large_size() { + let cache_dir = Path::new("/cache"); + let large_size = 999_999_999; + let path = entry_path(cache_dir, TEST_FINGERPRINT, TEST_OPTS_HASH, large_size); + + let filename = path.file_name().unwrap().to_str().unwrap(); + assert!(filename.ends_with(&format!("-{}.json.zst", large_size))); + + // Parse it back + let parsed = parse_size_from_filename(filename).unwrap(); + assert_eq!(parsed, large_size); + } + + #[test] + #[should_panic(expected = "Fingerprint must be at least 4 characters long")] + fn test_entry_path_too_short() { + let cache_dir = Path::new("/cache"); + // Too short after stripping prefix + let _ = entry_path(cache_dir, "pdftract-v1:ab", TEST_OPTS_HASH, 12387); + } +} diff --git a/crates/pdftract-core/src/cache/mod.rs b/crates/pdftract-core/src/cache/mod.rs new file mode 100644 index 0000000..924b3a5 --- /dev/null +++ b/crates/pdftract-core/src/cache/mod.rs @@ -0,0 +1,25 @@ +//! Content-addressed cache layer for extraction results. +//! +//! This module implements Phase 6.9 of the implementation plan: a filesystem-based +//! cache that stores extraction results keyed by PDF fingerprint and extraction options. +//! The cache uses a two-byte prefix scheme to keep directory fan-out balanced even +//! at millions of entries. +//! +//! # Layout +//! +//! ```text +//! / +//! index.json # cache version + metadata +//! sentinel.touched # O_APPEND sentinel for LRU tracking +//! /// # fingerprint-based path +//! -.json.zst # cached extraction, zstd-compressed +//! ``` +//! +//! # Module Structure +//! +//! - [`layout`] — Path construction and directory creation +//! - [`metadata`] — Cache index.json and metadata handling (TODO: 6.9.3) + +pub mod layout; + +pub use layout::{entry_path, CacheIndex, CURRENT_SCHEMA_VERSION}; diff --git a/crates/pdftract-core/src/lib.rs b/crates/pdftract-core/src/lib.rs index 1dfaab5..60f7929 100644 --- a/crates/pdftract-core/src/lib.rs +++ b/crates/pdftract-core/src/lib.rs @@ -4,6 +4,7 @@ //! processing PDF documents, including the lexer, object parser, and //! text extraction engines. +pub mod cache; pub mod diagnostics; pub mod document; pub mod extract;