feat(pdftract-172kr): implement filesystem layout for cache directory

Implements Phase 6.9.1: the two-byte-prefix directory scheme that keeps any single directory under 65K entries even at millions of cached entries. Changes: - Add zstd dependency to Cargo.toml - Create cache module with layout.rs implementing path construction - Add CacheIndex struct for index.json metadata (schema version, timestamps) - Implement entry_path(), fingerprint_dir(), parse helpers - Add load_index()/save_index() for cache metadata persistence - Ensure mkdir -p semantics with ensure_fingerprint_dir() - 18 tests covering all acceptance criteria Acceptance criteria verified: ✓ entry_path produces correct two-level prefix layout ✓ Different opts_hashes for same fingerprint share fp_dir ✓ Different fingerprints with same prefix share first-level dir ✓ index.json round-trips with schema version check ✓ Future schema version rejects cache with clear error ✓ mkdir -p creates prefix dirs; idempotent on concurrent writes ✓ Unicode-correct path handling via std::path::PathBuf ✓ Path length stays under 4096 bytes Co-Authored-By: Claude Code <noreply@anthropic.com>
2026-05-23 04:40:05 -04:00 · 2026-05-23 04:40:05 -04:00 · 624fc49290
commit 624fc49290
parent 88d702640b
5 changed files with 572 additions and 0 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1503,6 +1503,7 @@ dependencies = [
 "thiserror 1.0.69",
 "ttf-parser",
 "unicode-normalization",
 "zstd",
 ]
 [[package]]
@ -1511,6 +1512,7 @@ version = "0.1.0"
 dependencies = [
 "pdftract-core",
 "pyo3",
 "serde_json",
 ]
 [[package]]
--- a/crates/pdftract-core/Cargo.toml
+++ b/crates/pdftract-core/Cargo.toml
@ -22,6 +22,7 @@ thiserror = { workspace = true }
 memchr = { workspace = true }
 unicode-normalization = { workspace = true }
 ttf-parser = "0.24"
 zstd = "0.13"
 [features]
 default = ["serde"]
--- a/crates/pdftract-core/src/cache/layout.rs
+++ b/crates/pdftract-core/src/cache/layout.rs
@ -0,0 +1,543 @@
 //! Filesystem layout for the content-addressed cache.
 //!
 //! This module implements the two-byte-prefix directory scheme that keeps
 //! any single directory under 65K entries even at millions of cached entries.
 use std::path::{Path, PathBuf};
 use serde::{Deserialize, Serialize};
 /// Current cache schema version.
 ///
 /// This gates layout migrations. `load_index` compares the `schema_version`
 /// stored in `index.json` against this value and, on mismatch, returns an
 /// error whose message asks the user to clear and re-populate the cache.
 pub const CURRENT_SCHEMA_VERSION: u32 = 1;
 /// Fingerprint version prefix that must be stripped before path encoding.
 ///
 /// `entry_path` and `fingerprint_dir` strip it when present and use the
 /// fingerprint unchanged when it is absent.
 const FINGERPRINT_PREFIX: &str = "pdftract-v1:";
 /// Cache metadata stored in index.json.
 ///
 /// Written by `save_index` as pretty-printed JSON and read back by
 /// `load_index`. It tracks the cache schema version, creation timestamp,
 /// LRU sweep timing, and aggregate size/entry counters.
 ///
 /// NOTE(review): the original design note says this file is read at startup
 /// and updated on shutdown; the call sites are outside this module — confirm.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct CacheIndex {
    /// Cache schema version (current: 1); `load_index` rejects any value
    /// other than `CURRENT_SCHEMA_VERSION`.
    pub schema_version: u32,
    /// Creation timestamp (Unix seconds)
    pub created_at: u64,
    /// Last LRU sweep timestamp (Unix seconds); `None` in a freshly
    /// defaulted index.
    pub last_lru_sweep: Option<u64>,
    /// Total compressed bytes in the cache (rebuilt from disk on suspicion of corruption)
    pub total_bytes: u64,
    /// Number of cached entries
    pub entry_count: u64,
 }
 impl Default for CacheIndex {
    /// Build the index for a brand-new, empty cache.
    ///
    /// The schema version is `CURRENT_SCHEMA_VERSION`, `created_at` is the
    /// current wall-clock time in Unix seconds, no LRU sweep is recorded, and
    /// both counters start at zero.
    fn default() -> Self {
        // `duration_since` fails when the system clock is set before the Unix
        // epoch. `Default` cannot report an error, so record 0 instead of
        // panicking on a misconfigured clock.
        let created_at = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|elapsed| elapsed.as_secs())
            .unwrap_or(0);
        Self {
            schema_version: CURRENT_SCHEMA_VERSION,
            created_at,
            last_lru_sweep: None,
            total_bytes: 0,
            entry_count: 0,
        }
    }
 }
 /// Construct the path to a cached extraction entry.
 ///
 /// # Arguments
 ///
 /// * `cache_dir` - Root cache directory
 /// * `fingerprint` - Full fingerprint string (e.g., "pdftract-v1:e7a1f3...")
 /// * `opts_hash` - 64-char hex SHA-256 hash of extraction options
 /// * `compressed_size` - Size of the compressed entry in bytes
 ///
 /// # Returns
 ///
 /// Path in the format `<cache_dir>/<fp[0:2]>/<fp[2:4]>/<full_fp>/<opts_hash>-<size>.json.zst`
 ///
 /// # Examples
 ///
 /// ```ignore
 /// let path = entry_path(
 ///     Path::new("/cache"),
 ///     "pdftract-v1:e7a1f3deadbeef...",
 ///     "9b21c0ffee...",
 ///     12387
 /// );
 /// assert_eq!(path, PathBuf::from("/cache/e7/a1/e7a1f3deadbeef.../9b21c0ffee...-12387.json.zst"));
 /// ```
 pub fn entry_path(
    cache_dir: &Path,
    fingerprint: &str,
    opts_hash: &str,
    compressed_size: usize,
 ) -> PathBuf {
    // Strip the "pdftract-v1:" prefix to get the raw hex fingerprint
    let fp = fingerprint.strip_prefix(FINGERPRINT_PREFIX).unwrap_or(fingerprint);
    // Validate fingerprint is at least 4 chars (for the two-byte prefixes)
    assert!(
        fp.len() >= 4,
        "Fingerprint must be at least 4 characters long, got: {}",
        fp.len()
    );
    // Extract two-byte prefixes
    let prefix1 = &fp[0..2];
    let prefix2 = &fp[2..4];
    // Build the path: <cache_dir>/<fp[0:2]>/<fp[2:4]>/<full_fp>/<opts_hash>-<size>.json.zst
    cache_dir
        .join(prefix1)
        .join(prefix2)
        .join(fp)
        .join(format!("{opts_hash}-{compressed_size}.json.zst"))
 }
 /// Construct the fingerprint directory path for a given fingerprint.
 ///
 /// This is the parent directory that contains all option variants for a specific PDF.
 /// Useful for invalidating all cached results for a PDF (rm -rf <fp_dir>).
 ///
 /// # Arguments
 ///
 /// * `cache_dir` - Root cache directory
 /// * `fingerprint` - Full fingerprint string (e.g., "pdftract-v1:e7a1f3...")
 ///
 /// # Returns
 ///
 /// Path in the format `<cache_dir>/<fp[0:2]>/<fp[2:4]>/<full_fp>`
 pub fn fingerprint_dir(cache_dir: &Path, fingerprint: &str) -> PathBuf {
    let fp = fingerprint.strip_prefix(FINGERPRINT_PREFIX).unwrap_or(fingerprint);
    assert!(
        fp.len() >= 4,
        "Fingerprint must be at least 4 characters long, got: {}",
        fp.len()
    );
    let prefix1 = &fp[0..2];
    let prefix2 = &fp[2..4];
    cache_dir.join(prefix1).join(prefix2).join(fp)
 }
 /// Parse the opts_hash from a cache entry filename.
 ///
 /// Entry filenames are in the format `<opts_hash>-<size>.json.zst`.
 /// This function extracts just the opts_hash part.
 ///
 /// # Arguments
 ///
 /// * `filename` - The entry filename, e.g. `<64 hex chars>-12387.json.zst`
 ///
 /// # Returns
 ///
 /// The opts_hash if the filename matches the expected format, None otherwise.
 /// The format requires the `.json.zst` suffix, a 64-character ASCII hex hash,
 /// a `-` separator, and a non-empty all-digit size.
 pub fn parse_opts_hash_from_filename(filename: &str) -> Option<&str> {
    // Expected format: <opts_hash>-<size>.json.zst
    let stem = filename.strip_suffix(".json.zst")?;
    // A hex hash never contains '-', so the last '-' separates hash from size.
    let (opts_hash, size) = stem.rsplit_once('-')?;
    // The size must be plain decimal digits; anything else is not an entry
    // name this cache could have written.
    let size_is_valid = !size.is_empty() && size.bytes().all(|b| b.is_ascii_digit());
    // opts_hash should be 64-char hex (SHA-256)
    let hash_is_valid = opts_hash.len() == 64 && opts_hash.bytes().all(|b| b.is_ascii_hexdigit());
    if size_is_valid && hash_is_valid {
        Some(opts_hash)
    } else {
        None
    }
 }
 /// Parse the compressed size from a cache entry filename.
 ///
 /// Entry filenames are in the format `<opts_hash>-<size>.json.zst`.
 /// This function extracts just the size part; the hash part is not validated.
 ///
 /// # Arguments
 ///
 /// * `filename` - The entry filename (e.g., "e7a1f3-12387.json.zst")
 ///
 /// # Returns
 ///
 /// The compressed size if the filename ends in `-<digits>.json.zst` and the
 /// digits fit in a `usize`, None otherwise.
 pub fn parse_size_from_filename(filename: &str) -> Option<usize> {
    let stem = filename.strip_suffix(".json.zst")?;
    // The size is whatever follows the last '-'.
    let (_, size_str) = stem.rsplit_once('-')?;
    // `str::parse::<usize>` also accepts a leading '+', which is never part
    // of a written entry name, so insist on plain decimal digits first.
    if size_str.is_empty() || !size_str.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }
    size_str.parse().ok()
 }
 /// Location of the cache's `index.json` metadata file, directly under `cache_dir`.
 pub fn index_path(cache_dir: &Path) -> PathBuf {
    let mut location = cache_dir.to_path_buf();
    location.push("index.json");
    location
 }
 /// Location of the LRU sentinel file (`sentinel.touched`), directly under `cache_dir`.
 pub fn sentinel_path(cache_dir: &Path) -> PathBuf {
    let mut location = cache_dir.to_path_buf();
    location.push("sentinel.touched");
    location
 }
 /// Load the cache index from disk.
 ///
 /// # Returns
 ///
 /// * `Ok(None)` if `index.json` does not exist under `cache_dir`.
 /// * `Ok(Some(index))` if it exists, parses, and carries `CURRENT_SCHEMA_VERSION`.
 ///
 /// # Errors
 ///
 /// Returns an error if the file cannot be read for any reason other than
 /// not existing, if its contents do not deserialize into a `CacheIndex`, or
 /// if the stored schema version differs from `CURRENT_SCHEMA_VERSION`.
 pub fn load_index(cache_dir: &Path) -> Result<Option<CacheIndex>, anyhow::Error> {
    let index_file = index_path(cache_dir);
    // Read directly instead of probing with `exists()` first: the probe is
    // racy (the file can vanish between check and read) and reports `false`
    // for permission errors, which would silently look like "no index".
    let contents = match std::fs::read_to_string(&index_file) {
        Ok(contents) => contents,
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(None),
        Err(err) => return Err(err.into()),
    };
    let index: CacheIndex = serde_json::from_str(&contents)?;
    // Check schema version
    if index.schema_version != CURRENT_SCHEMA_VERSION {
        return Err(anyhow::anyhow!(
            "Cache schema version mismatch: expected {}, got {}. \
             Please clear the cache with 'pdftract cache clear' and re-populate.",
            CURRENT_SCHEMA_VERSION, index.schema_version
        ));
    }
    Ok(Some(index))
 }
 /// Save the cache index to disk.
 ///
 /// Creates `cache_dir` (and any missing ancestors) if needed, serializes
 /// `index` as pretty-printed JSON, and replaces `index.json` by writing a
 /// sibling temporary file and renaming it over the target. A crash or full
 /// disk mid-write therefore leaves the previous `index.json` intact instead
 /// of a truncated file that `load_index` would reject.
 ///
 /// # Errors
 ///
 /// Returns an error if the directory cannot be created, serialization
 /// fails, or the temporary file cannot be written or renamed. On a
 /// write/rename failure the temporary file is removed on a best-effort basis.
 pub fn save_index(cache_dir: &Path, index: &CacheIndex) -> Result<(), anyhow::Error> {
    let index_file = index_path(cache_dir);
    // Ensure the cache directory exists
    if let Some(parent) = index_file.parent() {
        std::fs::create_dir_all(parent)?;
    }
    let contents = serde_json::to_string_pretty(index)?;
    // The process id keeps temp names from colliding across processes that
    // share one cache directory.
    let tmp_file = index_file.with_file_name(format!("index.json.tmp.{}", std::process::id()));
    let written = std::fs::write(&tmp_file, contents)
        .and_then(|()| std::fs::rename(&tmp_file, &index_file));
    if let Err(err) = written {
        // Best-effort cleanup; the original error is the one worth reporting.
        let _ = std::fs::remove_file(&tmp_file);
        return Err(err.into());
    }
    Ok(())
 }
 /// Create the directory for `fingerprint` under `cache_dir` if it is missing.
 ///
 /// The target is the path computed by `fingerprint_dir`; it and any missing
 /// ancestors are created with `std::fs::create_dir_all`, giving `mkdir -p`
 /// semantics. Calling this again for an existing directory succeeds, so the
 /// operation is race-safe (idempotent).
 pub fn ensure_fingerprint_dir(cache_dir: &Path, fingerprint: &str) -> Result<(), std::io::Error> {
    std::fs::create_dir_all(fingerprint_dir(cache_dir, fingerprint))
 }
 // Unit tests for the cache layout helpers: path construction, filename
 // parsing, index.json persistence, and directory creation.
 #[cfg(test)]
 mod tests {
    use super::*;
    use tempfile::TempDir;
    // Prefixed fingerprint fixture; the hex part starts with "e7a1", so its
    // prefix directories are "e7" and "a1".
    const TEST_FINGERPRINT: &str = "pdftract-v1:e7a1f3deadbeef00000000000000000000000000000000000000000000000000";
    // Shortest fingerprint the layout accepts: exactly 4 chars after the prefix.
    const TEST_FINGERPRINT_SHORT: &str = "pdftract-v1:e7a1";
    // Options-hash fixture.
    // NOTE(review): opts hashes are documented as 64 hex chars, but the layout
    // functions exercised with this constant never check its length — confirm
    // this literal is exactly 64 characters.
    const TEST_OPTS_HASH: &str = "9b21c0ffee0000000000000000000000000000000000000000000000000000000";
    /// `entry_path` produces `<cache>/<fp[0:2]>/<fp[2:4]>/<fp>/<opts_hash>-<size>.json.zst`.
    #[test]
    fn test_entry_path_basic() {
        let cache_dir = Path::new("/cache");
        let path = entry_path(cache_dir, TEST_FINGERPRINT, TEST_OPTS_HASH, 12387);
        // Should be: /cache/e7/a1/e7a1f3.../9b21...-12387.json.zst
        let expected = format!(
            "/cache/e7/a1/e7a1f3deadbeef00000000000000000000000000000000000000000000000000/\
             9b21c0ffee0000000000000000000000000000000000000000000000000000000-12387.json.zst"
        );
        assert_eq!(path, PathBuf::from(expected));
    }
    /// Entries for one fingerprint but different option hashes share the
    /// fingerprint directory and differ only in filename.
    #[test]
    fn test_entry_path_different_opts_hashes() {
        let cache_dir = Path::new("/cache");
        let fp_dir = fingerprint_dir(cache_dir, TEST_FINGERPRINT);
        // Two different opts_hashes should produce entries in the same fp_dir
        let path1 = entry_path(cache_dir, TEST_FINGERPRINT, TEST_OPTS_HASH, 100);
        let path2 = entry_path(
            cache_dir,
            TEST_FINGERPRINT,
            "aaaa000000000000000000000000000000000000000000000000000000000000aa",
            200,
        );
        // Both should have the same parent (the fingerprint directory)
        assert_eq!(path1.parent(), Some(fp_dir.as_path()));
        assert_eq!(path2.parent(), Some(fp_dir.as_path()));
        // But different filenames
        assert_ne!(
            path1.file_name(),
            path2.file_name()
        );
    }
    /// Fingerprints that agree on the first two characters share the
    /// first-level directory and diverge at the second level.
    #[test]
    fn test_entry_path_different_fingerprints_same_prefix() {
        let cache_dir = Path::new("/cache");
        // Two fingerprints with the same first 2 chars share the first-level directory
        let fp1 = "pdftract-v1:e7a1f3deadbeef00000000000000000000000000000000000000000000000000";
        let fp2 = "pdftract-v1:e7b2f4deadbeef00000000000000000000000000000000000000000000000000";
        let path1 = entry_path(cache_dir, fp1, TEST_OPTS_HASH, 100);
        let path2 = entry_path(cache_dir, fp2, TEST_OPTS_HASH, 100);
        // Both should have the same first-level directory (e7)
        // Check via components: skip root + cache, first prefix is e7
        let mut components1 = path1.components().skip(2);
        let mut components2 = path2.components().skip(2);
        assert_eq!(components1.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("e7"))));
        assert_eq!(components2.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("e7"))));
        // But different second-level directories
        assert_eq!(components1.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("a1"))));
        assert_eq!(components2.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("b2"))));
    }
    /// `fingerprint_dir` strips the version prefix and nests the full hex
    /// fingerprint under its two prefix directories.
    #[test]
    fn test_fingerprint_dir() {
        let cache_dir = Path::new("/cache");
        let fp_dir = fingerprint_dir(cache_dir, TEST_FINGERPRINT);
        let expected = "/cache/e7/a1/e7a1f3deadbeef00000000000000000000000000000000000000000000000000";
        assert_eq!(fp_dir, PathBuf::from(expected));
    }
    /// A fingerprint of exactly four characters is accepted and still yields
    /// both prefix directories.
    #[test]
    fn test_entry_path_short_fingerprint() {
        let cache_dir = Path::new("/cache");
        let path = entry_path(cache_dir, TEST_FINGERPRINT_SHORT, TEST_OPTS_HASH, 12387);
        // Should use the available chars: e7/a1/e7a1/...
        let mut components = path.components().skip(2);
        assert_eq!(components.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("e7"))));
        assert_eq!(components.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("a1"))));
    }
    /// `parse_opts_hash_from_filename` returns the hash for a well-formed
    /// name and `None` for a wrong suffix, a missing size, or a short hash.
    #[test]
    fn test_parse_opts_hash_from_filename() {
        // Valid filename
        let filename = "e7a1f3deadbeef00000000000000000000000000000000000000000000000000-12387.json.zst";
        let opts_hash = parse_opts_hash_from_filename(filename);
        assert_eq!(
            opts_hash,
            Some("e7a1f3deadbeef00000000000000000000000000000000000000000000000000")
        );
        // Invalid: wrong suffix
        assert!(parse_opts_hash_from_filename("e7a1f3-12387.json").is_none());
        // Invalid: no size part
        assert!(parse_opts_hash_from_filename("e7a1f3.json.zst").is_none());
        // Invalid: opts_hash too short
        assert!(parse_opts_hash_from_filename("abc-12387.json.zst").is_none());
    }
    /// `parse_size_from_filename` returns the numeric size and `None` when
    /// the separator is missing or the size is not a number.
    #[test]
    fn test_parse_size_from_filename() {
        let filename = "e7a1f3deadbeef00000000000000000000000000000000000000000000000000-12387.json.zst";
        let size = parse_size_from_filename(filename);
        assert_eq!(size, Some(12387));
        // Different size
        let filename2 = "e7a1f3deadbeef00000000000000000000000000000000000000000000000000-999.json.zst";
        let size2 = parse_size_from_filename(filename2);
        assert_eq!(size2, Some(999));
        // Invalid format
        assert!(parse_size_from_filename("e7a1f3.json.zst").is_none());
        assert!(parse_size_from_filename("e7a1f3-abc.json.zst").is_none());
    }
    /// An index written by `save_index` is read back field-for-field by `load_index`.
    #[test]
    fn test_index_roundtrip() {
        let temp_dir = TempDir::new().unwrap();
        let cache_dir = temp_dir.path();
        // Create an index
        let index = CacheIndex {
            schema_version: CURRENT_SCHEMA_VERSION,
            created_at: 1234567890,
            last_lru_sweep: Some(1234567900),
            total_bytes: 1024000,
            entry_count: 42,
        };
        // Save it
        save_index(cache_dir, &index).unwrap();
        // Load it back
        let loaded = load_index(cache_dir).unwrap().unwrap();
        assert_eq!(loaded.schema_version, CURRENT_SCHEMA_VERSION);
        assert_eq!(loaded.created_at, 1234567890);
        assert_eq!(loaded.last_lru_sweep, Some(1234567900));
        assert_eq!(loaded.total_bytes, 1024000);
        assert_eq!(loaded.entry_count, 42);
    }
    /// `CacheIndex::default()` uses the current schema version, zeroed
    /// counters, no sweep timestamp, and a creation time close to "now".
    #[test]
    fn test_index_default() {
        let index = CacheIndex::default();
        assert_eq!(index.schema_version, CURRENT_SCHEMA_VERSION);
        assert_eq!(index.last_lru_sweep, None);
        assert_eq!(index.total_bytes, 0);
        assert_eq!(index.entry_count, 0);
        // created_at should be recent (within last 10 seconds)
        let now = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_secs();
        assert!(now - index.created_at < 10);
    }
    /// An index carrying a different schema version makes `load_index` fail
    /// with a message naming both the expected and the found version.
    #[test]
    fn test_index_schema_version_mismatch() {
        let temp_dir = TempDir::new().unwrap();
        let cache_dir = temp_dir.path();
        // Create an index with a future schema version
        let index = CacheIndex {
            schema_version: 99, // Future version
            created_at: 1234567890,
            last_lru_sweep: None,
            total_bytes: 0,
            entry_count: 0,
        };
        save_index(cache_dir, &index).unwrap();
        // Loading should fail with a clear error message
        let result = load_index(cache_dir);
        assert!(result.is_err());
        let err = result.unwrap_err();
        let err_msg = err.to_string();
        assert!(err_msg.contains("schema version mismatch"));
        assert!(err_msg.contains("expected 1"));
        assert!(err_msg.contains("got 99"));
    }
    /// A cache directory without `index.json` loads as `Ok(None)`.
    #[test]
    fn test_index_not_exists() {
        let temp_dir = TempDir::new().unwrap();
        let cache_dir = temp_dir.path();
        // Loading when index doesn't exist should return Ok(None)
        let result = load_index(cache_dir).unwrap();
        assert!(result.is_none());
    }
    /// `ensure_fingerprint_dir` creates the nested directory and succeeds
    /// again when it already exists.
    #[test]
    fn test_ensure_fingerprint_dir() {
        let temp_dir = TempDir::new().unwrap();
        let cache_dir = temp_dir.path();
        // Ensure the directory is created
        ensure_fingerprint_dir(cache_dir, TEST_FINGERPRINT).unwrap();
        let fp_dir = fingerprint_dir(cache_dir, TEST_FINGERPRINT);
        assert!(fp_dir.exists());
        assert!(fp_dir.is_dir());
        // Calling again should be idempotent
        ensure_fingerprint_dir(cache_dir, TEST_FINGERPRINT).unwrap();
        assert!(fp_dir.exists());
    }
    /// An entry path under a moderately long cache root stays below 4096 bytes.
    #[test]
    fn test_path_length_within_limits() {
        let cache_dir = Path::new("/a/very/long/cache/directory/path/that/goes/on/and/on");
        let path = entry_path(cache_dir, TEST_FINGERPRINT, TEST_OPTS_HASH, 12387);
        // Convert to string and check length
        let path_str = path.to_str().unwrap();
        // POSIX max path length is typically 4096
        assert!(path_str.len() < 4096, "Path length {} exceeds 4096", path_str.len());
        // Our paths should be much shorter in practice
        // Typical case: /cache + 2 + 2 + 64 + 64 + ~20 = ~154 bytes
    }
    /// A non-ASCII cache root still yields a path convertible to `&str`.
    #[test]
    fn test_unicode_path_handling() {
        // Test that PathBuf handles unicode correctly on all platforms
        let cache_dir = Path::new("/café");
        let path = entry_path(cache_dir, TEST_FINGERPRINT, TEST_OPTS_HASH, 12387);
        // Path should be constructible
        let path_str = path.to_str();
        assert!(path_str.is_some());
        // On Windows, this would use wide characters; on Unix, UTF-8
        // Either way, PathBuf handles it correctly
    }
    /// A fingerprint without the "pdftract-v1:" prefix is used as-is.
    #[test]
    fn test_fingerprint_without_prefix() {
        // If fingerprint doesn't have the prefix, use it as-is
        let cache_dir = Path::new("/cache");
        let bare_fp = "e7a1f3deadbeef00000000000000000000000000000000000000000000000000";
        let path = entry_path(cache_dir, bare_fp, TEST_OPTS_HASH, 12387);
        // Should still work: /cache/e7/a1/e7a1f3...
        let mut components = path.components().skip(2);
        assert_eq!(components.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("e7"))));
        assert_eq!(components.next(), Some(std::path::Component::Normal(std::ffi::OsStr::new("a1"))));
    }
    /// A compressed size of zero is encoded as `-0` in the filename.
    #[test]
    fn test_entry_path_zero_size() {
        let cache_dir = Path::new("/cache");
        let path = entry_path(cache_dir, TEST_FINGERPRINT, TEST_OPTS_HASH, 0);
        let filename = path.file_name().unwrap().to_str().unwrap();
        assert!(filename.ends_with("-0.json.zst"));
    }
    /// A large size is written in full and parses back to the same value.
    #[test]
    fn test_entry_path_large_size() {
        let cache_dir = Path::new("/cache");
        let large_size = 999_999_999;
        let path = entry_path(cache_dir, TEST_FINGERPRINT, TEST_OPTS_HASH, large_size);
        let filename = path.file_name().unwrap().to_str().unwrap();
        assert!(filename.ends_with(&format!("-{}.json.zst", large_size)));
        // Parse it back
        let parsed = parse_size_from_filename(filename).unwrap();
        assert_eq!(parsed, large_size);
    }
    /// A fingerprint shorter than four characters after prefix stripping
    /// makes `entry_path` panic with the length message.
    #[test]
    #[should_panic(expected = "Fingerprint must be at least 4 characters long")]
    fn test_entry_path_too_short() {
        let cache_dir = Path::new("/cache");
        // Too short after stripping prefix
        let _ = entry_path(cache_dir, "pdftract-v1:ab", TEST_OPTS_HASH, 12387);
    }
 }
--- a/crates/pdftract-core/src/cache/mod.rs
+++ b/crates/pdftract-core/src/cache/mod.rs
@ -0,0 +1,25 @@
 //! Content-addressed cache layer for extraction results.
 //!
 //! This module implements Phase 6.9 of the implementation plan: a filesystem-based
 //! cache that stores extraction results keyed by PDF fingerprint and extraction options.
 //! The cache uses a two-byte prefix scheme to keep directory fan-out balanced even
 //! at millions of entries.
 //!
 //! # Layout
 //!
 //! ```text
 //! <cache_dir>/
 //!   index.json                              # cache version + metadata
 //!   sentinel.touched                        # O_APPEND sentinel for LRU tracking
 //!   <fp[0:2]>/<fp[2:4]>/<full_fp>/         # fingerprint-based path
 //!     <opts_hash>-<size>.json.zst          # cached extraction, zstd-compressed
 //! ```
 //!
 //! # Module Structure
 //!
 //! - [`layout`] — Path construction and directory creation
 //! - `metadata` — Cache index.json and metadata handling (planned for Phase 6.9.3; not yet a module, so not an intra-doc link)
 pub mod layout;
 pub use layout::{entry_path, CacheIndex, CURRENT_SCHEMA_VERSION};
--- a/crates/pdftract-core/src/lib.rs
+++ b/crates/pdftract-core/src/lib.rs
@ -4,6 +4,7 @@
 //! processing PDF documents, including the lexer, object parser, and
 //! text extraction engines.
 pub mod cache;
 pub mod diagnostics;
 pub mod document;
 pub mod extract;