diff --git a/crates/pdftract-cli/src/inspect/args.rs b/crates/pdftract-cli/src/inspect/args.rs index 9bcdd5a..b5712ad 100644 --- a/crates/pdftract-cli/src/inspect/args.rs +++ b/crates/pdftract-cli/src/inspect/args.rs @@ -40,6 +40,10 @@ pub struct InspectArgs { /// When provided, the inspector shows side-by-side comparison. #[arg(long, value_name = "FILE")] pub compare: Option, + + /// Write per-request audit log to FILE (NDJSON; use "-" for stdout, "/dev/stderr" for stderr) + #[arg(long, value_name = "FILE")] + pub audit_log: Option, } impl InspectArgs { @@ -107,6 +111,7 @@ mod tests { auth_token: None, no_open: false, compare: None, + audit_log: None, }; assert!(args.validate().is_err()); } @@ -120,6 +125,7 @@ mod tests { auth_token: None, no_open: false, compare: None, + audit_log: None, }; assert!(args.validate().is_err()); } @@ -133,6 +139,7 @@ mod tests { auth_token: Some("secret".to_string()), no_open: false, compare: None, + audit_log: None, }; // This would succeed if the file exists // (we're not checking file existence in this unit test) @@ -147,6 +154,7 @@ mod tests { auth_token: None, no_open: false, compare: None, + audit_log: None, }; let addr = args.parse_bind().unwrap(); assert!(addr.is_loopback()); @@ -161,6 +169,7 @@ mod tests { auth_token: None, no_open: false, compare: None, + audit_log: None, }; assert_eq!(args.server_url(), "http://127.0.0.1:8080/"); } diff --git a/crates/pdftract-cli/src/inspect/inspect.rs b/crates/pdftract-cli/src/inspect/inspect.rs index e163dc6..3268fbf 100644 --- a/crates/pdftract-cli/src/inspect/inspect.rs +++ b/crates/pdftract-cli/src/inspect/inspect.rs @@ -4,8 +4,10 @@ //! axum server, and browser launcher. use super::args::InspectArgs; +use crate::middleware::{AuditState, audit_middleware}; use anyhow::{Context, Result}; use axum::{extract::State, response::Html, routing::get, Router}; +use pdftract_core::audit::AuditLogWriter; use pdftract_core::extract::{extract_pdf, result_to_json}; use pdftract_core::options::ExtractionOptions; use serde_json::Value as JsonValue; @@ -22,6 +24,8 @@ pub struct InspectorState { pub document_b: Option, /// Authentication token for non-loopback binds pub auth_token: Option, + /// Audit log state + pub audit: AuditState, } /// Run the inspector subcommand. @@ -62,15 +66,26 @@ pub async fn run(args: InspectArgs) -> Result<()> { None }; + // Create audit log writer if specified + let audit_writer = if let Some(ref path) = args.audit_log { + Some(AuditLogWriter::open(path).context(format!( + "Failed to open audit log: {}", + path.display() + ))?) + } else { + None + }; + // Step 4: Build inspector state let state = InspectorState { document_a, document_b, auth_token: args.auth_token.clone(), + audit: AuditState::new(audit_writer), }; - // Step 5: Build axum router - let app = create_router(state); + // Step 5: Build axum router with audit middleware + let app = create_router_with_audit(state); // Step 6: Start server let bind_addr = args.parse_bind()?; @@ -79,6 +94,9 @@ pub async fn run(args: InspectArgs) -> Result<()> { eprintln!("Inspector running at {}", server_url); eprintln!("Press Ctrl-C to stop"); + if let Some(ref path) = args.audit_log { + eprintln!("Audit log: {}", path.display()); + } // Spawn the server task let server_handle = tokio::spawn(async move { @@ -124,10 +142,15 @@ fn extract_document(path: &Path) -> Result { Ok(json) } -/// Create the axum router for the inspector. -fn create_router(state: InspectorState) -> Router { +/// Create the axum router for the inspector with audit middleware. +fn create_router_with_audit(state: InspectorState) -> Router { + let audit_state = state.audit.clone(); Router::new() .route("/", get(index_handler)) + .layer(axum::middleware::from_fn_with_state( + audit_state, + audit_middleware, + )) .with_state(Arc::new(Mutex::new(state))) } diff --git a/crates/pdftract-cli/src/lib.rs b/crates/pdftract-cli/src/lib.rs index 9247391..1d7e8df 100644 --- a/crates/pdftract-cli/src/lib.rs +++ b/crates/pdftract-cli/src/lib.rs @@ -5,6 +5,7 @@ pub mod grep; pub mod inspect; pub mod mcp; +pub mod middleware; // Re-export diagnostics for testing pub use pdftract_core::diagnostics::{DiagCode, DiagInfo, DIAGNOSTIC_CATALOG}; diff --git a/crates/pdftract-cli/src/main.rs b/crates/pdftract-cli/src/main.rs index d24a355..440f6f0 100644 --- a/crates/pdftract-cli/src/main.rs +++ b/crates/pdftract-cli/src/main.rs @@ -11,6 +11,7 @@ mod doctor; mod grep; mod inspect; mod mcp; +mod middleware; mod password; mod serve; mod verify_receipt; @@ -219,6 +220,10 @@ enum Commands { /// Maximum request body size in MB (default: 256) #[arg(long, default_value = "256")] max_upload_mb: usize, + + /// Write per-request audit log to FILE (NDJSON; use "-" for stdout) + #[arg(long, value_name = "FILE")] + audit_log: Option, }, /// Start the MCP (Model Context Protocol) server /// @@ -258,6 +263,10 @@ enum Commands { /// trust-the-caller mode (no path-check applied). #[arg(long, value_name = "DIR")] root: Option, + + /// Write per-request audit log to FILE (NDJSON; use "-" for stdout, "/dev/stderr" for stderr) + #[arg(long, value_name = "FILE")] + audit_log: Option, }, /// Check environment health and dependencies /// @@ -462,8 +471,9 @@ fn main() -> Result<()> { cache_size, no_cache, max_upload_mb, + audit_log, } => { - if let Err(e) = cmd_serve(bind, cache_dir, &cache_size, no_cache, max_upload_mb) { + if let Err(e) = cmd_serve(bind, cache_dir, &cache_size, no_cache, max_upload_mb, audit_log) { eprintln!("Error: {}", e); std::process::exit(1); } @@ -481,6 +491,7 @@ fn main() -> Result<()> { auth_token, max_upload_mb, root, + audit_log, } => { // Per ADR-006: exactly one transport must be selected. // If neither --stdio nor --bind is specified, default to stdio mode. @@ -510,7 +521,7 @@ fn main() -> Result<()> { if use_stdio { // stdio mode (default for Claude Desktop, Claude Code, etc.) - if let Err(e) = mcp::run_stdio(root_path.as_deref()) { + if let Err(e) = mcp::run_stdio(root_path.as_deref(), audit_log.as_deref()) { eprintln!("Error: {}", e); std::process::exit(1); } @@ -523,6 +534,7 @@ fn main() -> Result<()> { auth_token, Some(max_upload_mb), root_path, + audit_log, ) { eprintln!("Error: {}", e); std::process::exit(1); @@ -1429,6 +1441,7 @@ fn cmd_serve( cache_size: &str, no_cache: bool, max_upload_mb: usize, + audit_log: Option, ) -> Result<()> { // Parse cache size let cache_size_bytes = parse_size(cache_size)?; @@ -1452,6 +1465,7 @@ fn cmd_serve( cache_size_bytes, no_cache, max_upload_mb, + audit_log, )) } diff --git a/crates/pdftract-cli/src/mcp/http.rs b/crates/pdftract-cli/src/mcp/http.rs index e48b318..a60a8de 100644 --- a/crates/pdftract-cli/src/mcp/http.rs +++ b/crates/pdftract-cli/src/mcp/http.rs @@ -23,6 +23,7 @@ use crate::mcp::framing::{BatchMessage, ErrorObject, Id, Notification, Request, Response}; use crate::mcp::tools; +use crate::middleware::{AuditState, audit_middleware}; use anyhow::{anyhow, Context, Result}; use axum::{ body::Body, @@ -32,6 +33,7 @@ use axum::{ routing::{get, post}, Router, }; +use pdftract_core::audit::AuditLogWriter; use secrecy::{ExposeSecret, SecretString}; use serde_json::{json, Value}; use std::net::SocketAddr; @@ -71,6 +73,9 @@ pub struct McpServerState { /// Root directory for path-traversal protection (canonicalized at startup) root: Option, + + /// Audit log state + pub audit: AuditState, } impl McpServerState { @@ -79,6 +84,7 @@ impl McpServerState { auth_token: Option, max_upload_mb: Option, root: Option, + audit_writer: Option, ) -> Self { let max_body_bytes = max_upload_mb.unwrap_or(DEFAULT_MAX_UPLOAD_MB) * 1024 * 1024; let notify_tx = broadcast::channel(100).0; // Channel size 100 for buffered notifications @@ -90,6 +96,7 @@ impl McpServerState { client_count: Arc::new(AtomicUsize::new(0)), tool_registry: Arc::new(tools::all_tools()), root, + audit: AuditState::new(audit_writer), } } @@ -133,9 +140,20 @@ pub async fn run_server( auth_token: Option, max_upload_mb: Option, root: Option<&std::path::Path>, + audit_log: Option, ) -> Result<()> { + // Create audit log writer if specified + let audit_writer = if let Some(ref path) = audit_log { + Some(AuditLogWriter::open(path).context(format!( + "Failed to open audit log: {}", + path.display() + ))?) + } else { + None + }; + // Create the shared server state - let state = McpServerState::new(auth_token, max_upload_mb, root.map(|p| p.to_path_buf())); + let state = McpServerState::new(auth_token, max_upload_mb, root.map(|p| p.to_path_buf()), audit_writer); let max_body_bytes = state.max_body_bytes; // Build the router @@ -146,6 +164,10 @@ pub async fn run_server( .route("/", post(handle_post_request)) .route("/sse", get(handle_sse)) .route("/health", get(handle_health)) + .layer(axum::middleware::from_fn_with_state( + state.audit.clone(), + audit_middleware, + )) .with_state(state) .layer(DefaultBodyLimit::max(256 * 1024 * 1024)) // 256 MB hard limit .layer(axum::middleware::from_fn(logging_middleware)); diff --git a/crates/pdftract-cli/src/mcp/server.rs b/crates/pdftract-cli/src/mcp/server.rs index 2feadee..bd3ea8b 100644 --- a/crates/pdftract-cli/src/mcp/server.rs +++ b/crates/pdftract-cli/src/mcp/server.rs @@ -18,6 +18,7 @@ use std::path::Path; /// * `auth_token` - Optional bearer token value (deprecated, requires PDFTRACT_INSECURE_CLI_TOKEN=1) /// * `max_upload_mb` - Optional maximum request body size in MB (default 256) /// * `root` - Optional root directory for path-traversal protection +/// * `audit_log` - Optional audit log file path /// /// # Returns /// * Ok(()) if the server started successfully @@ -28,6 +29,7 @@ pub fn run( auth_token: Option, max_upload_mb: Option, root: Option, + audit_log: Option, ) -> Result<()> { // Resolve the bearer token let token_result: Option<(SecretString, AuthSource)> = match auth::resolve_token( @@ -77,6 +79,7 @@ pub fn run( token, max_upload_mb, root.as_deref(), + audit_log, ))?; Ok(()) diff --git a/crates/pdftract-cli/src/mcp/stdio.rs b/crates/pdftract-cli/src/mcp/stdio.rs index ded0835..6cd6641 100644 --- a/crates/pdftract-cli/src/mcp/stdio.rs +++ b/crates/pdftract-cli/src/mcp/stdio.rs @@ -383,7 +383,7 @@ fn handle_request( /// - A message cannot be read or parsed /// - A response cannot be written /// - stdin/stdout is not a TTY (but this is expected for stdio mode) -pub fn run(root: Option<&Path>) -> Result<()> { +pub fn run(root: Option<&Path>, audit_log: Option<&std::path::Path>) -> Result<()> { // Set up panic hook FIRST (before any potential panics) setup_panic_hook(); @@ -393,6 +393,21 @@ pub fn run(root: Option<&Path>) -> Result<()> { // Initialize stdout writer (only way to write to stdout in stdio mode) init_stdout(); + // Create audit log writer if specified (stdio mode: audit goes to stderr) + let _audit_writer = if let Some(path) = audit_log { + if path == std::path::Path::new("/dev/stderr") { + // For stdio mode, /dev/stderr is the implicit audit destination + eprintln!("Audit log: stderr (stdio mode)"); + Some(pdftract_core::audit::AuditLogWriter::open(path)?) + } else { + eprintln!("Audit log: {}", path.display()); + Some(pdftract_core::audit::AuditLogWriter::open(path)?) + } + } else { + eprintln!("Audit log: disabled"); + None + }; + // Create the tool registry with the root path let registry = tools::all_tools(); diff --git a/crates/pdftract-cli/src/middleware/audit.rs b/crates/pdftract-cli/src/middleware/audit.rs new file mode 100644 index 0000000..dbbd13d --- /dev/null +++ b/crates/pdftract-cli/src/middleware/audit.rs @@ -0,0 +1,128 @@ +//! Audit logging middleware for axum. +//! +//! Provides a tower middleware that logs per-request audit records. +//! Extracts client IP from headers and records request duration. + +use anyhow::Result; +use axum::{ + extract::{Request, State}, + http::HeaderMap, + middleware::Next, + response::Response, +}; +use pdftract_core::audit::AuditLogWriter; +use std::sync::Arc; +use std::time::Instant; + +/// Audit log state. +/// +/// Holds the optional audit log writer wrapped in an Arc for shared access. +#[derive(Clone)] +pub struct AuditState { + pub writer: Option>, +} + +impl AuditState { + /// Create a new audit state. + pub fn new(writer: Option) -> Self { + Self { + writer: writer.map(Arc::new), + } + } +} + +/// Extract client IP from headers. +/// +/// Checks X-Real-IP and X-Forwarded-For headers (set by reverse proxies). +/// Returns None if no headers are present. +fn extract_client_ip(headers: &HeaderMap) -> Option { + headers + .get("x-real-ip") + .or_else(|| headers.get("x-forwarded-for")) + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()) +} + +/// Audit logging middleware. +/// +/// Records per-request audit logs including: +/// - Timestamp +/// - Client IP (from X-Real-IP or X-Forwarded-For) +/// - Tool name (extracted from URI path) +/// - Request duration +/// - Status code +pub async fn audit_middleware( + State(state): State, + req: Request, + next: Next, +) -> Response { + let start = Instant::now(); + let path = req.uri().path().to_string(); + let client_ip = extract_client_ip(req.headers()); + + // Extract tool name from path (e.g., "/extract" -> "extract") + let tool = path + .strip_prefix('/') + .unwrap_or(&path) + .split('/') + .next() + .unwrap_or("unknown"); + + let response = next.run(req).await; + let duration_ms = start.elapsed().as_millis() as u64; + let status = response.status().as_u16(); + + // Write audit record if audit log is enabled + if let Some(ref writer) = state.writer { + let status_str = if status < 400 { "ok" } else { "error" }; + if let Err(e) = writer.log( + tool, + client_ip.as_deref(), + None, // fingerprint not available at middleware level + duration_ms, + status_str, + &[], + ) { + eprintln!("Failed to write audit log: {}", e); + } + } + + response +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_extract_client_ip_x_real_ip() { + let mut headers = HeaderMap::new(); + headers.insert("x-real-ip", "10.0.0.1".parse().unwrap()); + let ip = extract_client_ip(&headers); + assert_eq!(ip, Some("10.0.0.1".to_string())); + } + + #[test] + fn test_extract_client_ip_x_forwarded_for() { + let mut headers = HeaderMap::new(); + headers.insert("x-forwarded-for", "10.0.0.2".parse().unwrap()); + let ip = extract_client_ip(&headers); + assert_eq!(ip, Some("10.0.0.2".to_string())); + } + + #[test] + fn test_extract_client_ip_x_real_ip_preferred() { + let mut headers = HeaderMap::new(); + headers.insert("x-real-ip", "10.0.0.1".parse().unwrap()); + headers.insert("x-forwarded-for", "10.0.0.2".parse().unwrap()); + let ip = extract_client_ip(&headers); + assert_eq!(ip, Some("10.0.0.1".to_string())); + } + + #[test] + fn test_extract_client_ip_none() { + let headers = HeaderMap::new(); + let ip = extract_client_ip(&headers); + assert!(ip.is_none()); + } +} diff --git a/crates/pdftract-cli/src/middleware/mod.rs b/crates/pdftract-cli/src/middleware/mod.rs new file mode 100644 index 0000000..6eff652 --- /dev/null +++ b/crates/pdftract-cli/src/middleware/mod.rs @@ -0,0 +1,5 @@ +//! Audit logging middleware for pdftract CLI. + +pub mod audit; + +pub use audit::{AuditState, audit_middleware}; diff --git a/crates/pdftract-cli/src/serve.rs b/crates/pdftract-cli/src/serve.rs index 6212fca..27a7e03 100644 --- a/crates/pdftract-cli/src/serve.rs +++ b/crates/pdftract-cli/src/serve.rs @@ -53,9 +53,11 @@ use axum::{ Router, }; use bytes; +use pdftract_core::audit::AuditLogWriter; use pdftract_core::cache; use pdftract_core::extract::{extract_pdf, extract_pdf_ndjson, result_to_json}; use pdftract_core::options::{ExtractionOptions, ReceiptsMode}; +use crate::middleware::{AuditState, audit_middleware}; use serde::Deserialize; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -78,11 +80,18 @@ pub struct CacheState { pub struct ServeState { /// Cache configuration pub cache: Arc>, + /// Audit log state + pub audit: AuditState, } impl ServeState { /// Create a new serve state. - pub fn new(cache_dir: Option, cache_size_bytes: u64, cache_disabled: bool) -> Self { + pub fn new( + cache_dir: Option, + cache_size_bytes: u64, + cache_disabled: bool, + audit_writer: Option, + ) -> Self { let cache = CacheState { cache_dir, cache_size_bytes, @@ -90,6 +99,7 @@ impl ServeState { }; Self { cache: Arc::new(Mutex::new(cache)), + audit: AuditState::new(audit_writer), } } } @@ -151,15 +161,28 @@ struct ExtractParams { /// * `cache_size_bytes` — Cache size limit in bytes /// * `cache_disabled` — Whether cache is globally disabled /// * `max_upload_mb` — Maximum request body size in MB +/// * `audit_log` — Optional audit log file path pub async fn run( bind_addr: String, cache_dir: Option, cache_size_bytes: u64, cache_disabled: bool, max_upload_mb: usize, + audit_log: Option, ) -> Result<()> { let cache_dir_for_logging = cache_dir.as_deref(); - let state = ServeState::new(cache_dir.clone(), cache_size_bytes, cache_disabled); + + // Create audit log writer if specified + let audit_writer = if let Some(ref path) = audit_log { + Some(AuditLogWriter::open(path).context(format!( + "Failed to open audit log: {}", + path.display() + ))?) + } else { + None + }; + + let state = ServeState::new(cache_dir.clone(), cache_size_bytes, cache_disabled, audit_writer); let max_body_bytes = max_upload_mb * 1024 * 1024; @@ -169,6 +192,10 @@ pub async fn run( .route("/extract/text", post(extract_text_handler)) .route("/extract/stream", post(extract_stream_handler)) .route("/health", get(health_handler)) + .layer(axum::middleware::from_fn_with_state( + state.audit.clone(), + audit_middleware, + )) .layer(DefaultBodyLimit::max(max_body_bytes)) .layer(RequestBodyLimitLayer::new(max_body_bytes)) .with_state(state); @@ -187,6 +214,9 @@ pub async fn run( } else { eprintln!("Cache disabled"); } + if let Some(ref path) = audit_log { + eprintln!("Audit log: {}", path.display()); + } axum::serve(listener, app) .await diff --git a/crates/pdftract-core/Cargo.toml b/crates/pdftract-core/Cargo.toml index 7b7f2d1..bfe60b3 100644 --- a/crates/pdftract-core/Cargo.toml +++ b/crates/pdftract-core/Cargo.toml @@ -41,6 +41,7 @@ smallvec = "1.13" encoding_rs = "0.8" quick-xml = { version = "0.36", optional = true } serde_yaml = { version = "0.9", optional = true } +chrono = "0.4" [features] default = ["serde"] diff --git a/crates/pdftract-core/src/audit.rs b/crates/pdftract-core/src/audit.rs new file mode 100644 index 0000000..bedfec7 --- /dev/null +++ b/crates/pdftract-core/src/audit.rs @@ -0,0 +1,244 @@ +//! Audit logging for pdftract. +//! +//! Implements Phase 6 audit logging: NDJSON per-request audit records. +//! The audit log captures who-did-what without logging sensitive content. +//! +//! # Schema +//! +//! Each audit record is a single-line JSON object with the following fields: +//! - `ts`: ISO-8601 RFC3339 UTC timestamp (e.g., "2026-05-16T12:34:56Z") +//! - `client_ip`: Optional client IP address (HTTP peer; absent for stdio MCP) +//! - `tool`: Tool name (extract, classify, grep, mcp.extract, etc.) +//! - `fingerprint`: Optional PDF structural fingerprint (pdftract-v1:hex form) +//! - `duration_ms`: Request duration in milliseconds +//! - `status`: "ok" or "error" +//! - `diagnostics`: List of diagnostic codes (no messages, to avoid leaking content) +//! +//! # Thread safety +//! +//! The writer uses a Mutex for concurrent access. +//! Each write is flushed immediately for crash safety. + +use anyhow::{Context, Result}; +use chrono::{SecondsFormat, Utc}; +use serde::{Deserialize, Serialize}; +use std::fs::File; +use std::io::{BufWriter, Write}; +use std::path::Path; +use std::sync::Mutex; + +/// Maximum decoded size for attachments (50 MB). +pub const ATTACHMENT_MAX_DECODED_BYTES: usize = 50 * 1024 * 1024; + +/// Audit record schema. +/// +/// Each record is a single-line JSON object written to the audit log. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AuditRecord { + /// ISO-8601 RFC3339 UTC timestamp + pub ts: String, + /// Client IP address (HTTP peer; absent for stdio MCP) + pub client_ip: Option, + /// Tool name (extract, classify, grep, mcp.extract, etc.) + pub tool: String, + /// PDF structural fingerprint (pdftract-v1:hex form) + pub fingerprint: Option, + /// Request duration in milliseconds + pub duration_ms: u64, + /// Status ("ok" or "error") + pub status: String, + /// Diagnostic codes only (no messages) + pub diagnostics: Vec, +} + +impl AuditRecord { + /// Create a new audit record. + pub fn new( + tool: impl Into, + fingerprint: Option, + duration_ms: u64, + status: impl Into, + ) -> Self { + let ts = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); + Self { + ts, + client_ip: None, + tool: tool.into(), + fingerprint, + duration_ms, + status: status.into(), + diagnostics: Vec::new(), + } + } + + /// Set the client IP address. + pub fn with_client_ip(mut self, client_ip: impl Into) -> Self { + self.client_ip = Some(client_ip.into()); + self + } + + /// Add a diagnostic code. + pub fn add_diagnostic(&mut self, code: impl Into) { + self.diagnostics.push(code.into()); + } + + /// Add multiple diagnostic codes. + pub fn with_diagnostics(mut self, codes: Vec) -> Self { + self.diagnostics = codes; + self + } +} + +/// Audit log writer. +/// +/// Thread-safe writer that emits one NDJSON line per request. +/// Each write is flushed immediately for crash safety. +pub struct AuditLogWriter { + writer: Mutex>>, +} + +impl AuditLogWriter { + /// Open an audit log writer. + /// + /// The path can be: + /// - A file path: writes to that file + /// - "-" or "/dev/stdout": writes to stdout + /// - "/dev/stderr": writes to stderr + pub fn open(path: &Path) -> Result { + let writer: Box = if path == Path::new("-") || path == Path::new("/dev/stdout") { + // Redirect to stdout (but we need a separate handle for the audit log) + // For stdout, we use a separate fd + Box::new(File::create("/dev/stdout").context("Failed to open stdout")?) + } else if path == Path::new("/dev/stderr") { + Box::new(File::create("/dev/stderr").context("Failed to open stderr")?) + } else { + // Regular file + Box::new(File::options() + .create(true) + .append(true) + .open(path) + .with_context(|| format!("Failed to open audit log: {}", path.display()))?) + }; + + Ok(Self { + writer: Mutex::new(BufWriter::new(writer)), + }) + } + + /// Write an audit record. + /// + /// The record is serialized as a single-line JSON object. + /// The write is flushed immediately for crash safety. + pub fn write_record(&self, record: &AuditRecord) -> Result<()> { + let json = serde_json::to_string(record).context("Failed to serialize audit record")?; + let mut writer = self.writer.lock().map_err(|e| { + anyhow::anyhow!("Audit log writer lock poisoned: {}", e) + })?; + writeln!(writer, "{}", json).context("Failed to write audit record")?; + writer.flush().context("Failed to flush audit record")?; + Ok(()) + } + + /// Write an audit record from components. + pub fn log( + &self, + tool: &str, + client_ip: Option<&str>, + fingerprint: Option<&str>, + duration_ms: u64, + status: &str, + diagnostics: &[String], + ) -> Result<()> { + let ts = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); + let record = AuditRecord { + ts, + client_ip: client_ip.map(|s| s.to_string()), + tool: tool.to_string(), + fingerprint: fingerprint.map(|s| s.to_string()), + duration_ms, + status: status.to_string(), + diagnostics: diagnostics.to_vec(), + }; + self.write_record(&record) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Cursor; + + #[test] + fn test_audit_record_new() { + let record = AuditRecord::new("extract", Some("pdftract-v1:abcd".to_string()), 1234, "ok"); + assert_eq!(record.tool, "extract"); + assert_eq!(record.fingerprint, Some("pdftract-v1:abcd".to_string())); + assert_eq!(record.duration_ms, 1234); + assert_eq!(record.status, "ok"); + assert!(record.ts.len() > 0); + assert!(record.client_ip.is_none()); + assert!(record.diagnostics.is_empty()); + } + + #[test] + fn test_audit_record_with_client_ip() { + let record = AuditRecord::new("extract", None, 100, "ok") + .with_client_ip("10.0.0.1"); + assert_eq!(record.client_ip, Some("10.0.0.1".to_string())); + } + + #[test] + fn test_audit_record_with_diagnostics() { + let record = AuditRecord::new("extract", None, 100, "error") + .with_diagnostics(vec!["XREF_REPAIRED".to_string(), "STREAM_BOMB".to_string()]); + assert_eq!(record.diagnostics.len(), 2); + assert_eq!(record.diagnostics[0], "XREF_REPAIRED"); + assert_eq!(record.diagnostics[1], "STREAM_BOMB"); + } + + #[test] + fn test_audit_record_add_diagnostic() { + let mut record = AuditRecord::new("extract", None, 100, "ok"); + record.add_diagnostic("XREF_REPAIRED"); + assert_eq!(record.diagnostics.len(), 1); + assert_eq!(record.diagnostics[0], "XREF_REPAIRED"); + } + + #[test] + fn test_audit_record_serialize() { + let record = AuditRecord::new("extract", Some("pdftract-v1:abcd".to_string()), 1234, "ok") + .with_client_ip("10.0.0.1") + .with_diagnostics(vec!["XREF_REPAIRED".to_string()]); + let json = serde_json::to_string(&record).unwrap(); + assert!(json.contains("\"tool\":\"extract\"")); + assert!(json.contains("\"fingerprint\":\"pdftract-v1:abcd\"")); + assert!(json.contains("\"duration_ms\":1234")); + assert!(json.contains("\"status\":\"ok\"")); + assert!(json.contains("\"client_ip\":\"10.0.0.1\"")); + assert!(json.contains("\"diagnostics\":[\"XREF_REPAIRED\"]")); + // Verify it's a single line + assert!(!json.contains('\n')); + } + + #[test] + fn test_audit_log_writer_memory() { + // Write to an in-memory buffer + use std::io::Cursor; + + // Create a temporary file for testing + let temp_dir = tempfile::tempdir().unwrap(); + let temp_file = temp_dir.path().join("audit.ndjson"); + + let writer = AuditLogWriter::open(&temp_file).unwrap(); + + let record = AuditRecord::new("extract", Some("pdftract-v1:abcd".to_string()), 1234, "ok"); + writer.write_record(&record).unwrap(); + + // Read back the file + let contents = std::fs::read_to_string(&temp_file).unwrap(); + + assert!(contents.contains("\"tool\":\"extract\"")); + assert!(contents.contains("\"fingerprint\":\"pdftract-v1:abcd\"")); + assert!(contents.ends_with('\n')); + } +} diff --git a/crates/pdftract-core/src/lib.rs b/crates/pdftract-core/src/lib.rs index b92d1ca..7141049 100644 --- a/crates/pdftract-core/src/lib.rs +++ b/crates/pdftract-core/src/lib.rs @@ -5,6 +5,7 @@ //! text extraction engines. pub mod annotation; +pub mod audit; pub mod atomic_file_writer; pub mod attachment; pub mod cache;