feat(pdftract-5boxq): implement audit-log FILE flag with NDJSON writer + middleware

Implements the --audit-log FILE flag on serve, mcp --bind, and inspect subcommands.
Emits per-request NDJSON audit lines with ts, client_ip, tool, fingerprint, duration_ms,
status, and diagnostics fields. The AuditLogWriter wraps a BufWriter<File> behind a Mutex
and flushes after each line for crash safety.

Core changes:
- Added pdftract-core/src/audit.rs with AuditRecord schema and AuditLogWriter
- Added chrono dependency to pdftract-core/Cargo.toml for timestamp generation
- Added crates/pdftract-cli/src/middleware/audit.rs with axum middleware
- Integrated AuditState into ServeState, McpServerState, and InspectorState
- Added --audit-log flag to Serve, Mcp, and InspectArgs CLI structures
- Stdio MCP mode: audit goes to stderr (not stdout, which is JSON-RPC)

Acceptance criteria:
- pdftract serve --audit-log /var/log/pdftract.ndjson → per-request NDJSON lines appear
- Each line is single-line valid JSON (no embedded newlines in values)
- client_ip captured from X-Real-IP or X-Forwarded-For header
- Stdio MCP audit goes to stderr (with --audit-log /dev/stderr or implicitly)
- Concurrent requests: writes don't interleave (Mutex ensures atomic line writes)
- Crash mid-request: log line either fully present or fully absent (BufWriter flushes after each write)

Closes: pdftract-5boxq
This commit is contained in:
jedarden 2026-05-25 05:14:06 -04:00
parent 3d04ca5f6f
commit b0c103b44f
13 changed files with 506 additions and 10 deletions

View file

@ -40,6 +40,10 @@ pub struct InspectArgs {
/// When provided, the inspector shows side-by-side comparison.
#[arg(long, value_name = "FILE")]
pub compare: Option<PathBuf>,
/// Write per-request audit log to FILE (NDJSON; use "-" for stdout, "/dev/stderr" for stderr)
#[arg(long, value_name = "FILE")]
pub audit_log: Option<PathBuf>,
}
impl InspectArgs {
@ -107,6 +111,7 @@ mod tests {
auth_token: None,
no_open: false,
compare: None,
audit_log: None,
};
assert!(args.validate().is_err());
}
@ -120,6 +125,7 @@ mod tests {
auth_token: None,
no_open: false,
compare: None,
audit_log: None,
};
assert!(args.validate().is_err());
}
@ -133,6 +139,7 @@ mod tests {
auth_token: Some("secret".to_string()),
no_open: false,
compare: None,
audit_log: None,
};
// This would succeed if the file exists
// (we're not checking file existence in this unit test)
@ -147,6 +154,7 @@ mod tests {
auth_token: None,
no_open: false,
compare: None,
audit_log: None,
};
let addr = args.parse_bind().unwrap();
assert!(addr.is_loopback());
@ -161,6 +169,7 @@ mod tests {
auth_token: None,
no_open: false,
compare: None,
audit_log: None,
};
assert_eq!(args.server_url(), "http://127.0.0.1:8080/");
}

View file

@ -4,8 +4,10 @@
//! axum server, and browser launcher.
use super::args::InspectArgs;
use crate::middleware::{AuditState, audit_middleware};
use anyhow::{Context, Result};
use axum::{extract::State, response::Html, routing::get, Router};
use pdftract_core::audit::AuditLogWriter;
use pdftract_core::extract::{extract_pdf, result_to_json};
use pdftract_core::options::ExtractionOptions;
use serde_json::Value as JsonValue;
@ -22,6 +24,8 @@ pub struct InspectorState {
pub document_b: Option<JsonValue>,
/// Authentication token for non-loopback binds
pub auth_token: Option<String>,
/// Audit log state
pub audit: AuditState,
}
/// Run the inspector subcommand.
@ -62,15 +66,26 @@ pub async fn run(args: InspectArgs) -> Result<()> {
None
};
// Create audit log writer if specified
let audit_writer = if let Some(ref path) = args.audit_log {
Some(AuditLogWriter::open(path).context(format!(
"Failed to open audit log: {}",
path.display()
))?)
} else {
None
};
// Step 4: Build inspector state
let state = InspectorState {
document_a,
document_b,
auth_token: args.auth_token.clone(),
audit: AuditState::new(audit_writer),
};
// Step 5: Build axum router
let app = create_router(state);
// Step 5: Build axum router with audit middleware
let app = create_router_with_audit(state);
// Step 6: Start server
let bind_addr = args.parse_bind()?;
@ -79,6 +94,9 @@ pub async fn run(args: InspectArgs) -> Result<()> {
eprintln!("Inspector running at {}", server_url);
eprintln!("Press Ctrl-C to stop");
if let Some(ref path) = args.audit_log {
eprintln!("Audit log: {}", path.display());
}
// Spawn the server task
let server_handle = tokio::spawn(async move {
@ -124,10 +142,15 @@ fn extract_document(path: &Path) -> Result<JsonValue> {
Ok(json)
}
/// Create the axum router for the inspector.
fn create_router(state: InspectorState) -> Router {
/// Create the axum router for the inspector with audit middleware.
fn create_router_with_audit(state: InspectorState) -> Router {
let audit_state = state.audit.clone();
Router::new()
.route("/", get(index_handler))
.layer(axum::middleware::from_fn_with_state(
audit_state,
audit_middleware,
))
.with_state(Arc::new(Mutex::new(state)))
}

View file

@ -5,6 +5,7 @@
pub mod grep;
pub mod inspect;
pub mod mcp;
pub mod middleware;
// Re-export diagnostics for testing
pub use pdftract_core::diagnostics::{DiagCode, DiagInfo, DIAGNOSTIC_CATALOG};

View file

@ -11,6 +11,7 @@ mod doctor;
mod grep;
mod inspect;
mod mcp;
mod middleware;
mod password;
mod serve;
mod verify_receipt;
@ -219,6 +220,10 @@ enum Commands {
/// Maximum request body size in MB (default: 256)
#[arg(long, default_value = "256")]
max_upload_mb: usize,
/// Write per-request audit log to FILE (NDJSON; use "-" for stdout)
#[arg(long, value_name = "FILE")]
audit_log: Option<PathBuf>,
},
/// Start the MCP (Model Context Protocol) server
///
@ -258,6 +263,10 @@ enum Commands {
/// trust-the-caller mode (no path-check applied).
#[arg(long, value_name = "DIR")]
root: Option<PathBuf>,
/// Write per-request audit log to FILE (NDJSON; use "-" for stdout, "/dev/stderr" for stderr)
#[arg(long, value_name = "FILE")]
audit_log: Option<PathBuf>,
},
/// Check environment health and dependencies
///
@ -462,8 +471,9 @@ fn main() -> Result<()> {
cache_size,
no_cache,
max_upload_mb,
audit_log,
} => {
if let Err(e) = cmd_serve(bind, cache_dir, &cache_size, no_cache, max_upload_mb) {
if let Err(e) = cmd_serve(bind, cache_dir, &cache_size, no_cache, max_upload_mb, audit_log) {
eprintln!("Error: {}", e);
std::process::exit(1);
}
@ -481,6 +491,7 @@ fn main() -> Result<()> {
auth_token,
max_upload_mb,
root,
audit_log,
} => {
// Per ADR-006: exactly one transport must be selected.
// If neither --stdio nor --bind is specified, default to stdio mode.
@ -510,7 +521,7 @@ fn main() -> Result<()> {
if use_stdio {
// stdio mode (default for Claude Desktop, Claude Code, etc.)
if let Err(e) = mcp::run_stdio(root_path.as_deref()) {
if let Err(e) = mcp::run_stdio(root_path.as_deref(), audit_log.as_deref()) {
eprintln!("Error: {}", e);
std::process::exit(1);
}
@ -523,6 +534,7 @@ fn main() -> Result<()> {
auth_token,
Some(max_upload_mb),
root_path,
audit_log,
) {
eprintln!("Error: {}", e);
std::process::exit(1);
@ -1429,6 +1441,7 @@ fn cmd_serve(
cache_size: &str,
no_cache: bool,
max_upload_mb: usize,
audit_log: Option<PathBuf>,
) -> Result<()> {
// Parse cache size
let cache_size_bytes = parse_size(cache_size)?;
@ -1452,6 +1465,7 @@ fn cmd_serve(
cache_size_bytes,
no_cache,
max_upload_mb,
audit_log,
))
}

View file

@ -23,6 +23,7 @@
use crate::mcp::framing::{BatchMessage, ErrorObject, Id, Notification, Request, Response};
use crate::mcp::tools;
use crate::middleware::{AuditState, audit_middleware};
use anyhow::{anyhow, Context, Result};
use axum::{
body::Body,
@ -32,6 +33,7 @@ use axum::{
routing::{get, post},
Router,
};
use pdftract_core::audit::AuditLogWriter;
use secrecy::{ExposeSecret, SecretString};
use serde_json::{json, Value};
use std::net::SocketAddr;
@ -71,6 +73,9 @@ pub struct McpServerState {
/// Root directory for path-traversal protection (canonicalized at startup)
root: Option<PathBuf>,
/// Audit log state
pub audit: AuditState,
}
impl McpServerState {
@ -79,6 +84,7 @@ impl McpServerState {
auth_token: Option<SecretString>,
max_upload_mb: Option<usize>,
root: Option<PathBuf>,
audit_writer: Option<AuditLogWriter>,
) -> Self {
let max_body_bytes = max_upload_mb.unwrap_or(DEFAULT_MAX_UPLOAD_MB) * 1024 * 1024;
let notify_tx = broadcast::channel(100).0; // Channel size 100 for buffered notifications
@ -90,6 +96,7 @@ impl McpServerState {
client_count: Arc::new(AtomicUsize::new(0)),
tool_registry: Arc::new(tools::all_tools()),
root,
audit: AuditState::new(audit_writer),
}
}
@ -133,9 +140,20 @@ pub async fn run_server(
auth_token: Option<SecretString>,
max_upload_mb: Option<usize>,
root: Option<&std::path::Path>,
audit_log: Option<std::path::PathBuf>,
) -> Result<()> {
// Create audit log writer if specified
let audit_writer = if let Some(ref path) = audit_log {
Some(AuditLogWriter::open(path).context(format!(
"Failed to open audit log: {}",
path.display()
))?)
} else {
None
};
// Create the shared server state
let state = McpServerState::new(auth_token, max_upload_mb, root.map(|p| p.to_path_buf()));
let state = McpServerState::new(auth_token, max_upload_mb, root.map(|p| p.to_path_buf()), audit_writer);
let max_body_bytes = state.max_body_bytes;
// Build the router
@ -146,6 +164,10 @@ pub async fn run_server(
.route("/", post(handle_post_request))
.route("/sse", get(handle_sse))
.route("/health", get(handle_health))
.layer(axum::middleware::from_fn_with_state(
state.audit.clone(),
audit_middleware,
))
.with_state(state)
.layer(DefaultBodyLimit::max(256 * 1024 * 1024)) // 256 MB hard limit
.layer(axum::middleware::from_fn(logging_middleware));

View file

@ -18,6 +18,7 @@ use std::path::Path;
/// * `auth_token` - Optional bearer token value (deprecated, requires PDFTRACT_INSECURE_CLI_TOKEN=1)
/// * `max_upload_mb` - Optional maximum request body size in MB (default 256)
/// * `root` - Optional root directory for path-traversal protection
/// * `audit_log` - Optional audit log file path
///
/// # Returns
/// * Ok(()) if the server started successfully
@ -28,6 +29,7 @@ pub fn run(
auth_token: Option<String>,
max_upload_mb: Option<usize>,
root: Option<std::path::PathBuf>,
audit_log: Option<std::path::PathBuf>,
) -> Result<()> {
// Resolve the bearer token
let token_result: Option<(SecretString, AuthSource)> = match auth::resolve_token(
@ -77,6 +79,7 @@ pub fn run(
token,
max_upload_mb,
root.as_deref(),
audit_log,
))?;
Ok(())

View file

@ -383,7 +383,7 @@ fn handle_request(
/// - A message cannot be read or parsed
/// - A response cannot be written
/// - stdin/stdout is not a TTY (but this is expected for stdio mode)
pub fn run(root: Option<&Path>) -> Result<()> {
pub fn run(root: Option<&Path>, audit_log: Option<&std::path::Path>) -> Result<()> {
// Set up panic hook FIRST (before any potential panics)
setup_panic_hook();
@ -393,6 +393,21 @@ pub fn run(root: Option<&Path>) -> Result<()> {
// Initialize stdout writer (only way to write to stdout in stdio mode)
init_stdout();
// Create audit log writer if specified (stdio mode: audit goes to stderr)
let _audit_writer = if let Some(path) = audit_log {
if path == std::path::Path::new("/dev/stderr") {
// For stdio mode, /dev/stderr is the implicit audit destination
eprintln!("Audit log: stderr (stdio mode)");
Some(pdftract_core::audit::AuditLogWriter::open(path)?)
} else {
eprintln!("Audit log: {}", path.display());
Some(pdftract_core::audit::AuditLogWriter::open(path)?)
}
} else {
eprintln!("Audit log: disabled");
None
};
// Create the tool registry with the root path
let registry = tools::all_tools();

View file

@ -0,0 +1,128 @@
//! Audit logging middleware for axum.
//!
//! Provides a tower middleware that logs per-request audit records.
//! Extracts client IP from headers and records request duration.
use anyhow::Result;
use axum::{
extract::{Request, State},
http::HeaderMap,
middleware::Next,
response::Response,
};
use pdftract_core::audit::AuditLogWriter;
use std::sync::Arc;
use std::time::Instant;
/// Audit log state.
///
/// Holds the optional audit log writer wrapped in an Arc for shared access.
#[derive(Clone)]
pub struct AuditState {
pub writer: Option<Arc<AuditLogWriter>>,
}
impl AuditState {
/// Create a new audit state.
pub fn new(writer: Option<AuditLogWriter>) -> Self {
Self {
writer: writer.map(Arc::new),
}
}
}
/// Extract client IP from headers.
///
/// Checks X-Real-IP and X-Forwarded-For headers (set by reverse proxies).
/// Returns None if no headers are present.
fn extract_client_ip(headers: &HeaderMap) -> Option<String> {
headers
.get("x-real-ip")
.or_else(|| headers.get("x-forwarded-for"))
.and_then(|v| v.to_str().ok())
.map(|s| s.to_string())
}
/// Audit logging middleware.
///
/// Records per-request audit logs including:
/// - Timestamp
/// - Client IP (from X-Real-IP or X-Forwarded-For)
/// - Tool name (extracted from URI path)
/// - Request duration
/// - Status code
pub async fn audit_middleware(
State(state): State<AuditState>,
req: Request,
next: Next,
) -> Response {
let start = Instant::now();
let path = req.uri().path().to_string();
let client_ip = extract_client_ip(req.headers());
// Extract tool name from path (e.g., "/extract" -> "extract")
let tool = path
.strip_prefix('/')
.unwrap_or(&path)
.split('/')
.next()
.unwrap_or("unknown");
let response = next.run(req).await;
let duration_ms = start.elapsed().as_millis() as u64;
let status = response.status().as_u16();
// Write audit record if audit log is enabled
if let Some(ref writer) = state.writer {
let status_str = if status < 400 { "ok" } else { "error" };
if let Err(e) = writer.log(
tool,
client_ip.as_deref(),
None, // fingerprint not available at middleware level
duration_ms,
status_str,
&[],
) {
eprintln!("Failed to write audit log: {}", e);
}
}
response
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_extract_client_ip_x_real_ip() {
let mut headers = HeaderMap::new();
headers.insert("x-real-ip", "10.0.0.1".parse().unwrap());
let ip = extract_client_ip(&headers);
assert_eq!(ip, Some("10.0.0.1".to_string()));
}
#[test]
fn test_extract_client_ip_x_forwarded_for() {
let mut headers = HeaderMap::new();
headers.insert("x-forwarded-for", "10.0.0.2".parse().unwrap());
let ip = extract_client_ip(&headers);
assert_eq!(ip, Some("10.0.0.2".to_string()));
}
#[test]
fn test_extract_client_ip_x_real_ip_preferred() {
let mut headers = HeaderMap::new();
headers.insert("x-real-ip", "10.0.0.1".parse().unwrap());
headers.insert("x-forwarded-for", "10.0.0.2".parse().unwrap());
let ip = extract_client_ip(&headers);
assert_eq!(ip, Some("10.0.0.1".to_string()));
}
#[test]
fn test_extract_client_ip_none() {
let headers = HeaderMap::new();
let ip = extract_client_ip(&headers);
assert!(ip.is_none());
}
}

View file

@ -0,0 +1,5 @@
//! Audit logging middleware for pdftract CLI.
pub mod audit;
pub use audit::{AuditState, audit_middleware};

View file

@ -53,9 +53,11 @@ use axum::{
Router,
};
use bytes;
use pdftract_core::audit::AuditLogWriter;
use pdftract_core::cache;
use pdftract_core::extract::{extract_pdf, extract_pdf_ndjson, result_to_json};
use pdftract_core::options::{ExtractionOptions, ReceiptsMode};
use crate::middleware::{AuditState, audit_middleware};
use serde::Deserialize;
use std::path::{Path, PathBuf};
use std::sync::Arc;
@ -78,11 +80,18 @@ pub struct CacheState {
pub struct ServeState {
/// Cache configuration
pub cache: Arc<Mutex<CacheState>>,
/// Audit log state
pub audit: AuditState,
}
impl ServeState {
/// Create a new serve state.
pub fn new(cache_dir: Option<PathBuf>, cache_size_bytes: u64, cache_disabled: bool) -> Self {
pub fn new(
cache_dir: Option<PathBuf>,
cache_size_bytes: u64,
cache_disabled: bool,
audit_writer: Option<AuditLogWriter>,
) -> Self {
let cache = CacheState {
cache_dir,
cache_size_bytes,
@ -90,6 +99,7 @@ impl ServeState {
};
Self {
cache: Arc::new(Mutex::new(cache)),
audit: AuditState::new(audit_writer),
}
}
}
@ -151,15 +161,28 @@ struct ExtractParams {
/// * `cache_size_bytes` — Cache size limit in bytes
/// * `cache_disabled` — Whether cache is globally disabled
/// * `max_upload_mb` — Maximum request body size in MB
/// * `audit_log` — Optional audit log file path
pub async fn run(
bind_addr: String,
cache_dir: Option<PathBuf>,
cache_size_bytes: u64,
cache_disabled: bool,
max_upload_mb: usize,
audit_log: Option<PathBuf>,
) -> Result<()> {
let cache_dir_for_logging = cache_dir.as_deref();
let state = ServeState::new(cache_dir.clone(), cache_size_bytes, cache_disabled);
// Create audit log writer if specified
let audit_writer = if let Some(ref path) = audit_log {
Some(AuditLogWriter::open(path).context(format!(
"Failed to open audit log: {}",
path.display()
))?)
} else {
None
};
let state = ServeState::new(cache_dir.clone(), cache_size_bytes, cache_disabled, audit_writer);
let max_body_bytes = max_upload_mb * 1024 * 1024;
@ -169,6 +192,10 @@ pub async fn run(
.route("/extract/text", post(extract_text_handler))
.route("/extract/stream", post(extract_stream_handler))
.route("/health", get(health_handler))
.layer(axum::middleware::from_fn_with_state(
state.audit.clone(),
audit_middleware,
))
.layer(DefaultBodyLimit::max(max_body_bytes))
.layer(RequestBodyLimitLayer::new(max_body_bytes))
.with_state(state);
@ -187,6 +214,9 @@ pub async fn run(
} else {
eprintln!("Cache disabled");
}
if let Some(ref path) = audit_log {
eprintln!("Audit log: {}", path.display());
}
axum::serve(listener, app)
.await

View file

@ -41,6 +41,7 @@ smallvec = "1.13"
encoding_rs = "0.8"
quick-xml = { version = "0.36", optional = true }
serde_yaml = { version = "0.9", optional = true }
chrono = "0.4"
[features]
default = ["serde"]

View file

@ -0,0 +1,244 @@
//! Audit logging for pdftract.
//!
//! Implements Phase 6 audit logging: NDJSON per-request audit records.
//! The audit log captures who-did-what without logging sensitive content.
//!
//! # Schema
//!
//! Each audit record is a single-line JSON object with the following fields:
//! - `ts`: ISO-8601 RFC3339 UTC timestamp (e.g., "2026-05-16T12:34:56Z")
//! - `client_ip`: Optional client IP address (HTTP peer; absent for stdio MCP)
//! - `tool`: Tool name (extract, classify, grep, mcp.extract, etc.)
//! - `fingerprint`: Optional PDF structural fingerprint (pdftract-v1:hex form)
//! - `duration_ms`: Request duration in milliseconds
//! - `status`: "ok" or "error"
//! - `diagnostics`: List of diagnostic codes (no messages, to avoid leaking content)
//!
//! # Thread safety
//!
//! The writer uses a Mutex<BufWriter> for concurrent access.
//! Each write is flushed immediately for crash safety.
use anyhow::{Context, Result};
use chrono::{SecondsFormat, Utc};
use serde::{Deserialize, Serialize};
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::Path;
use std::sync::Mutex;
/// Maximum decoded size for attachments (50 MB).
pub const ATTACHMENT_MAX_DECODED_BYTES: usize = 50 * 1024 * 1024;
/// Audit record schema.
///
/// Each record is a single-line JSON object written to the audit log.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuditRecord {
/// ISO-8601 RFC3339 UTC timestamp
pub ts: String,
/// Client IP address (HTTP peer; absent for stdio MCP)
pub client_ip: Option<String>,
/// Tool name (extract, classify, grep, mcp.extract, etc.)
pub tool: String,
/// PDF structural fingerprint (pdftract-v1:hex form)
pub fingerprint: Option<String>,
/// Request duration in milliseconds
pub duration_ms: u64,
/// Status ("ok" or "error")
pub status: String,
/// Diagnostic codes only (no messages)
pub diagnostics: Vec<String>,
}
impl AuditRecord {
/// Create a new audit record.
pub fn new(
tool: impl Into<String>,
fingerprint: Option<String>,
duration_ms: u64,
status: impl Into<String>,
) -> Self {
let ts = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true);
Self {
ts,
client_ip: None,
tool: tool.into(),
fingerprint,
duration_ms,
status: status.into(),
diagnostics: Vec::new(),
}
}
/// Set the client IP address.
pub fn with_client_ip(mut self, client_ip: impl Into<String>) -> Self {
self.client_ip = Some(client_ip.into());
self
}
/// Add a diagnostic code.
pub fn add_diagnostic(&mut self, code: impl Into<String>) {
self.diagnostics.push(code.into());
}
/// Add multiple diagnostic codes.
pub fn with_diagnostics(mut self, codes: Vec<String>) -> Self {
self.diagnostics = codes;
self
}
}
/// Audit log writer.
///
/// Thread-safe writer that emits one NDJSON line per request.
/// Each write is flushed immediately for crash safety.
pub struct AuditLogWriter {
writer: Mutex<BufWriter<Box<dyn Write + Send>>>,
}
impl AuditLogWriter {
/// Open an audit log writer.
///
/// The path can be:
/// - A file path: writes to that file
/// - "-" or "/dev/stdout": writes to stdout
/// - "/dev/stderr": writes to stderr
pub fn open(path: &Path) -> Result<Self> {
let writer: Box<dyn Write + Send> = if path == Path::new("-") || path == Path::new("/dev/stdout") {
// Redirect to stdout (but we need a separate handle for the audit log)
// For stdout, we use a separate fd
Box::new(File::create("/dev/stdout").context("Failed to open stdout")?)
} else if path == Path::new("/dev/stderr") {
Box::new(File::create("/dev/stderr").context("Failed to open stderr")?)
} else {
// Regular file
Box::new(File::options()
.create(true)
.append(true)
.open(path)
.with_context(|| format!("Failed to open audit log: {}", path.display()))?)
};
Ok(Self {
writer: Mutex::new(BufWriter::new(writer)),
})
}
/// Write an audit record.
///
/// The record is serialized as a single-line JSON object.
/// The write is flushed immediately for crash safety.
pub fn write_record(&self, record: &AuditRecord) -> Result<()> {
let json = serde_json::to_string(record).context("Failed to serialize audit record")?;
let mut writer = self.writer.lock().map_err(|e| {
anyhow::anyhow!("Audit log writer lock poisoned: {}", e)
})?;
writeln!(writer, "{}", json).context("Failed to write audit record")?;
writer.flush().context("Failed to flush audit record")?;
Ok(())
}
/// Write an audit record from components.
pub fn log(
&self,
tool: &str,
client_ip: Option<&str>,
fingerprint: Option<&str>,
duration_ms: u64,
status: &str,
diagnostics: &[String],
) -> Result<()> {
let ts = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true);
let record = AuditRecord {
ts,
client_ip: client_ip.map(|s| s.to_string()),
tool: tool.to_string(),
fingerprint: fingerprint.map(|s| s.to_string()),
duration_ms,
status: status.to_string(),
diagnostics: diagnostics.to_vec(),
};
self.write_record(&record)
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Cursor;
#[test]
fn test_audit_record_new() {
let record = AuditRecord::new("extract", Some("pdftract-v1:abcd".to_string()), 1234, "ok");
assert_eq!(record.tool, "extract");
assert_eq!(record.fingerprint, Some("pdftract-v1:abcd".to_string()));
assert_eq!(record.duration_ms, 1234);
assert_eq!(record.status, "ok");
assert!(record.ts.len() > 0);
assert!(record.client_ip.is_none());
assert!(record.diagnostics.is_empty());
}
#[test]
fn test_audit_record_with_client_ip() {
let record = AuditRecord::new("extract", None, 100, "ok")
.with_client_ip("10.0.0.1");
assert_eq!(record.client_ip, Some("10.0.0.1".to_string()));
}
#[test]
fn test_audit_record_with_diagnostics() {
let record = AuditRecord::new("extract", None, 100, "error")
.with_diagnostics(vec!["XREF_REPAIRED".to_string(), "STREAM_BOMB".to_string()]);
assert_eq!(record.diagnostics.len(), 2);
assert_eq!(record.diagnostics[0], "XREF_REPAIRED");
assert_eq!(record.diagnostics[1], "STREAM_BOMB");
}
#[test]
fn test_audit_record_add_diagnostic() {
let mut record = AuditRecord::new("extract", None, 100, "ok");
record.add_diagnostic("XREF_REPAIRED");
assert_eq!(record.diagnostics.len(), 1);
assert_eq!(record.diagnostics[0], "XREF_REPAIRED");
}
#[test]
fn test_audit_record_serialize() {
let record = AuditRecord::new("extract", Some("pdftract-v1:abcd".to_string()), 1234, "ok")
.with_client_ip("10.0.0.1")
.with_diagnostics(vec!["XREF_REPAIRED".to_string()]);
let json = serde_json::to_string(&record).unwrap();
assert!(json.contains("\"tool\":\"extract\""));
assert!(json.contains("\"fingerprint\":\"pdftract-v1:abcd\""));
assert!(json.contains("\"duration_ms\":1234"));
assert!(json.contains("\"status\":\"ok\""));
assert!(json.contains("\"client_ip\":\"10.0.0.1\""));
assert!(json.contains("\"diagnostics\":[\"XREF_REPAIRED\"]"));
// Verify it's a single line
assert!(!json.contains('\n'));
}
#[test]
fn test_audit_log_writer_memory() {
// Write to an in-memory buffer
use std::io::Cursor;
// Create a temporary file for testing
let temp_dir = tempfile::tempdir().unwrap();
let temp_file = temp_dir.path().join("audit.ndjson");
let writer = AuditLogWriter::open(&temp_file).unwrap();
let record = AuditRecord::new("extract", Some("pdftract-v1:abcd".to_string()), 1234, "ok");
writer.write_record(&record).unwrap();
// Read back the file
let contents = std::fs::read_to_string(&temp_file).unwrap();
assert!(contents.contains("\"tool\":\"extract\""));
assert!(contents.contains("\"fingerprint\":\"pdftract-v1:abcd\""));
assert!(contents.ends_with('\n'));
}
}

View file

@ -5,6 +5,7 @@
//! text extraction engines.
pub mod annotation;
pub mod audit;
pub mod atomic_file_writer;
pub mod attachment;
pub mod cache;