pdftract/test_audit_integration.rs
jedarden bb7146cffe fix(pdftract-2uk9z): wrap native module results in typed Python objects
The native PyO3 module returns raw dicts via pythonize, but the Python SDK
API expects typed dataclass objects (Document, Page, Metadata, etc.) to be
consistent with the subprocess fallback and test expectations.

Updated wrapper functions in __init__.py to convert native results:
- extract(): wraps dict in Document.from_dict()
- extract_stream(): wraps yielded page dicts in Page.from_dict()
- get_metadata(): wraps dict in Metadata()
- hash(): wraps string in Fingerprint.from_string()
- classify(): wraps dict in Classification()
- search(): wraps yielded match dicts in Match

The native PyO3 entry points (extract, extract_text, extract_stream) were
already implemented with:
- extract: uses extract_pdf + pythonize for PyDict conversion
- extract_text: uses extract_text for plain String return
- extract_stream: uses extract_pdf_streaming with custom StreamIterator

All kwargs parsing with strict validation (unknown kwargs raise TypeError)
was already in place.

Acceptance criteria:
- pdftract.extract() returns Document object with pages/metadata
- pdftract.extract_text() returns plain text string
- pdftract.extract_stream() yields Page objects
- Unknown kwarg raises TypeError
2026-05-28 21:18:38 -04:00

175 lines
6.9 KiB
Rust

//! Integration test for audit logging.
//!
//! This test verifies that:
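//! 1. The --audit-log flag is accepted by serve, mcp, and inspect subcommands
//!    (NOTE(review): no test visible in this file invokes those subcommands;
//!    confirm this point is covered elsewhere)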
//! 2. The audit log writer creates valid NDJSON output
//! 3. Log-policy enforcement redacts sensitive values
//! 4. Stdio MCP mode omits client_ip field
use pdftract_core::audit::{AuditLogWriter, AuditRecord};
use std::io::BufRead;
use std::path::PathBuf;
use tempfile::TempDir;
/// Writes one fully populated record through `AuditLogWriter` and checks that the
/// log file holds exactly one line, that the line parses as JSON, and that every
/// value set on the record (plus a non-empty `ts` string) appears in it.
#[test]
fn test_audit_log_creates_valid_ndjson() {
    let temp_dir = TempDir::new().unwrap();
    let audit_path = temp_dir.path().join("audit.ndjson");
    let writer = AuditLogWriter::open(&audit_path).unwrap();

    // Write a sample audit record with client IP and one diagnostic attached.
    let record = AuditRecord::new("extract", Some("pdftract-v1:abcd1234".to_string()), 1234, 200)
        .with_client_ip("10.0.0.1")
        .with_diagnostics(vec!["XREF_REPAIRED".to_string()]);
    writer.write_record(&record).unwrap();

    // Read the file back line by line; any I/O error fails the test at once
    // instead of panicking inside a `map` closure.
    let file = std::fs::File::open(&audit_path).unwrap();
    let lines = std::io::BufReader::new(file)
        .lines()
        .collect::<Result<Vec<String>, _>>()
        .unwrap();
    assert_eq!(lines.len(), 1, "Should have exactly one line");

    let parsed: serde_json::Value = serde_json::from_str(&lines[0]).unwrap();
    assert_eq!(parsed["tool"], "extract");
    assert_eq!(parsed["fingerprint"], "pdftract-v1:abcd1234");
    assert_eq!(parsed["duration_ms"], 1234);
    assert_eq!(parsed["status"], 200);
    assert_eq!(parsed["client_ip"], "10.0.0.1");
    assert_eq!(parsed["diagnostics"].as_array().unwrap().len(), 1);
    assert_eq!(parsed["diagnostics"][0], "XREF_REPAIRED");

    // The timestamp must be a string and must not be empty.
    let ts = parsed["ts"].as_str().expect("ts field should be a string");
    assert!(!ts.is_empty(), "ts field should not be empty");
}
/// Writes a record that never had `with_client_ip` called on it (the stdio case)
/// and checks that the serialized line contains no `client_ip` key at all.
#[test]
fn test_audit_log_omit_client_ip_for_stdio() {
    let temp_dir = TempDir::new().unwrap();
    let audit_path = temp_dir.path().join("audit.ndjson");
    let writer = AuditLogWriter::open(&audit_path).unwrap();

    // Write a record without client_ip (stdio mode).
    let record = AuditRecord::new("mcp.extract", None, 500, 500);
    writer.write_record(&record).unwrap();

    // Read back and verify.
    let file = std::fs::File::open(&audit_path).unwrap();
    let lines = std::io::BufReader::new(file)
        .lines()
        .collect::<Result<Vec<String>, _>>()
        .unwrap();

    // Guard the index below: an empty log should fail with a clear message
    // rather than an index-out-of-bounds panic.
    assert_eq!(lines.len(), 1, "Should have exactly one line");
    let parsed: serde_json::Value = serde_json::from_str(&lines[0]).unwrap();

    // client_ip field should be absent for stdio mode.
    assert!(parsed.get("client_ip").is_none(), "client_ip should be absent for stdio mode");
}
/// Writes five records through one `AuditLogWriter` and checks that the file
/// holds five lines, each of which parses as JSON and carries the fingerprint
/// and duration of the record written at that position.
#[test]
fn test_audit_log_appends_multiple_records() {
    let temp_dir = TempDir::new().unwrap();
    let audit_path = temp_dir.path().join("audit.ndjson");
    let writer = AuditLogWriter::open(&audit_path).unwrap();

    // Write multiple records, each with a distinct fingerprint and duration.
    for i in 0..5 {
        let record = AuditRecord::new("extract", Some(format!("pdftract-v1:{:x}", i)), i * 100, 200);
        writer.write_record(&record).unwrap();
    }

    // Read back and verify.
    let file = std::fs::File::open(&audit_path).unwrap();
    let lines = std::io::BufReader::new(file)
        .lines()
        .collect::<Result<Vec<String>, _>>()
        .unwrap();
    assert_eq!(lines.len(), 5, "Should have 5 lines");

    // Counting lines alone would not catch corrupted or reordered appends, so
    // also check that every line is valid JSON and matches its write position.
    for (i, line) in (0u64..).zip(&lines) {
        let parsed: serde_json::Value = serde_json::from_str(line).unwrap();
        assert_eq!(parsed["tool"], "extract");
        assert_eq!(parsed["fingerprint"], format!("pdftract-v1:{:x}", i));
        assert_eq!(parsed["duration_ms"].as_u64(), Some(i * 100));
    }
}
/// Runs `log_policy::redact_audit_log_line` over lines containing a password, a
/// bearer token, and a cookie, then over a plain JSON audit line that is expected
/// to come back without any redaction marker.
#[test]
fn test_audit_log_policy_enforcement_redacts_secrets() {
    use pdftract_core::log_policy;

    /// Asserts that redacting `input` yields the marker and no longer contains
    /// `secret`. `#[track_caller]` makes a failure point at the calling line.
    #[track_caller]
    fn assert_secret_redacted(input: &str, secret: &str) {
        let output = log_policy::redact_audit_log_line(input);
        assert!(output.contains("[REDACTED]"));
        assert!(!output.contains(secret));
    }

    // Password patterns.
    assert_secret_redacted("user:john password:secret123 action:extract", "secret123");
    // Bearer tokens.
    assert_secret_redacted("Authorization: Bearer abc123xyz456", "abc123xyz456");
    // Cookies.
    assert_secret_redacted("Cookie: session_id=secret_value", "secret_value");

    // Ordinary audit content must pass through with nothing redacted.
    let plain = r#"{"tool":"extract","fingerprint":"pdftract-v1:abcd"}"#;
    let output = log_policy::redact_audit_log_line(plain);
    assert!(output.contains("extract"));
    assert!(output.contains("pdftract-v1:abcd"));
    assert!(!output.contains("[REDACTED]"));
}
/// Serializes a fully populated `AuditRecord` and checks the JSON type of each
/// field named by the spec (plan lines 974-978): four strings, two numbers, and
/// one array.
#[test]
fn test_audit_record_matches_plan_spec() {
    let fingerprint = Some("pdftract-v1:abcd1234".to_string());
    let diagnostics = vec!["XREF_REPAIRED".to_string()];
    let record = AuditRecord::new("extract", fingerprint, 1234, 200)
        .with_client_ip("10.0.0.1")
        .with_diagnostics(diagnostics);

    // Round-trip through a JSON string so the checks see the serialized form.
    let encoded = serde_json::to_string(&record).unwrap();
    let parsed: serde_json::Value = serde_json::from_str(&encoded).unwrap();

    // Fields that must serialize as JSON strings.
    let string_fields = [
        ("ts", "ts field must be present (ISO-8601 timestamp)"),
        ("client_ip", "client_ip field must be present"),
        ("tool", "tool field must be present"),
        ("fingerprint", "fingerprint field must be present"),
    ];
    for &(key, why) in string_fields.iter() {
        assert!(parsed[key].is_string(), "{}", why);
    }

    // Fields that must serialize as JSON numbers.
    let number_fields = [
        ("duration_ms", "duration_ms field must be present"),
        ("status", "status field must be present (u16 HTTP-style)"),
    ];
    for &(key, why) in number_fields.iter() {
        assert!(parsed[key].is_number(), "{}", why);
    }

    // The diagnostics list must serialize as a JSON array.
    assert!(
        parsed["diagnostics"].is_array(),
        "{}",
        "diagnostics field must be present (Vec<String>)"
    );
}
/// Writes a single record and, while the writer is still alive, reads the file
/// to check the record is already on disk and terminated by a newline.
#[test]
fn test_audit_log_writer_crash_safety() {
    let dir = TempDir::new().unwrap();
    let log_file = dir.path().join("audit.ndjson");
    let log = AuditLogWriter::open(&log_file).unwrap();

    let entry = AuditRecord::new("extract", Some("pdftract-v1:abcd".to_string()), 100, 200);
    log.write_record(&entry).unwrap();

    // Deliberately read before `log` is dropped: the data must not depend on
    // the writer being closed to reach the file.
    let on_disk = std::fs::read_to_string(&log_file).unwrap();
    assert!(on_disk.contains("extract"), "Record should be flushed immediately");
    assert!(on_disk.ends_with('\n'), "Record should end with newline");
}
/// Serializes a record carrying two diagnostics and checks that the JSON text
/// contains no line-break characters and parses back as valid JSON.
#[test]
fn test_audit_record_serialization_is_single_line() {
    let diagnostics = vec!["XREF_REPAIRED".to_string(), "STREAM_BOMB".to_string()];
    let record = AuditRecord::new("extract", Some("pdftract-v1:abcd".to_string()), 1234, 200)
        .with_diagnostics(diagnostics);
    let json = serde_json::to_string(&record).unwrap();

    // One record must occupy exactly one line of the log.
    assert!(json.find('\n').is_none(), "Audit record should be single-line JSON");
    assert!(json.find('\r').is_none(), "Audit record should not contain carriage returns");

    // The text must still parse as JSON; the parsed value itself is not inspected.
    serde_json::from_str::<serde_json::Value>(&json).unwrap();
}