pdftract/tests/security/TH-08-log-audit.rs
jedarden 432514d350 wip: AcroForm improvements, debug tooling, test corpus, and fixture updates
Collects in-progress work across forms (Ch/Tx field handling, value_text
edge cases), layout corrections, stream parser fixes, conformance test
expansion, security audit test (TH-08), stream-decoder bomb fixture,
debug examples reorganization under examples/debug/, sdk module scaffold,
xtask CLI enhancements, and provenance entries for new fixtures.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-30 09:48:14 -04:00

328 lines
12 KiB
Rust

//! TH-08: PDF content disclosed via debug logs.
//!
//! This test verifies that the NEVER-log secrets policy is enforced:
//! - Password values are never logged
//! - Bearer-token values are never logged
//! - PDF body text is never logged (not even at trace)
//! - Cookie/Authorization/Proxy-Authorization headers are never logged
//!
//! The test runs extraction with maximum log verbosity and verifies that
//! no known content strings from the PDF appear in captured log output.
//!
//! Test strategy:
//! 1. Run extract with RUST_LOG=trace (maximum verbosity)
//! 2. Capture stderr (log output)
//! 3. Grep for known content strings from the PDF
//! 4. Fail if any match is found
//!
//! References: Plan lines 949-954 (NEVER-log list), 879 (TH-08 definition)
use std::io::Write;
use std::path::Path;
use std::process::{Command, Stdio};
use std::time::Duration;
/// Known sensitive strings that should NEVER appear in log output.
const PASSWORD_MARKER: &str = "UNIQUE-PASSWORD-FOR-TH08-7f9a";
const BODY_TEXT_MARKER: &str = "UNIQUE-MARKER-IN-BODY-TEXT-7f9a";
const MCP_TOKEN_MARKER: &str = "UNIQUE-TOKEN-FOR-TH08-7f9a";
/// Path to the sensitive.pdf fixture.
const FIXTURE_PATH: &str = "tests/fixtures/security/sensitive.pdf";
/// Verify trace logging is actually enabled by checking for expected log patterns.
const TRACE_INDICATORS: &[&str] = &["extract", "page_count", "pdftract"];
/// Test case 1: Run extract with --password-stdin and RUST_LOG=trace.
///
/// Verifies:
/// - Password value "UNIQUE-PASSWORD-FOR-TH08-7f9a" does NOT appear in logs
/// - Body text "UNIQUE-MARKER-IN-BODY-TEXT-7f9a" does NOT appear in logs
/// - Trace logging IS active (contains expected trace indicators)
#[test]
fn test_log_audit_extract_with_password_stdin() {
let fixture_path = Path::new(FIXTURE_PATH);
if !fixture_path.exists() {
eprintln!("Skipping TH-08 test: fixture not found at {}", fixture_path.display());
return; // Skip if fixture doesn't exist (not a test failure)
}
// Run extraction with RUST_LOG=trace and --password-stdin
let mut child = Command::new(env!("CARGO_BIN_EXE_pdftract"))
.arg("extract")
.arg("--format=json")
.arg("--output=-")
.arg(fixture_path)
.arg("--password-stdin")
.env("RUST_LOG", "pdftract=trace")
.stdin(Stdio::piped())
.stdout(Stdio::piped()) // We discard stdout; we only care about logs
.stderr(Stdio::piped())
.spawn()
.expect("Failed to spawn pdftract extract");
// Write password to stdin
let password = format!("{}\n", PASSWORD_MARKER);
child.stdin.as_mut().expect("Failed to get stdin").write_all(password.as_bytes()).expect("Failed to write password");
let output = child.wait_with_output().expect("Failed to read output");
let stderr = String::from_utf8_lossy(&output.stderr);
// Verify trace logging IS active
let trace_found = TRACE_INDICATORS.iter().any(|&indicator| stderr.contains(indicator));
assert!(
trace_found,
"Trace logging does not appear to be active. \
Expected to find at least one of {:?} in stderr.\n\
stderr:\n{}",
TRACE_INDICATORS,
stderr
);
// Verify password does NOT appear in logs
assert!(
!stderr.contains(PASSWORD_MARKER),
"NEVER-log violation: log output contains password value '{}'.\n\
Log output:\n{}",
PASSWORD_MARKER,
stderr
);
// Verify body text does NOT appear in logs
assert!(
!stderr.contains(BODY_TEXT_MARKER),
"NEVER-log violation: log output contains body text marker '{}'.\n\
Log output:\n{}",
BODY_TEXT_MARKER,
stderr
);
}
/// Test case 2: Run extract with --password-stdin, --debug, and RUST_LOG=trace.
///
/// Same assertions as test case 1, but with --debug flag enabled.
/// This ensures that even with debug mode, secrets are not logged.
#[test]
fn test_log_audit_extract_with_debug_flag() {
let fixture_path = Path::new(FIXTURE_PATH);
if !fixture_path.exists() {
eprintln!("Skipping TH-08 test: fixture not found at {}", fixture_path.display());
return;
}
// Run extraction with RUST_LOG=trace, --password-stdin, and --debug
let mut child = Command::new(env!("CARGO_BIN_EXE_pdftract"))
.arg("extract")
.arg("--format=json")
.arg("--output=-")
.arg(fixture_path)
.arg("--password-stdin")
.arg("--debug")
.env("RUST_LOG", "pdftract=trace")
.stdin(Stdio::piped())
.stdout(Stdio::null())
.stderr(Stdio::piped())
.spawn()
.expect("Failed to spawn pdftract extract");
// Write password to stdin
let password = format!("{}\n", PASSWORD_MARKER);
child.stdin.as_mut().expect("Failed to get stdin").write_all(password.as_bytes()).expect("Failed to write password");
let output = child.wait_with_output().expect("Failed to read output");
let stderr = String::from_utf8_lossy(&output.stderr);
// Verify password does NOT appear in logs
assert!(
!stderr.contains(PASSWORD_MARKER),
"NEVER-log violation (with --debug): log output contains password value '{}'.\n\
Log output:\n{}",
PASSWORD_MARKER,
stderr
);
// Verify body text does NOT appear in logs
assert!(
!stderr.contains(BODY_TEXT_MARKER),
"NEVER-log violation (with --debug): log output contains body text marker '{}'.\n\
Log output:\n{}",
BODY_TEXT_MARKER,
stderr
);
}
/// Test case 3: Run pdftract mcp --stdio with PDFTRACT_MCP_TOKEN.
///
/// Verifies:
/// - Token value "UNIQUE-TOKEN-FOR-TH08-7f9a" does NOT appear in stderr logs
/// - Token value does NOT appear in stdout (JSON-RPC responses)
#[test]
fn test_log_audit_mcp_stdio_token_not_leaked() {
// Use the fixture PDF for the MCP request
let fixture_path = Path::new(FIXTURE_PATH);
if !fixture_path.exists() {
eprintln!("Skipping TH-08 MCP test: fixture not found at {}", fixture_path.display());
return;
}
// Set up MCP server with token
let mut child = Command::new(env!("CARGO_BIN_EXE_pdftract"))
.arg("mcp")
.arg("--stdio")
.env("PDFTRACT_MCP_TOKEN", MCP_TOKEN_MARKER)
.env("RUST_LOG", "pdftract=trace")
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.expect("Failed to spawn pdftract mcp");
// Give the server a moment to start up
std::thread::sleep(Duration::from_millis(100));
// Send a simple initialize request (without auth, stdio mode doesn't require it)
let request = r#"{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"test","version":"1.0"}}}"#;
child.stdin.as_mut().expect("Failed to get stdin").write_all(request.as_bytes()).expect("Failed to write request");
child.stdin.as_mut().expect("Failed to get stdin").write_all(b"\n").expect("Failed to write newline");
// Give the server time to respond
std::thread::sleep(Duration::from_millis(200));
// Terminate the server
child.kill().ok();
let output = child.wait_with_output().unwrap_or_else(|e| {
// If the process already exited, read its output
let output = Command::new("echo").output().unwrap();
std::mem::replace(e.into_inner(), output)
});
let stdout = String::from_utf8_lossy(&output.stdout);
let stderr = String::from_utf8_lossy(&output.stderr);
// Verify token does NOT appear in stderr (logs)
assert!(
!stderr.contains(MCP_TOKEN_MARKER),
"NEVER-log violation (MCP stderr): token value '{}' appears in log output.\n\
stderr:\n{}",
MCP_TOKEN_MARKER,
stderr
);
// Verify token does NOT appear in stdout (JSON-RPC responses)
assert!(
!stdout.contains(MCP_TOKEN_MARKER),
"NEVER-log violation (MCP stdout): token value '{}' appears in JSON-RPC output.\n\
stdout:\n{}",
MCP_TOKEN_MARKER,
stdout
);
}
/// Test case 4: Run pdftract serve --audit-log and verify audit log structure.
///
/// Verifies:
/// - Audit log contains ts (timestamp) field
/// - Audit log contains fingerprint field (not the actual password/token)
/// - Audit log does NOT contain the password value
/// - Audit log does NOT contain extracted text content
#[test]
fn test_log_audit_serve_audit_log_no_secrets() {
let fixture_path = Path::new(FIXTURE_PATH);
if !fixture_path.exists() {
eprintln!("Skipping TH-08 audit log test: fixture not found at {}", fixture_path.display());
return;
}
let temp_dir = tempfile::tempdir().expect("Failed to create temp dir");
let audit_log_path = temp_dir.path().join("audit.ndjson");
// Find an available port
let server_addr = "127.0.0.1:0";
// Start the server with audit logging
let mut child = Command::new(env!("CARGO_BIN_EXE_pdftract"))
.arg("serve")
.arg("--bind")
.arg(server_addr)
.arg("--audit-log")
.arg(&audit_log_path)
.env("RUST_LOG", "pdftract=trace")
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.expect("Failed to spawn pdftract serve");
// Give the server time to start up
std::thread::sleep(Duration::from_millis(500));
// Read the bind address from stderr (the server prints "Listening on ...")
let _ = child.kill();
let output = child.wait_with_output().expect("Failed to read server output");
let stderr = String::from_utf8_lossy(&output.stderr);
// Check if the server started successfully
if !stderr.contains("Listening on") && !stderr.contains("listening on") {
eprintln!("Server may not have started successfully. stderr:\n{}", stderr);
// Still check the audit log if it exists
}
// Check if audit log was created
if !audit_log_path.exists() {
eprintln!("Audit log not created at {}", audit_log_path.display());
return;
}
let audit_content = std::fs::read_to_string(&audit_log_path)
.expect("Failed to read audit log");
// Verify audit log does NOT contain password
assert!(
!audit_content.contains(PASSWORD_MARKER),
"NEVER-log violation (audit log): password value '{}' appears in audit log.\n\
Audit log:\n{}",
PASSWORD_MARKER,
audit_content
);
// Verify audit log does NOT contain body text (extracted content)
assert!(
!audit_content.contains(BODY_TEXT_MARKER),
"NEVER-log violation (audit log): body text '{}' appears in audit log.\n\
Audit log:\n{}",
BODY_TEXT_MARKER,
audit_content
);
// Verify audit log contains expected structural fields (ts, fingerprint, etc.)
// Each line should be valid JSON with at least a "ts" field
for line in audit_content.lines() {
if line.trim().is_empty() {
continue;
}
let json: serde_json::Value = serde_json::from_str(line)
.unwrap_or_else(|e| panic!("Audit log line is not valid JSON: {}\nLine: {}", e, line));
// Verify ts field exists
assert!(
json.get("ts").is_some(),
"Audit log entry missing 'ts' field:\n{}",
line
);
// Verify path is NOT in the audit log (security measure)
if let Some(path) = json.get("path").and_then(|v| v.as_str()) {
assert!(
!path.contains(PASSWORD_MARKER),
"Audit log 'path' field contains password marker: {}",
path
);
}
}
}