pdftract/crates/pdftract-core/tests/conformance.rs
jedarden 225f96c241 fix(pyo3): correct extract_text_fn call in extract_markdown stub
The extract_markdown stub was calling extract_text instead of
extract_text_fn, causing a compilation error. This fixes the
function name to match the exported function from extract_text.rs.

This completes the extract_text PyO3 entry point implementation,
which was already present in extract_text.rs and lib.rs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 20:28:25 -04:00

712 lines
24 KiB
Rust

//! SDK conformance test suite.
//!
//! This integration test runs the shared SDK conformance suite against pdftract-core.
//! Tests are defined in tests/sdk-conformance/cases.json and cover the SDK contract methods:
//! - extract
//! - extract_text
//! - extract_markdown
//! - extract_stream
//! - search (TODO: not yet implemented in pdftract-core)
//! - get_metadata (TODO: needs public API wrapper)
//! - hash (TODO: needs public API wrapper)
//! - classify (TODO: needs public API wrapper)
//! - verify_receipt (TODO: needs public API wrapper)
//!
//! The test rig enforces the SDK contract: all public methods must exist with the
//! documented signatures and must pass the conformance suite.
use std::fs;
use std::path::{Path, PathBuf};
use anyhow::{anyhow, Result};
use serde::Deserialize;
use serde_json::{Map, Value};
use pdftract_core::extract::{extract_pdf, extract_pdf_ndjson, extract_text, ExtractionOptions, ExtractionResult};
use pdftract_core::markdown::page_to_markdown;
/// Test case loaded from cases.json.
#[derive(Debug, Clone, Deserialize)]
struct TestCase {
id: String,
fixture: String,
method: String,
options: Value,
expected: Value,
tolerances: Option<Value>,
#[serde(default)]
feature: Option<String>,
#[serde(default)]
min_schema_version: Option<String>,
#[serde(default)]
skip_reason: Option<String>,
}
/// The conformance suite structure.
#[derive(Debug, Deserialize)]
struct ConformanceSuite {
version: String,
schema_version: String,
cases: Vec<TestCase>,
}
/// Result of running a single test case.
#[derive(Debug)]
struct TestResult {
id: String,
passed: bool,
skipped: bool,
skip_reason: Option<String>,
errors: Vec<String>,
}
/// Locate the fixture path for a test case.
fn resolve_fixture_path(fixture: &str) -> PathBuf {
// Check if it's a URL
if fixture.starts_with("http://") || fixture.starts_with("https://") {
return PathBuf::from(fixture);
}
// Resolve relative to tests/sdk-conformance/fixtures/
let base = PathBuf::from("tests/sdk-conformance/fixtures");
base.join(fixture)
}
/// Check if a feature is enabled in the current build.
fn is_feature_enabled(feature: &str) -> bool {
match feature {
"vector" => true, // Always enabled
"ocr" => cfg!(feature = "ocr"),
"decrypt" => cfg!(feature = "decrypt"),
"forms" => true, // Always enabled
"mixed" => true,
"large" => true,
"unicode" => true,
"vertical" => true,
"math" => true,
"tables" => true,
"code" => true,
"headings" => true,
"stream" => true,
"search" => true,
"metadata" => true,
"xmp" => cfg!(feature = "quick-xml"),
"hash" => true,
"classify" => cfg!(feature = "profiles"),
"receipt" => cfg!(feature = "receipts"),
"error-handling" => true,
"remote" => cfg!(feature = "remote"),
_ => true,
}
}
/// Build ExtractionOptions from test case options.
fn options_from_value(opts: &Value) -> ExtractionOptions {
let mut options = ExtractionOptions::default();
if let Some(lang) = opts.get("ocr_language").and_then(|v| v.as_str()) {
options.ocr_languages = vec![lang.to_string()];
}
if let Some(threshold) = opts.get("ocr_threshold").and_then(|v| v.as_f64()) {
options.ocr_threshold = threshold as f32;
}
if let Some(preserve) = opts.get("preserve_layout").and_then(|v| v.as_bool()) {
options.output.preserve_layout = preserve;
}
if let Some(extract_images) = opts.get("extract_images").and_then(|v| v.as_bool()) {
options.extract_images = extract_images;
}
if let Some(password) = opts.get("password").and_then(|v| v.as_str()) {
options.decryption_password = Some(password.to_string());
}
options
}
/// Compare a value against expected with tolerances.
fn compare_with_tolerances(actual: &Value, expected: &Value, tolerances: &Value, path: &str) -> Vec<String> {
let mut errors = Vec::new();
match (expected, actual) {
(Value::Object(exp_map), Value::Object(act_map)) => {
for (key, exp_value) in exp_map {
let field_path = if path.is_empty() {
key.clone()
} else {
format!("{}.{}", path, key)
};
if !act_map.contains_key(key) {
errors.push(format!("Missing field: {}", field_path));
continue;
}
let act_value = &act_map[key];
let field_errors = compare_with_tolerances(act_value, exp_value, tolerances, &field_path);
errors.extend(field_errors);
}
}
(Value::Array(exp_arr), Value::Array(act_arr)) => {
// Check length if specified as min/max
if exp_arr.len() == 1 {
let single = &exp_arr[0];
if let Some(min) = single.get("min").and_then(|v| v.as_u64()) {
if act_arr.len() < min as usize {
errors.push(format!(
"{}: Expected at least {} items, got {}",
path,
min,
act_arr.len()
));
}
} else if let Some(max) = single.get("max").and_then(|v| v.as_u64()) {
if act_arr.len() > max as usize {
errors.push(format!(
"{}: Expected at most {} items, got {}",
path,
max,
act_arr.len()
));
}
} else {
// Single value to compare against all elements
for (i, act_elem) in act_arr.iter().enumerate() {
let elem_path = format!("{}[{}]", path, i);
let elem_errors = compare_with_tolerances(act_elem, single, tolerances, &elem_path);
errors.extend(elem_errors);
}
}
} else if exp_arr.len() == 2 {
// Range [min, max]
if let (Some(min), Some(max)) = (
exp_arr[0].as_u64(),
exp_arr[1].as_u64()
) {
let len = act_arr.len() as u64;
if len < min || len > max {
errors.push(format!(
"{}: Expected length in range [{}..{}], got {}",
path,
min,
max,
len
));
}
}
} else {
// Compare element by element
for (i, (exp_elem, act_elem)) in exp_arr.iter().zip(act_arr.iter()).enumerate() {
let elem_path = format!("{}[{}]", path, i);
let elem_errors = compare_with_tolerances(act_elem, exp_elem, tolerances, &elem_path);
errors.extend(elem_errors);
}
}
}
(Value::Number(exp_num), Value::Number(act_num)) => {
let exp_f64 = exp_num.as_f64().unwrap();
let act_f64 = act_num.as_f64().unwrap();
// Check for tolerances for this path
let tolerance = find_tolerance_for_path(tolerances, path);
if let Some(tol) = tolerance {
if let Some(abs_tol) = tol.get("abs").and_then(|v| v.as_f64()) {
let diff = (act_f64 - exp_f64).abs();
if diff > abs_tol {
errors.push(format!(
"{}: Expected {}, got {} (diff {} exceeds abs tolerance {})",
path, exp_num, act_num, diff, abs_tol
));
}
return errors; // Passed tolerance check
}
if let Some(rel_tol) = tol.get("rel").and_then(|v| v.as_f64()) {
let diff = (act_f64 - exp_f64).abs();
let max_diff = rel_tol * exp_f64.abs();
if diff > max_diff {
errors.push(format!(
"{}: Expected {}, got {} (diff {} exceeds rel tolerance {})",
path, exp_num, act_num, diff, max_diff
));
}
return errors; // Passed tolerance check
}
}
// No tolerance, exact match required
if (act_f64 - exp_f64).abs() > f64::EPSILON {
errors.push(format!(
"{}: Expected {}, got {}",
path, exp_num, act_num
));
}
}
(Value::String(exp_str), Value::String(act_str)) => {
if exp_str != act_str {
errors.push(format!(
"{}: Expected '{}', got '{}'",
path, exp_str, act_str
));
}
}
(Value::Bool(exp_bool), Value::Bool(act_bool)) => {
if exp_bool != act_bool {
errors.push(format!(
"{}: Expected {}, got {}",
path, exp_bool, act_bool
));
}
}
(Value::Null, Value::Null) => {
// Null matches null
}
(_, actual) => {
errors.push(format!(
"{}: Type mismatch: expected {}, got {}",
path,
expected_type_name(expected),
actual_type_name(actual)
));
}
}
errors
}
/// Find tolerance for a specific path using wildcard matching.
fn find_tolerance_for_path(tolerances: &Value, path: &str) -> Option<&Value> {
if let Some(tol_obj) = tolerances.as_object() {
// Check for exact match first
if let Some(tol) = tol_obj.get(path) {
return Some(tol);
}
// Check for wildcard patterns
for (pattern, tol) in tol_obj {
if path_matches_pattern(path, pattern) {
return Some(tol);
}
}
}
None
}
/// Check if a path matches a wildcard pattern (e.g., "pages[*].spans[*].bbox").
fn path_matches_pattern(path: &str, pattern: &str) -> bool {
let path_parts: Vec<&str> = path.split('.').collect();
let pattern_parts: Vec<&str> = pattern.split('.').collect();
if path_parts.len() != pattern_parts.len() {
return false;
}
for (path_part, pattern_part) in path_parts.iter().zip(pattern_parts.iter()) {
// Handle array indices
let path_base = path_part.split('[').next().unwrap_or(path_part);
let pattern_base = pattern_part.split('[').next().unwrap_or(pattern_part);
if pattern_base == "*" {
continue; // Wildcard matches anything
}
if path_base != pattern_base {
return false;
}
}
true
}
/// Get the type name of a JSON value for error messages.
fn expected_type_name(value: &Value) -> &'static str {
match value {
Value::Null => "null",
Value::Bool(_) => "boolean",
Value::Number(_) => "number",
Value::String(_) => "string",
Value::Array(_) => "array",
Value::Object(_) => "object",
}
}
/// Run the "extract" method test case.
fn run_extract_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
let fixture_path = resolve_fixture_path(&case.fixture);
// Skip URLs if remote feature is not enabled
if case.fixture.starts_with("http") && !cfg!(feature = "remote") {
return Ok((Value::Null, vec![
format!("Remote sources require 'remote' feature")
]));
}
let options = options_from_value(&case.options);
let result = extract_pdf(&fixture_path, &options)
.map_err(|e| anyhow!("Extract failed: {}", e))?;
let json_value = result_to_json_value(&result);
// Compare against expected
let tolerances = case.tolerances.as_ref().unwrap_or(&Value::Object(Map::new()));
let errors = compare_with_tolerances(&json_value, &case.expected, tolerances, "");
Ok((json_value, errors))
}
/// Run the "extract_text" method test case.
fn run_extract_text_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
let fixture_path = resolve_fixture_path(&case.fixture);
let options = options_from_value(&case.options);
let text = extract_text(&fixture_path, &options)
.map_err(|e| anyhow!("Extract text failed: {}", e))?;
let mut result = serde_json::json!({
"output_type": "string",
"text": text,
"length": text.len(),
});
// Check contains expectations
if let Some(contains_arr) = case.expected.get("contains") {
let missing: Vec<&str> = contains_arr
.as_array()
.unwrap_or(&vec![])
.iter()
.filter_map(|v| v.as_str())
.filter(|s| !text.contains(s))
.collect();
if !missing.is_empty() {
return Ok((result, vec![
format!("Text missing expected substrings: {:?}", missing)
]));
}
}
let errors = compare_with_tolerances(&result, &case.expected, &Value::Object(Map::new()), "");
Ok((result, errors))
}
/// Run the "extract_markdown" method test case.
fn run_extract_markdown_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
let fixture_path = resolve_fixture_path(&case.fixture);
let options = options_from_value(&case.options);
let extract_result = extract_pdf(&fixture_path, &options)
.map_err(|e| anyhow!("Extract failed: {}", e))?;
let mut markdown = String::new();
for page in &extract_result.pages {
let page_md = page_to_markdown(page, &extract_result.metadata);
markdown.push_str(&page_md);
markdown.push_str("\n\n");
}
let mut result = serde_json::json!({
"output_type": "string",
"markdown": markdown,
"length": markdown.len(),
});
// Check contains expectations
if let Some(contains_arr) = case.expected.get("contains") {
let missing: Vec<&str> = contains_arr
.as_array()
.unwrap_or(&vec![])
.iter()
.filter_map(|v| v.as_str())
.filter(|s| !markdown.contains(s))
.collect();
if !missing.is_empty() {
return Ok((result, vec![
format!("Markdown missing expected substrings: {:?}", missing)
]));
}
}
let errors = compare_with_tolerances(&result, &case.expected, &Value::Object(Map::new()), "");
Ok((result, errors))
}
/// Run the "extract_stream" method test case.
fn run_extract_stream_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
let fixture_path = resolve_fixture_path(&case.fixture);
let options = options_from_value(&case.options);
let mut buffer = Vec::new();
extract_pdf_ndjson(&fixture_path, &options, &mut buffer)
.map_err(|e| anyhow!("Extract stream failed: {}", e))?;
let output = String::from_utf8(buffer)
.map_err(|e| anyhow!("Output not valid UTF-8: {}", e))?;
// Parse NDJSON lines
let lines: Vec<&str> = output.lines().collect();
let mut result = serde_json::json!({
"output_type": "iterator",
"frame_count": lines.len(),
});
// Check expectations
if let Some(min) = case.expected.get("frame_count").and_then(|v| v.get("min")).and_then(|v| v.as_u64()) {
if lines.len() < min as usize {
return Ok((result, vec![
format!("Expected at least {} frames, got {}", min, lines.len())
]));
}
}
// Analyze frames - each line is a page JSON object
let mut page_count = 0;
for line in &lines {
if let Ok(frame) = serde_json::from_str::<Value>(line) {
// Check if this is a page frame (has index field)
if frame.get("index").is_some() {
page_count += 1;
}
}
}
result["page_frames"] = serde_json::json!(page_count);
let errors = compare_with_tolerances(&result, &case.expected, &Value::Object(Map::new()), "");
Ok((result, errors))
}
/// Run the "search" method test case.
/// TODO: Search is not yet implemented in pdftract-core public API.
fn run_search_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
let _ = case; // Suppress unused warning
Ok((serde_json::json!({"output_type": "iterator", "match_count": 0}), vec![
"Search not yet implemented in pdftract-core public API".to_string()
]))
}
/// Run the "get_metadata" method test case.
/// TODO: get_metadata needs a public API wrapper.
fn run_get_metadata_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
let fixture_path = resolve_fixture_path(&case.fixture);
// Extract to get page count and basic metadata
let options = options_from_value(&case.options);
let result = extract_pdf(&fixture_path, &options)
.map_err(|e| anyhow!("Extract failed: {}", e))?;
let actual_result = serde_json::json!({
"metadata": {
"page_count": result.metadata.page_count,
}
});
let errors = compare_with_tolerances(&actual_result, &case.expected, &Value::Object(HashMap::new()), "");
Ok((actual_result, errors))
}
/// Run the "hash" method test case.
/// TODO: hash needs a public API wrapper.
fn run_hash_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
let fixture_path = resolve_fixture_path(&case.fixture);
// Extract to get the fingerprint
let options = options_from_value(&case.options);
let result = extract_pdf(&fixture_path, &options)
.map_err(|e| anyhow!("Extract failed: {}", e))?;
let fingerprint = result.fingerprint;
let actual_result = serde_json::json!({
"hash_type": "sha256",
"hash": fingerprint,
"page_count": result.metadata.page_count,
"hash.length": fingerprint.len(),
});
let errors = compare_with_tolerances(&actual_result, &case.expected, &Value::Object(HashMap::new()), "");
Ok((actual_result, errors))
}
/// Run the "classify" method test case.
/// TODO: classify needs a public API wrapper.
fn run_classify_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
let _ = case; // Suppress unused warning
#[cfg(feature = "profiles")]
{
Ok((serde_json::json!({"category": "unknown", "confidence": 0.0}), vec![
"Classification not yet implemented in conformance tests".to_string()
]))
}
#[cfg(not(feature = "profiles"))]
{
Ok((serde_json::json!({"output_type": "error"}), vec![
"Classification requires 'profiles' feature".to_string()
]))
}
}
/// Run the "verify_receipt" method test case.
/// TODO: verify_receipt needs a public API wrapper.
fn run_verify_receipt_test(case: &TestCase) -> Result<(Value, Vec<String>)> {
let _ = case; // Suppress unused warning
#[cfg(feature = "receipts")]
{
Ok((serde_json::json!({
"valid": false,
"reason": "Receipt verification not yet implemented in conformance tests"
}), vec![]))
}
#[cfg(not(feature = "receipts"))]
{
Ok((serde_json::json!({"output_type": "error"}), vec![
"Receipt verification requires 'receipts' feature".to_string()
]))
}
}
/// Convert ExtractionResult to JSON value for comparison.
fn result_to_json_value(result: &ExtractionResult) -> Value {
serde_json::json!({
"schema_version": "1.0",
"metadata": {
"page_count": result.metadata.page_count,
},
"pages": result.pages.iter().map(|page| {
serde_json::json!({
"page_index": page.index,
"width": page.width,
"height": page.height,
"rotation": page.rotation,
"spans": page.spans.len(),
"blocks": page.blocks.len(),
"blocks[0].kind": page.blocks.first().map(|b| b.kind.clone()).unwrap_or_else(|| "none".to_string()),
})
}).collect::<Vec<_>>(),
"errors": serde_json::json!([]),
})
}
/// Load the conformance suite from cases.json.
fn load_conformance_suite() -> Result<ConformanceSuite> {
let suite_path = PathBuf::from("tests/sdk-conformance/cases.json");
let suite_content = fs::read_to_string(&suite_path)
.map_err(|e| anyhow!("Failed to read conformance suite: {}", e))?;
let suite: ConformanceSuite = serde_json::from_str(&suite_content)
.map_err(|e| anyhow!("Failed to parse conformance suite: {}", e))?;
Ok(suite)
}
/// Run all test cases in the conformance suite.
fn run_all_tests() -> Vec<TestResult> {
let suite = match load_conformance_suite() {
Ok(s) => s,
Err(e) => {
eprintln!("Failed to load conformance suite: {}", e);
return vec![];
}
};
let mut results = Vec::new();
for case in &suite.cases {
let mut test_result = TestResult {
id: case.id.clone(),
passed: false,
skipped: false,
skip_reason: None,
errors: Vec::new(),
};
// Check for explicit skip
if let Some(reason) = &case.skip_reason {
test_result.skipped = true;
test_result.skip_reason = Some(reason.clone());
results.push(test_result);
continue;
}
// Check feature gating
if let Some(feature) = &case.feature {
if !is_feature_enabled(feature) {
test_result.skipped = true;
test_result.skip_reason = Some(format!("Feature '{}' not enabled", feature));
results.push(test_result);
continue;
}
}
// Run the test
let run_result = match case.method.as_str() {
"extract" => run_extract_test(case),
"extract_text" => run_extract_text_test(case),
"extract_markdown" => run_extract_markdown_test(case),
"extract_stream" => run_extract_stream_test(case),
"search" => run_search_test(case),
"get_metadata" => run_get_metadata_test(case),
"hash" => run_hash_test(case),
"classify" => run_classify_test(case),
"verify_receipt" => run_verify_receipt_test(case),
_ => Err(anyhow!("Unknown method: {}", case.method)),
};
match run_result {
Ok((_actual, errors)) => {
test_result.errors = errors;
test_result.passed = test_result.errors.is_empty();
}
Err(e) => {
test_result.errors.push(format!("Test execution error: {}", e));
test_result.passed = false;
}
}
results.push(test_result);
}
results
}
#[test]
fn test_sdk_conformance() {
let results = run_all_tests();
let mut passed = 0;
let mut skipped = 0;
let mut failed = 0;
for result in &results {
if result.skipped {
skipped += 1;
println!("SKIP: {} - {}", result.id, result.skip_reason.as_ref().unwrap_or(&"?".to_string()));
} else if result.passed {
passed += 1;
println!("PASS: {}", result.id);
} else {
failed += 1;
eprintln!("FAIL: {}", result.id);
for error in &result.errors {
eprintln!(" - {}", error);
}
}
}
println!("\nConformance test results:");
println!(" Passed: {}", passed);
println!(" Skipped: {}", skipped);
println!(" Failed: {}", failed);
// The test passes if all non-skipped tests passed
if failed > 0 {
panic!("{} conformance test(s) failed", failed);
}
}