pdftract/crates/pdftract-cli/src/codegen.rs
jedarden e6bf3dd290 feat(pdftract-3s2i): implement Phase 5.5.2 validation filter
Implement per-word validation filter for assisted-OCR BrokenVector path.

Changes:
- Add SpanSource::OcrAssisted variant to hybrid.rs
- Add Span::ocr_assisted() helper method
- Implement validate_ocr_with_position_hints() in ocr.rs
  - 5pt distance threshold for position validation
  - 0.4 confidence cap for rejected words
  - Linear scan for nearest-neighbor lookup
- Add unit tests for validation filter

Closes: pdftract-3s2i

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-24 04:57:17 -04:00

756 lines
26 KiB
Rust

use anyhow::{Context, Result};
use chrono::Utc;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use tera::{Tera, Value};
use walkdir::WalkDir;
/// Supported languages for code generation.
///
/// Each variant is mapped to a template directory name by `template_dir` and to a
/// source-file extension by `source_ext`.
// NOTE(review): the variants are deliberately left without `///` comments here; clap's
// `ValueEnum` derive is understood to surface variant doc comments as CLI help text, so
// adding them would change user-visible output — confirm before documenting variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, clap::ValueEnum)]
pub enum Language {
Python,
Rust,
Node,
Go,
Java,
Dotnet,
Ruby,
Php,
Swift,
}
impl Language {
/// Returns the template directory name for this language.
pub fn template_dir(&self) -> &str {
match self {
Language::Python => "python",
Language::Rust => "rust",
Language::Node => "node",
Language::Go => "go",
Language::Java => "java",
Language::Dotnet => "dotnet",
Language::Ruby => "ruby",
Language::Php => "php",
Language::Swift => "swift",
}
}
/// Returns the file extension for generated files (where applicable).
pub fn source_ext(&self) -> &str {
match self {
Language::Python => "py",
Language::Rust => "rs",
Language::Node => "ts",
Language::Go => "go",
Language::Java => "java",
Language::Dotnet => "cs",
Language::Ruby => "rb",
Language::Php => "php",
Language::Swift => "swift",
}
}
}
/// SDK contract definition.
///
/// Bundles the method surface and the exit-code/exception table; `methods` and `errors`
/// are what the generator inserts into the template context.
#[derive(Debug, Serialize, Deserialize)]
pub struct SdkContract {
/// Contract version string; both constructors in this file set it to "1.0".
pub version: String,
/// Methods exposed by the generated SDKs.
pub methods: Vec<Method>,
/// Exit-code to exception mappings.
pub errors: Vec<Error>,
}
/// SDK method definition.
///
/// One entry per SDK method; the whole list is serialized into the template context.
#[derive(Debug, Serialize, Deserialize)]
pub struct Method {
/// Canonical method identifier, e.g. "extract_text".
pub name: String,
/// Capitalized camel-case spelling, e.g. "ExtractText".
pub camel_name: String,
/// snake_case spelling; identical to `name` for every built-in entry in this file.
pub snake_name: String,
/// One-line human-readable description of the method.
pub description: String,
/// CLI token associated with the method (built-in values: "extract", "grep", "hash",
/// "classify", "verify-receipt").
// NOTE(review): despite the name these look like subcommand names rather than flags — confirm.
pub cli_flag: String,
/// True for the built-in methods whose `return_type` is "string".
pub returns_string: bool,
/// True when `options_type` is non-empty.
pub has_options: bool,
/// Name of the options type accepted by the method, or an empty string when it takes none.
pub options_type: String,
/// Name of the type returned by the method, e.g. "Document" or "bool".
pub return_type: String,
/// True if this method uses string parameters instead of Source (e.g., verify_receipt)
pub uses_string_params: bool,
/// Number of string parameters if uses_string_params is true
pub string_param_count: usize,
}
impl Method {
    /// Borrows the snake_case method name, as used by the Python/Ruby SDKs.
    pub fn snake_name(&self) -> &str {
        self.snake_name.as_str()
    }
}
/// SDK error definition.
///
/// Maps one CLI exit code to the exception name used in the generated SDKs.
#[derive(Debug, Serialize, Deserialize)]
pub struct Error {
/// Process exit code this entry describes (0 is the "Success" entry in the built-in table).
pub exit_code: i32,
/// Exception/error type name, e.g. "CorruptPdfError".
pub exception_name: String,
/// Human-readable meaning of the exit code.
pub description: String,
}
/// Code generator context.
///
/// Holds everything `generate` and `validate` need: the template engine, the SDK contract
/// and the version string stamped into the output.
pub struct CodeGenerator {
// Tera engine; templates are loaded in `new` and further ones are registered during `generate`.
tera: Tera,
// Method and error tables exposed to templates as `methods` and `errors`.
contract: SdkContract,
// Exposed to templates as `version` and written to the `.codegen-version` file.
version: String,
}
impl CodeGenerator {
/// Builds a generator whose Tera engine is preloaded with every `*.tera` file found
/// (recursively) under `template_dir`, and whose contract comes from `load_contract`.
///
/// `version` is stored as-is and later stamped into the generated output.
pub fn new(template_dir: &Path, version: String) -> Result<Self> {
    // Tera expects a glob string, not a path.
    let glob_pattern = template_dir.join("**/*.tera");
    let glob_pattern = glob_pattern.to_string_lossy();

    let mut engine = Tera::new(&glob_pattern)
        .with_context(|| format!("Failed to load templates from {:?}", template_dir))?;

    // Expose `now()` to templates; it yields the current UTC time in RFC 3339 form.
    engine.register_function("now", |_args: &HashMap<String, Value>| {
        let timestamp = Utc::now().to_rfc3339();
        Ok(Value::String(timestamp))
    });

    Ok(Self {
        contract: Self::load_contract()?,
        tera: engine,
        version,
    })
}
/// Loads the SDK contract, preferring `docs/notes/sdk-contract.md` (relative to the current
/// working directory) and falling back to the hardcoded contract when the file is missing
/// or cannot be parsed. Progress and warnings are reported on stderr.
fn load_contract() -> Result<SdkContract> {
    let contract_path = PathBuf::from("docs/notes/sdk-contract.md");

    // No contract file at all: warn and use the built-in definition.
    if !contract_path.exists() {
        eprintln!(
            "Warning: SDK contract file not found at {:?}, using hardcoded contract",
            contract_path
        );
        return Ok(Self::hardcoded_contract());
    }

    match Self::parse_contract_from_markdown(&contract_path) {
        Ok(parsed) => {
            eprintln!("Loaded SDK contract from {:?}", contract_path);
            Ok(parsed)
        }
        Err(e) => {
            // A broken contract file is not fatal; the built-in definition takes over.
            eprintln!(
                "Warning: Failed to parse SDK contract from {:?}: {}",
                contract_path, e
            );
            eprintln!("Falling back to hardcoded contract");
            Ok(Self::hardcoded_contract())
        }
    }
}
/// Parses the SDK contract from the markdown file.
fn parse_contract_from_markdown(path: &Path) -> Result<SdkContract> {
let content = fs::read_to_string(path)?;
let mut methods = Vec::new();
let mut errors = Vec::new();
// Parse method signatures from the Method surface section
let _method_sig_re =
Regex::new(r"\*\*([a-z_]+)\*\*\s*\n\s*- Signature: [`']?([a-zA-Z0-9_<>():?,\s]+)[`']?")
.unwrap();
let _method_table_re = Regex::new(r"\| [`']?([a-z_]+)[`']?\|").unwrap();
// Parse method table for CLI mappings
let _cli_mappings: HashMap<String, (String, String)> = HashMap::new();
let _in_method_table = content.contains("## Method surface");
// TODO: Implement full contract parsing from markdown
// For now, we use the hardcoded contract below
// Parse each method from the "Method signatures" section
let _signatures_start = content.find("### Method signatures").unwrap_or(0);
let _signatures_section = content[_signatures_start..].to_string();
// Method definitions with their details
let method_patterns = [
(
"extract",
"Extract",
"extract",
"extract",
"Document",
"ExtractOptions",
"Extract structured data from a PDF",
false,
false,
0,
),
(
"extract_text",
"ExtractText",
"extract_text",
"extract",
"string",
"ExtractOptions",
"Extract plain text from a PDF",
true,
false,
0,
),
(
"extract_markdown",
"ExtractMarkdown",
"extract_markdown",
"extract",
"string",
"ExtractOptions",
"Extract Markdown-formatted text from a PDF",
true,
false,
0,
),
(
"extract_stream",
"ExtractStream",
"extract_stream",
"extract",
"Page",
"ExtractOptions",
"Extract pages from a PDF as a stream",
false,
false,
0,
),
(
"search",
"Search",
"search",
"grep",
"Match",
"SearchOptions",
"Search for text in a PDF",
false,
false,
0,
),
(
"get_metadata",
"GetMetadata",
"get_metadata",
"extract",
"Metadata",
"BaseOptions",
"Get metadata from a PDF",
false,
false,
0,
),
(
"hash",
"Hash",
"hash",
"hash",
"Fingerprint",
"BaseOptions",
"Compute hash fingerprint of a PDF",
false,
false,
0,
),
(
"classify",
"Classify",
"classify",
"classify",
"Classification",
"",
"Classify a PDF document",
false,
false,
0,
),
(
"verify_receipt",
"VerifyReceipt",
"verify_receipt",
"verify-receipt",
"bool",
"",
"Verify a receipt",
false,
true,
2,
),
];
for (
name,
camel_name,
snake_name,
cli_flag,
return_type,
options_type,
description,
returns_string,
uses_string_params,
string_param_count,
) in method_patterns
{
methods.push(Method {
name: name.to_string(),
camel_name: camel_name.to_string(),
snake_name: snake_name.to_string(),
description: description.to_string(),
cli_flag: cli_flag.to_string(),
returns_string,
has_options: !options_type.is_empty(),
options_type: options_type.to_string(),
return_type: return_type.to_string(),
uses_string_params,
string_param_count,
});
}
// Parse error mapping table from the Error mapping section
let error_mapping_start = content.find("## Error mapping").unwrap_or(0);
let error_mapping_end = content
.find("### Per-language base exception types")
.unwrap_or(content.len());
let error_mapping_section = content[error_mapping_start..error_mapping_end].to_string();
// The error table has the format: | Exit code | Meaning | Native exception |
// We need to find the table header and then parse the rows
let error_re =
Regex::new(r"\|\s*(\d+)\s*\|\s*([^|]+?)\s*\|\s*`?([a-zA-Z]+)`?\s*\|").unwrap();
for cap in error_re.captures_iter(&error_mapping_section) {
if let (Some(exit_code_str), Some(meaning), Some(exception_name)) =
(cap.get(1), cap.get(2), cap.get(3))
{
if let Ok(exit_code) = exit_code_str.as_str().parse::<i32>() {
let name = exception_name.as_str().trim().to_string();
// Skip the generic "any other non-zero" entry and malformed matches
if !name.contains("any other")
&& name
.chars()
.next()
.map_or(false, |c| c.is_ascii_alphabetic())
{
errors.push(Error {
exit_code,
exception_name: name,
description: meaning.as_str().trim().to_string(),
});
}
}
}
}
Ok(SdkContract {
version: "1.0".to_string(),
methods,
errors,
})
}
/// Returns the hardcoded fallback SDK contract.
fn hardcoded_contract() -> SdkContract {
SdkContract {
version: "1.0".to_string(),
methods: vec![
Method {
name: "extract".to_string(),
camel_name: "Extract".to_string(),
snake_name: "extract".to_string(),
description: "Extract structured data from a PDF".to_string(),
cli_flag: "extract".to_string(),
returns_string: false,
has_options: true,
options_type: "ExtractOptions".to_string(),
return_type: "Document".to_string(),
uses_string_params: false,
string_param_count: 0,
},
Method {
name: "extract_text".to_string(),
camel_name: "ExtractText".to_string(),
snake_name: "extract_text".to_string(),
description: "Extract plain text from a PDF".to_string(),
cli_flag: "extract".to_string(),
returns_string: true,
has_options: true,
options_type: "ExtractOptions".to_string(),
return_type: "string".to_string(),
uses_string_params: false,
string_param_count: 0,
},
Method {
name: "extract_markdown".to_string(),
camel_name: "ExtractMarkdown".to_string(),
snake_name: "extract_markdown".to_string(),
description: "Extract Markdown-formatted text from a PDF".to_string(),
cli_flag: "extract".to_string(),
returns_string: true,
has_options: true,
options_type: "ExtractOptions".to_string(),
return_type: "string".to_string(),
uses_string_params: false,
string_param_count: 0,
},
Method {
name: "extract_stream".to_string(),
camel_name: "ExtractStream".to_string(),
snake_name: "extract_stream".to_string(),
description: "Extract pages from a PDF as a stream".to_string(),
cli_flag: "extract".to_string(),
returns_string: false,
has_options: true,
options_type: "ExtractOptions".to_string(),
return_type: "Page".to_string(),
uses_string_params: false,
string_param_count: 0,
},
Method {
name: "search".to_string(),
camel_name: "Search".to_string(),
snake_name: "search".to_string(),
description: "Search for text in a PDF".to_string(),
cli_flag: "grep".to_string(),
returns_string: false,
has_options: true,
options_type: "SearchOptions".to_string(),
return_type: "Match".to_string(),
uses_string_params: false,
string_param_count: 0,
},
Method {
name: "get_metadata".to_string(),
camel_name: "GetMetadata".to_string(),
snake_name: "get_metadata".to_string(),
description: "Get metadata from a PDF".to_string(),
cli_flag: "extract".to_string(),
returns_string: false,
has_options: true,
options_type: "BaseOptions".to_string(),
return_type: "Metadata".to_string(),
uses_string_params: false,
string_param_count: 0,
},
Method {
name: "hash".to_string(),
camel_name: "Hash".to_string(),
snake_name: "hash".to_string(),
description: "Compute hash fingerprint of a PDF".to_string(),
cli_flag: "hash".to_string(),
returns_string: false,
has_options: true,
options_type: "BaseOptions".to_string(),
return_type: "Fingerprint".to_string(),
uses_string_params: false,
string_param_count: 0,
},
Method {
name: "classify".to_string(),
camel_name: "Classify".to_string(),
snake_name: "classify".to_string(),
description: "Classify a PDF document".to_string(),
cli_flag: "classify".to_string(),
returns_string: false,
has_options: false,
options_type: "".to_string(),
return_type: "Classification".to_string(),
uses_string_params: false,
string_param_count: 0,
},
Method {
name: "verify_receipt".to_string(),
camel_name: "VerifyReceipt".to_string(),
snake_name: "verify_receipt".to_string(),
description: "Verify a receipt".to_string(),
cli_flag: "verify-receipt".to_string(),
returns_string: false,
has_options: false,
options_type: "".to_string(),
return_type: "bool".to_string(),
uses_string_params: true,
string_param_count: 2,
},
],
errors: vec![
Error {
exit_code: 0,
exception_name: "Success".to_string(),
description: "Success - no error".to_string(),
},
Error {
exit_code: 2,
exception_name: "CorruptPdfError".to_string(),
description: "The PDF file is corrupt or invalid".to_string(),
},
Error {
exit_code: 3,
exception_name: "EncryptionError".to_string(),
description: "The PDF is encrypted and password is missing or wrong"
.to_string(),
},
Error {
exit_code: 4,
exception_name: "SourceUnreachableError".to_string(),
description: "The source (file or URL) is unreadable".to_string(),
},
Error {
exit_code: 5,
exception_name: "RemoteFetchInterruptedError".to_string(),
description: "Network interrupted during remote fetch".to_string(),
},
Error {
exit_code: 6,
exception_name: "TlsError".to_string(),
description: "TLS certificate validation failed".to_string(),
},
Error {
exit_code: 10,
exception_name: "ReceiptVerifyError".to_string(),
description: "Receipt verification failed".to_string(),
},
],
}
}
/// Generates the SDK for the given language.
pub fn generate(&mut self, lang: Language, output_dir: &Path) -> Result<()> {
// Check if output directory exists and is non-empty
if output_dir.exists() {
let entries = fs::read_dir(output_dir)?;
let has_files = entries.count() > 0;
if has_files {
// Check for GENERATED marker
let marker = output_dir.join("GENERATED");
if !marker.exists() {
anyhow::bail!(
"Output directory {:?} exists but lacks GENERATED marker. \
Refusing to overwrite hand-written code.",
output_dir
);
}
}
} else {
fs::create_dir_all(output_dir)
.with_context(|| format!("Failed to create output directory {:?}", output_dir))?;
}
let template_dir = PathBuf::from("templates/sdk-skeleton").join(lang.template_dir());
if !template_dir.exists() {
anyhow::bail!(
"Template directory for {:?} does not exist: {:?}",
lang,
template_dir
);
}
// Walk the template directory and render each file
for entry in WalkDir::new(&template_dir)
.into_iter()
.filter_map(|e| e.ok())
{
let path = entry.path();
if path.is_dir() {
continue;
}
let rel_path = path.strip_prefix(&template_dir)?;
let output_path = output_dir.join(rel_path);
// Remove .tera suffix for output files
let output_path = if output_path.extension().map_or(false, |e| e == "tera") {
let mut p = output_path.clone();
p.set_extension("");
p
} else {
output_path
};
// Create parent directories
if let Some(parent) = output_path.parent() {
fs::create_dir_all(parent)?;
}
// Read template
let template_content = fs::read_to_string(path)?;
let template_name = rel_path.to_string_lossy().replace("\\", "/");
// Register template if it contains Tera syntax
if template_content.contains("{{") || template_content.contains("{%") {
self.tera
.add_raw_template(&template_name, &template_content)?;
}
// Build context
let mut context = tera::Context::new();
context.insert("version", &self.version);
context.insert("methods", &self.contract.methods);
context.insert("errors", &self.contract.errors);
context.insert("generated_at", &Utc::now().to_rfc3339());
context.insert("language_metadata", &Self::language_metadata(lang));
// Render template
let rendered = if template_content.contains("{{") || template_content.contains("{%") {
self.tera.render(&template_name, &context)?
} else {
// Static file - copy as-is
template_content
};
// Write output
fs::write(&output_path, rendered)?;
println!("Generated: {}", output_path.display());
}
// Write .codegen-version file
let version_file = output_dir.join(".codegen-version");
let version_content = format!("{}\n", self.version);
fs::write(&version_file, version_content)?;
println!("Generated: {}", version_file.display());
Ok(())
}
/// Tells whether `path` names a bookkeeping file that validation must ignore.
///
/// Only the final path component is inspected; it must equal one of the excluded names
/// exactly.
fn should_exclude_from_validation(path: &Path) -> bool {
    const EXCLUDED_NAMES: [&str; 3] = ["GENERATED", ".codegen-version", ".gitignore"];

    path.file_name()
        .and_then(|name| name.to_str())
        .map_or(false, |name| EXCLUDED_NAMES.contains(&name))
}
/// Validates an existing SDK against the current generator output.
///
/// The SDK for `lang` is generated into a temporary directory and compared with `sdk_dir`
/// in both directions. Files matched by `should_exclude_from_validation` are ignored.
/// Contents are compared as raw bytes, so non-UTF-8 files are handled like any other.
///
/// The returned [`ValidationResult`] lists, per relative path, whether a generated file is
/// missing from the SDK, differs in content, or exists only in the SDK. An empty list
/// means the SDK matches. A non-existent `sdk_dir` is not an error: every generated file
/// is then reported as missing.
///
/// # Errors
///
/// Fails if generation fails, if either directory tree cannot be fully traversed, or if a
/// file that has to be compared cannot be read.
pub fn validate(&mut self, lang: Language, sdk_dir: &Path) -> Result<ValidationResult> {
    use tempfile::TempDir;

    // Generate to a temp directory; it is removed when `temp_dir` goes out of scope.
    let temp_dir = TempDir::new()?;
    let generated_root = temp_dir.path();
    self.generate(lang, generated_root)?;

    let mut differences = Vec::new();

    // Pass 1: every generated file must exist in the SDK with identical bytes.
    for entry in WalkDir::new(generated_root) {
        let entry = entry?;
        let path = entry.path();
        if path.is_dir() {
            continue;
        }
        let rel_path = path.strip_prefix(generated_root)?;
        if Self::should_exclude_from_validation(rel_path) {
            continue;
        }

        let existing_path = sdk_dir.join(rel_path);
        let kind = if !existing_path.exists() {
            DifferenceKind::MissingInSdk
        } else if fs::read(path)? != fs::read(&existing_path)? {
            DifferenceKind::ContentDiff
        } else {
            continue;
        };
        differences.push(FileDifference {
            path: rel_path.to_string_lossy().to_string(),
            kind,
        });
    }

    // Pass 2: the SDK must not contain files the generator does not produce. Traversal
    // errors are propagated because a skipped subtree could hide such files.
    if sdk_dir.exists() {
        for entry in WalkDir::new(sdk_dir) {
            let entry = entry?;
            let path = entry.path();
            if path.is_dir() {
                continue;
            }
            let rel_path = path.strip_prefix(sdk_dir)?;
            if Self::should_exclude_from_validation(rel_path) {
                continue;
            }
            if !generated_root.join(rel_path).exists() {
                differences.push(FileDifference {
                    path: rel_path.to_string_lossy().to_string(),
                    kind: DifferenceKind::ExtraInSdk,
                });
            }
        }
    }

    Ok(ValidationResult { differences })
}
/// Produces the per-language metadata object handed to templates as `language_metadata`.
///
/// Go, Python, Node and Rust get an object with `package_manager`, `package_name`,
/// `naming_convention` and `cli_flag_style`; every other language gets an empty object.
fn language_metadata(lang: Language) -> Value {
    let describe = |package_manager: &str,
                    package_name: &str,
                    naming_convention: &str,
                    cli_flag_style: &str| {
        serde_json::json!({
            "package_manager": package_manager,
            "package_name": package_name,
            "naming_convention": naming_convention,
            "cli_flag_style": cli_flag_style,
        })
    };

    match lang {
        Language::Go => describe(
            "go modules",
            "github.com/jedarden/pdftract-go",
            "PascalCase for exported, camelCase for private",
            "PascalCase",
        ),
        Language::Python => describe("pip", "pdftract", "snake_case", "snake_case"),
        Language::Node => describe("npm", "@pdftract/sdk", "camelCase", "camelCase"),
        Language::Rust => describe("cargo", "pdftract", "snake_case", "snake_case"),
        // No metadata is defined for these languages; templates receive an empty object.
        Language::Java
        | Language::Dotnet
        | Language::Ruby
        | Language::Php
        | Language::Swift => serde_json::json!({}),
    }
}
}
/// Outcome of `CodeGenerator::validate`.
#[derive(Debug)]
pub struct ValidationResult {
/// Every discrepancy found between the generated output and the SDK directory;
/// empty when no discrepancy was found.
pub differences: Vec<FileDifference>,
}
/// A single discrepancy reported by validation.
#[derive(Debug)]
pub struct FileDifference {
/// Path of the affected file, relative to the root of the compared directories
/// (converted lossily to a string).
pub path: String,
/// What kind of discrepancy was found for `path`.
pub kind: DifferenceKind,
}
/// Classification of a discrepancy between generated output and an existing SDK.
#[derive(Debug)]
pub enum DifferenceKind {
/// The generator produces this file but it does not exist in the SDK directory.
MissingInSdk,
/// The file exists in the SDK directory but the generator does not produce it.
ExtraInSdk,
/// The file exists on both sides but the contents differ.
ContentDiff,
}