//! C FFI API for pdftract. //! //! This module provides the extern "C" API surface for C/C++ integrations. //! All functions return owned JSON strings that must be freed with pdftract_free(). //! Panics are caught at the FFI boundary and converted to JSON errors. //! //! # Memory management //! //! - All functions except pdftract_version() return owned strings //! - The caller MUST free these strings with pdftract_free() //! - Do not call libc free() on these pointers (Rust allocator mismatch) //! //! # Error handling //! //! All errors are returned as JSON objects with the shape: //! ```json //! {"error":"CODE","message":"..."} //! ``` use libc::{c_char, c_void}; use pdftract_core::document::{compute_pdf_fingerprint, parse_pdf_file, PdfExtractor}; use pdftract_core::extract::{extract_pdf, result_to_json}; use pdftract_core::options::ExtractionOptions; use pdftract_core::receipts::{ verifier::{exit_code, verify_receipt, SpanData, VerificationResult}, Receipt, }; use std::default::Default; use std::ffi::{CStr, CString}; use std::panic::catch_unwind; use std::path::Path; use std::sync::Mutex; /// Error codes returned in JSON error responses. mod error_codes { pub const NULL_POINTER: &str = "NULL_POINTER"; pub const INVALID_UTF8: &str = "INVALID_UTF8"; pub const INVALID_JSON: &str = "INVALID_JSON"; pub const EXTRACTION_ERROR: &str = "EXTRACTION_ERROR"; pub const FILE_NOT_FOUND: &str = "FILE_NOT_FOUND"; pub const PARSE_ERROR: &str = "PARSE_ERROR"; pub const PANIC: &str = "PANIC"; pub const NOT_IMPLEMENTED: &str = "NOT_IMPLEMENTED"; pub const INVALID_HANDLE: &str = "INVALID_HANDLE"; } /// Convert an error to a JSON error string. fn json_error(code: &str, message: &str) -> String { format!( r#"{{"error":"{}","message":"{}"}}"#, code, escape_json(message) ) } /// Escape a string for JSON (minimal escaping). 
///
/// Backslash, double quote, newline, carriage return and tab use their short
/// escapes. Every other control character below U+0020 is written as `\u00XX`,
/// because JSON forbids raw control characters inside strings.
fn escape_json(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\\' => out.push_str("\\\\"),
            '"' => out.push_str("\\\""),
            '\n' => out.push_str("\\n"),
            '\r' => out.push_str("\\r"),
            '\t' => out.push_str("\\t"),
            // Remaining C0 controls (including NUL) have no short form.
            c if (c as u32) < 0x20 => out.push_str(&format!("\\u{:04x}", c as u32)),
            c => out.push(c),
        }
    }
    out
}

/// Convert an anyhow::Error to a JSON error string.
///
/// The error code is chosen by scanning the error chain for well-known
/// substrings; the message is the top-level error's display text.
fn anyhow_to_json_error(err: anyhow::Error) -> String {
    let message = err.to_string();

    // True when any error in the chain mentions `needle` in its display text.
    let mentions = |needle: &str| err.chain().any(|cause| cause.to_string().contains(needle));

    // Try to determine a more specific error code
    let code = if mentions("No such file") {
        error_codes::FILE_NOT_FOUND
    } else if mentions("UTF-8") {
        error_codes::INVALID_UTF8
    } else {
        error_codes::EXTRACTION_ERROR
    };

    json_error(code, &message)
}

/// Convert a C string pointer to a Rust string, handling null and invalid UTF-8.
///
/// # Errors
///
/// Returns `Err(error_codes::NULL_POINTER)` for a null pointer and
/// `Err(error_codes::INVALID_UTF8)` when the bytes are not valid UTF-8.
///
/// # Safety
///
/// A non-null `ptr` must point to a NUL-terminated string that stays valid
/// for the duration of the call.
unsafe fn cstr_to_string(ptr: *const c_char) -> Result<String, &'static str> {
    if ptr.is_null() {
        return Err(error_codes::NULL_POINTER);
    }
    CStr::from_ptr(ptr)
        .to_str()
        .map(|s| s.to_string())
        .map_err(|_| error_codes::INVALID_UTF8)
}

/// Parse options JSON, returning an error string on failure.
fn parse_options_json(options_json: &str) -> Result<ExtractionOptions, String> {
    serde_json::from_str(options_json).map_err(|e| format!("Invalid options JSON: {}", e))
}

/// Result type for FFI operations that can fail.
///
/// Both variants carry the JSON text that is handed back to the C caller.
enum FfiResult {
    Ok(String),
    Err(String),
}

/// Extract text and structure from a PDF file.
///
/// # Arguments
///
/// * `source` - Path to the PDF file (null-terminated UTF-8 string)
/// * `options_json` - JSON string with extraction options (can be empty object "{}")
///
/// # Returns
///
/// A JSON string representing the extraction result. The caller MUST free this
/// with pdftract_free(). On error, returns a JSON object with "error" and "message" fields.
///
/// # Example
///
/// ```c
/// char *result = pdftract_extract("document.pdf", "{}");
/// // ... use result ...
/// pdftract_free(result);
/// ```
///
/// # Errors
///
/// A null argument yields `NULL_POINTER`; an argument that is not valid UTF-8
/// yields `INVALID_UTF8`; unparsable options yield `INVALID_JSON`.
#[no_mangle]
pub extern "C" fn pdftract_extract(
    source: *const c_char,
    options_json: *const c_char,
) -> *mut c_char {
    // Map a `cstr_to_string` failure to the matching JSON error for argument `name`.
    fn arg_error(name: &str, reason: &str) -> String {
        if reason == error_codes::INVALID_UTF8 {
            json_error(
                error_codes::INVALID_UTF8,
                &format!("{} is not valid UTF-8", name),
            )
        } else {
            json_error(
                error_codes::NULL_POINTER,
                &format!("{} pointer is null", name),
            )
        }
    }

    let outcome = catch_unwind(|| unsafe {
        // Validate and convert arguments
        let source_path = match cstr_to_string(source) {
            Ok(s) => s,
            Err(reason) => return FfiResult::Err(arg_error("source", reason)),
        };
        let options_str = match cstr_to_string(options_json) {
            Ok(s) => s,
            Err(reason) => return FfiResult::Err(arg_error("options_json", reason)),
        };

        // Parse options
        let options: ExtractionOptions = match parse_options_json(&options_str) {
            Ok(opts) => opts,
            Err(e) => return FfiResult::Err(json_error(error_codes::INVALID_JSON, &e)),
        };

        // Perform extraction
        let extraction_result = match extract_pdf(Path::new(&source_path), &options) {
            Ok(result) => result,
            Err(e) => return FfiResult::Err(anyhow_to_json_error(e)),
        };

        // Convert to JSON
        let json_value = result_to_json(&extraction_result);
        match serde_json::to_string(&json_value) {
            Ok(json) => FfiResult::Ok(json),
            Err(e) => FfiResult::Err(json_error(
                error_codes::EXTRACTION_ERROR,
                &format!("JSON serialization failed: {}", e),
            )),
        }
    });

    let payload = match outcome {
        Ok(FfiResult::Ok(json)) | Ok(FfiResult::Err(json)) => json,
        Err(_) => json_error(error_codes::PANIC, "panic in pdftract_extract"),
    };
    // This runs outside catch_unwind, so it must not panic: a payload with an
    // interior NUL degrades to an empty string instead of unwinding into C.
    CString::new(payload).unwrap_or_default().into_raw()
}

/// Extract plain text from a PDF file.
///
/// # Arguments
///
/// * `source` - Path to the PDF file (null-terminated UTF-8 string)
/// * `options_json` - JSON string with extraction options (can be empty object "{}")
///
/// # Returns
///
/// A JSON string containing the extracted text. The caller MUST free this
/// with pdftract_free().
#[no_mangle] pub extern "C" fn pdftract_extract_text( source: *const c_char, options_json: *const c_char, ) -> *mut c_char { let result = catch_unwind(|| unsafe { let source_path = match cstr_to_string(source) { Ok(s) => s, Err(_) => { return FfiResult::Err(json_error( error_codes::NULL_POINTER, "source pointer is null", )) } }; let options_str = match cstr_to_string(options_json) { Ok(s) => s, Err(_) => { return FfiResult::Err(json_error( error_codes::NULL_POINTER, "options_json pointer is null", )) } }; let options: ExtractionOptions = match parse_options_json(&options_str) { Ok(opts) => opts, Err(e) => return FfiResult::Err(json_error(error_codes::INVALID_JSON, &e)), }; let pdf_path = Path::new(&source_path); let extraction_result = match extract_pdf(pdf_path, &options) { Ok(result) => result, Err(e) => return FfiResult::Err(anyhow_to_json_error(e)), }; // Extract just the text from all pages let text: String = extraction_result .pages .iter() .flat_map(|page| page.spans.iter().map(|span| span.text.as_str())) .collect::>() .join(" "); match serde_json::to_string(&text) { Ok(json) => FfiResult::Ok(json), Err(e) => FfiResult::Err(json_error( error_codes::EXTRACTION_ERROR, &format!("JSON serialization failed: {}", e), )), } }); match result { Ok(FfiResult::Ok(json)) => CString::new(json).unwrap().into_raw(), Ok(FfiResult::Err(err)) => CString::new(err).unwrap().into_raw(), Err(_) => CString::new(json_error( error_codes::PANIC, "panic in pdftract_extract_text", )) .unwrap() .into_raw(), } } /// Extract markdown from a PDF file. /// /// # Arguments /// /// * `source` - Path to the PDF file (null-terminated UTF-8 string) /// * `options_json` - JSON string with extraction options (can be empty object "{}") /// /// # Returns /// /// A JSON string containing the extracted markdown. The caller MUST free this /// with pdftract_free(). 
#[no_mangle] pub extern "C" fn pdftract_extract_markdown( source: *const c_char, options_json: *const c_char, ) -> *mut c_char { let result = catch_unwind(|| unsafe { let source_path = match cstr_to_string(source) { Ok(s) => s, Err(_) => { return FfiResult::Err(json_error( error_codes::NULL_POINTER, "source pointer is null", )) } }; let options_str = match cstr_to_string(options_json) { Ok(s) => s, Err(_) => { return FfiResult::Err(json_error( error_codes::NULL_POINTER, "options_json pointer is null", )) } }; let options: ExtractionOptions = match parse_options_json(&options_str) { Ok(opts) => opts, Err(e) => return FfiResult::Err(json_error(error_codes::INVALID_JSON, &e)), }; let pdf_path = Path::new(&source_path); let extraction_result = match extract_pdf(pdf_path, &options) { Ok(result) => result, Err(e) => return FfiResult::Err(anyhow_to_json_error(e)), }; // Convert blocks to markdown let markdown: String = extraction_result .pages .iter() .flat_map(|page| page.blocks.iter()) .map(|block| match block.kind.as_str() { "heading" => { let level = block.level.unwrap_or(1); let hashes = "#".repeat(level as usize); format!("{} {}\n\n", hashes, block.text) } "paragraph" => format!("{}\n\n", block.text), "list" => format!("- {}\n", block.text), _ => format!("{}\n\n", block.text), }) .collect(); match serde_json::to_string(&markdown) { Ok(json) => FfiResult::Ok(json), Err(e) => FfiResult::Err(json_error( error_codes::EXTRACTION_ERROR, &format!("JSON serialization failed: {}", e), )), } }); match result { Ok(FfiResult::Ok(json)) => CString::new(json).unwrap().into_raw(), Ok(FfiResult::Err(err)) => CString::new(err).unwrap().into_raw(), Err(_) => CString::new(json_error( error_codes::PANIC, "panic in pdftract_extract_markdown", )) .unwrap() .into_raw(), } } /// Stream state for iterative page extraction. /// /// This struct holds a PdfExtractor and extracts pages on-demand, /// ensuring that we never materialize the entire document in memory. 
///
/// A boxed `StreamState` is what the opaque stream handle points to: it is
/// created by pdftract_extract_stream_open(), mutated by pdftract_stream_next()
/// and dropped by pdftract_stream_close().
struct StreamState {
    /// The PDF extractor for lazy page iteration
    extractor: PdfExtractor,
    /// Lazy page iterator (created on first call to next())
    // NOTE(review): the generic arguments of this type appear to have been
    // lost from the source text; restore the original iterator type.
    // NOTE(review): the iterator is obtained from `extractor.pages()`; if it
    // borrows from `extractor`, confirm the field drop order is sound.
    page_iter: Option>,
    /// Current page index (for tracking progress); used as the "index" of
    /// error pages emitted by pdftract_stream_next()
    current_index: usize,
    /// Extraction options (cached for reuse)
    // NOTE(review): not read anywhere in the visible code — confirm it is needed.
    options: ExtractionOptions,
}

/// Open a streaming extraction session.
///
/// Returns an opaque handle that can be used with pdftract_stream_next()
/// to iterate through pages one at a time. When done, call pdftract_stream_close().
///
/// # Memory Efficiency
///
/// This function does NOT materialize all pages. It creates a PdfExtractor
/// that will extract each page on-demand when pdftract_stream_next() is called.
/// This ensures memory usage stays bounded regardless of document size.
///
/// # Arguments
///
/// * `source` - Path to the PDF file (null-terminated UTF-8 string)
/// * `options_json` - JSON string with extraction options (can be empty object "{}")
///
/// # Returns
///
/// An opaque handle (*mut c_void) on success, or NULL on error.
/// After a NULL return, call pdftract_last_error() on the same thread to
/// retrieve the JSON error describing the failure.
#[no_mangle] pub extern "C" fn pdftract_extract_stream_open( source: *const c_char, options_json: *const c_char, ) -> *mut c_void { clear_last_error(); let result = catch_unwind(|| unsafe { let source_path = match cstr_to_string(source) { Ok(s) => s, Err(e) => { set_last_error(json_error( error_codes::NULL_POINTER, "source pointer is null", )); return None; } }; let options_str = match cstr_to_string(options_json) { Ok(s) => s, Err(e) => { set_last_error(json_error( error_codes::NULL_POINTER, "options_json pointer is null", )); return None; } }; let options: ExtractionOptions = match parse_options_json(&options_str) { Ok(opts) => opts, Err(e) => { set_last_error(json_error(error_codes::INVALID_JSON, &e)); return None; } }; let pdf_path = Path::new(&source_path); // Use PdfExtractor for lazy page iteration // This does NOT materialize all pages upfront let extractor = match PdfExtractor::open(pdf_path) { Ok(ex) => ex, Err(e) => { set_last_error(anyhow_to_json_error(e)); return None; } }; Some(StreamState { extractor, page_iter: None, current_index: 0, options, }) }); match result { Ok(Some(state)) => Box::into_raw(Box::new(state)) as *mut c_void, Ok(None) => std::ptr::null_mut(), Err(_) => { set_last_error(json_error( error_codes::PANIC, "panic in pdftract_extract_stream_open", )); std::ptr::null_mut() } } } /// Get the next page from a streaming extraction session. /// /// # Memory Efficiency /// /// This function extracts one page at a time on-demand. The page's /// content streams are decoded, the result is serialized to JSON, /// and then all page data is dropped before returning. This ensures /// memory usage stays bounded. /// /// # Arguments /// /// * `handle` - Opaque handle from pdftract_extract_stream_open() /// /// # Returns /// /// A JSON string representing one page, or NULL when the stream ends. /// The caller MUST free non-NULL returns with pdftract_free(). 
///
/// # Note
///
/// The handle remains valid after this call and must be closed with
/// pdftract_stream_close() when done.
///
/// A null `handle` returns an `INVALID_HANDLE` JSON error, and a caught panic
/// returns a `PANIC` JSON error; both are non-NULL and must be freed too.
/// A page that fails to extract is reported as a JSON page object carrying an
/// "error" field and empty "spans"/"blocks", and iteration continues.
#[no_mangle]
pub extern "C" fn pdftract_stream_next(handle: *mut c_void) -> *mut c_char {
    // Serialize a JSON value into an owned C string for the caller.
    fn into_raw_json(value: &serde_json::Value) -> *mut c_char {
        let text = serde_json::to_string(value).unwrap();
        CString::new(text).unwrap().into_raw()
    }

    if handle.is_null() {
        let message = json_error(error_codes::INVALID_HANDLE, "null handle");
        return CString::new(message).unwrap().into_raw();
    }

    let outcome = catch_unwind(|| -> Option<*mut c_char> {
        unsafe {
            // Reborrow the boxed state behind the opaque handle.
            let stream = &mut *(handle as *mut StreamState);

            // The page iterator is created lazily on the first call.
            if stream.page_iter.is_none() {
                stream.page_iter = Some(stream.extractor.pages());
            }
            let pages = stream.page_iter.as_mut()?;

            // `None` from the iterator means the stream has ended.
            let payload = match pages.next()? {
                Ok(page) => serde_json::json!({
                    "index": page.index,
                    "spans": page.spans,
                    "blocks": page.blocks,
                }),
                // Report a failed page in-band instead of aborting the stream.
                Err(e) => serde_json::json!({
                    "index": stream.current_index,
                    "error": e.to_string(),
                    "spans": [],
                    "blocks": [],
                }),
            };

            // Advance the counter before serializing, for the next call.
            stream.current_index += 1;

            Some(into_raw_json(&payload))
        }
    });

    match outcome {
        Ok(Some(ptr)) => ptr,
        Ok(None) => std::ptr::null_mut(),
        Err(_) => {
            let message = json_error(error_codes::PANIC, "panic in pdftract_stream_next");
            CString::new(message).unwrap().into_raw()
        }
    }
}

/// Close a streaming extraction session and free resources.
/// /// # Arguments /// /// * `handle` - Opaque handle from pdftract_extract_stream_open() #[no_mangle] pub extern "C" fn pdftract_stream_close(handle: *mut c_void) { if handle.is_null() { return; } let result = catch_unwind(|| unsafe { // Drop the Box let _ = Box::from_raw(handle as *mut StreamState); }); // We can't report errors from a close function, so we just ignore panics let _ = result; } /// Search for text patterns in a PDF file. /// /// # Arguments /// /// * `source` - Path to the PDF file (null-terminated UTF-8 string) /// * `pattern` - Search pattern (null-terminated UTF-8 string) /// * `options_json` - JSON string with extraction options (can be empty object "{}") /// /// # Returns /// /// A JSON string containing search results. The caller MUST free this /// with pdftract_free(). #[no_mangle] pub extern "C" fn pdftract_search( source: *const c_char, pattern: *const c_char, options_json: *const c_char, ) -> *mut c_char { let result = catch_unwind(|| unsafe { let source_path = match cstr_to_string(source) { Ok(s) => s, Err(_) => { return FfiResult::Err(json_error( error_codes::NULL_POINTER, "source pointer is null", )) } }; let search_pattern = match cstr_to_string(pattern) { Ok(s) => s, Err(_) => { return FfiResult::Err(json_error( error_codes::NULL_POINTER, "pattern pointer is null", )) } }; let options_str = match cstr_to_string(options_json) { Ok(s) => s, Err(_) => { return FfiResult::Err(json_error( error_codes::NULL_POINTER, "options_json pointer is null", )) } }; let options: ExtractionOptions = match parse_options_json(&options_str) { Ok(opts) => opts, Err(e) => return FfiResult::Err(json_error(error_codes::INVALID_JSON, &e)), }; let pdf_path = Path::new(&source_path); let extraction_result = match extract_pdf(pdf_path, &options) { Ok(result) => result, Err(e) => return FfiResult::Err(anyhow_to_json_error(e)), }; // Search for the pattern in spans let mut matches = Vec::new(); for page in &extraction_result.pages { for (span_idx, span) in 
page.spans.iter().enumerate() { if span.text.contains(&search_pattern) { matches.push(serde_json::json!({ "page": page.index, "span": span_idx, "text": span.text, "bbox": span.bbox, })); } } } match serde_json::to_string(&serde_json::json!({ "pattern": search_pattern, "match_count": matches.len(), "matches": matches, })) { Ok(json) => FfiResult::Ok(json), Err(e) => FfiResult::Err(json_error( error_codes::EXTRACTION_ERROR, &format!("JSON serialization failed: {}", e), )), } }); match result { Ok(FfiResult::Ok(json)) => CString::new(json).unwrap().into_raw(), Ok(FfiResult::Err(err)) => CString::new(err).unwrap().into_raw(), Err(_) => CString::new(json_error(error_codes::PANIC, "panic in pdftract_search")) .unwrap() .into_raw(), } } /// Get metadata about a PDF file. /// /// # Arguments /// /// * `source` - Path to the PDF file (null-terminated UTF-8 string) /// * `options_json` - JSON string with extraction options (can be empty object "{}") /// /// # Returns /// /// A JSON string containing PDF metadata. The caller MUST free this /// with pdftract_free(). 
///
/// The object contains "fingerprint", "page_count", "span_count",
/// "block_count" and "receipts_mode", taken from a full extraction run.
///
/// # Errors
///
/// On failure the returned JSON is an object with "error" and "message"
/// fields. A null argument yields `NULL_POINTER`; an argument that is not
/// valid UTF-8 yields `INVALID_UTF8`.
#[no_mangle]
pub extern "C" fn pdftract_get_metadata(
    source: *const c_char,
    options_json: *const c_char,
) -> *mut c_char {
    // Map a `cstr_to_string` failure to the matching JSON error for argument `name`.
    fn arg_error(name: &str, reason: &str) -> String {
        if reason == error_codes::INVALID_UTF8 {
            json_error(
                error_codes::INVALID_UTF8,
                &format!("{} is not valid UTF-8", name),
            )
        } else {
            json_error(
                error_codes::NULL_POINTER,
                &format!("{} pointer is null", name),
            )
        }
    }

    let outcome = catch_unwind(|| unsafe {
        let source_path = match cstr_to_string(source) {
            Ok(s) => s,
            Err(reason) => return FfiResult::Err(arg_error("source", reason)),
        };
        let options_str = match cstr_to_string(options_json) {
            Ok(s) => s,
            Err(reason) => return FfiResult::Err(arg_error("options_json", reason)),
        };

        let options: ExtractionOptions = match parse_options_json(&options_str) {
            Ok(opts) => opts,
            Err(e) => return FfiResult::Err(json_error(error_codes::INVALID_JSON, &e)),
        };

        let extraction_result = match extract_pdf(Path::new(&source_path), &options) {
            Ok(result) => result,
            Err(e) => return FfiResult::Err(anyhow_to_json_error(e)),
        };

        match serde_json::to_string(&serde_json::json!({
            "fingerprint": extraction_result.fingerprint,
            "page_count": extraction_result.metadata.page_count,
            "span_count": extraction_result.metadata.span_count,
            "block_count": extraction_result.metadata.block_count,
            "receipts_mode": extraction_result.metadata.receipts_mode.as_str(),
        })) {
            Ok(json) => FfiResult::Ok(json),
            Err(e) => FfiResult::Err(json_error(
                error_codes::EXTRACTION_ERROR,
                &format!("JSON serialization failed: {}", e),
            )),
        }
    });

    let payload = match outcome {
        Ok(FfiResult::Ok(json)) | Ok(FfiResult::Err(json)) => json,
        Err(_) => json_error(error_codes::PANIC, "panic in pdftract_get_metadata"),
    };
    // This runs outside catch_unwind, so it must not panic: a payload with an
    // interior NUL degrades to an empty string instead of unwinding into C.
    CString::new(payload).unwrap_or_default().into_raw()
}

/// Compute the cryptographic fingerprint of a PDF file.
///
/// # Arguments
///
/// * `source` - Path to the PDF file (null-terminated UTF-8 string)
///
/// # Returns
///
/// A JSON string containing the fingerprint. The caller MUST free this
/// with pdftract_free().
///
/// # Errors
///
/// On failure the returned JSON is an object with "error" and "message"
/// fields. A null `source` yields `NULL_POINTER`; a `source` that is not
/// valid UTF-8 yields `INVALID_UTF8`.
#[no_mangle]
pub extern "C" fn pdftract_hash(source: *const c_char) -> *mut c_char {
    // Map a `cstr_to_string` failure to the matching JSON error for argument `name`.
    fn arg_error(name: &str, reason: &str) -> String {
        if reason == error_codes::INVALID_UTF8 {
            json_error(
                error_codes::INVALID_UTF8,
                &format!("{} is not valid UTF-8", name),
            )
        } else {
            json_error(
                error_codes::NULL_POINTER,
                &format!("{} pointer is null", name),
            )
        }
    }

    let outcome = catch_unwind(|| unsafe {
        let source_path = match cstr_to_string(source) {
            Ok(s) => s,
            Err(reason) => return FfiResult::Err(arg_error("source", reason)),
        };

        let fingerprint = match compute_pdf_fingerprint(Path::new(&source_path)) {
            Ok(fp) => fp,
            Err(e) => return FfiResult::Err(anyhow_to_json_error(e)),
        };

        match serde_json::to_string(&serde_json::json!({
            "fingerprint": fingerprint,
        })) {
            Ok(json) => FfiResult::Ok(json),
            Err(e) => FfiResult::Err(json_error(
                error_codes::EXTRACTION_ERROR,
                &format!("JSON serialization failed: {}", e),
            )),
        }
    });

    let payload = match outcome {
        Ok(FfiResult::Ok(json)) | Ok(FfiResult::Err(json)) => json,
        Err(_) => json_error(error_codes::PANIC, "panic in pdftract_hash"),
    };
    // This runs outside catch_unwind, so it must not panic: a payload with an
    // interior NUL degrades to an empty string instead of unwinding into C.
    CString::new(payload).unwrap_or_default().into_raw()
}

/// Classify a PDF file by type.
///
/// # Arguments
///
/// * `source` - Path to the PDF file (null-terminated UTF-8 string)
///
/// # Returns
///
/// A JSON string containing classification information. The caller MUST free this
/// with pdftract_free().
///
/// # Note
///
/// This is currently a stub that returns a basic classification.
/// Full implementation requires a trained classifier.
#[no_mangle] pub extern "C" fn pdftract_classify(source: *const c_char) -> *mut c_char { let result = catch_unwind(|| unsafe { let source_path = match cstr_to_string(source) { Ok(s) => s, Err(_) => { return FfiResult::Err(json_error( error_codes::NULL_POINTER, "source pointer is null", )) } }; let pdf_path = Path::new(&source_path); // Get basic info let (fingerprint, _catalog, pages, _resolver) = match parse_pdf_file(pdf_path) { Ok(result) => result, Err(e) => return FfiResult::Err(anyhow_to_json_error(e)), }; // Basic classification based on page count let doc_type = if pages.len() == 1 { "single_page" } else if pages.len() <= 5 { "short_document" } else { "long_document" }; match serde_json::to_string(&serde_json::json!({ "type": doc_type, "page_count": pages.len(), "fingerprint": fingerprint, "confidence": 0.5, })) { Ok(json) => FfiResult::Ok(json), Err(e) => FfiResult::Err(json_error( error_codes::EXTRACTION_ERROR, &format!("JSON serialization failed: {}", e), )), } }); match result { Ok(FfiResult::Ok(json)) => CString::new(json).unwrap().into_raw(), Ok(FfiResult::Err(err)) => CString::new(err).unwrap().into_raw(), Err(_) => CString::new(json_error(error_codes::PANIC, "panic in pdftract_classify")) .unwrap() .into_raw(), } } /// Free a string returned by pdftract_* functions. /// /// # Arguments /// /// * `ptr` - Pointer to string returned by any pdftract_* function (except pdftract_version) /// /// # Safety /// /// This function MUST be called to free strings returned by the API. /// Do NOT call libc free() on these pointers. #[no_mangle] pub extern "C" fn pdftract_free(ptr: *mut c_char) { if ptr.is_null() { return; } unsafe { let _ = CString::from_raw(ptr); } } /// Get the pdftract library version string. /// /// # Returns /// /// A static C string containing the version. Do NOT free this string. 
#[no_mangle] pub extern "C" fn pdftract_version() -> *const c_char { // Use a static C string with proper lifetime static VERSION: &[u8] = b"0.1.0\0"; VERSION.as_ptr() as *const c_char } /// Thread-local storage for the last error message. /// /// This allows C callers to retrieve detailed error information after /// a function returns NULL or an error indicator. Each thread has its /// own error storage, making the library thread-safe. thread_local! { static LAST_ERROR: Mutex> = Mutex::new(None); static LAST_ERROR_CSTR: Mutex> = Mutex::new(None); } /// Set the last error message for the current thread. fn set_last_error(message: String) { LAST_ERROR.with(|error| { let mut guard = error.lock().unwrap(); *guard = Some(message); }); } /// Clear the last error message for the current thread. fn clear_last_error() { LAST_ERROR.with(|error| { let mut guard = error.lock().unwrap(); *guard = None; }); LAST_ERROR_CSTR.with(|cstr| { let mut guard = cstr.lock().unwrap(); *guard = None; }); } /// Get the last error message for the current thread. /// /// # Returns /// /// A pointer to a null-terminated string containing the last error message, /// or NULL if no error has been set. The caller MUST NOT free this string. /// The string remains valid until the next API call on this thread. /// /// # Note /// /// This function returns a pointer to thread-local storage that is invalidated /// by the next API call on the same thread. If you need to retain the error /// message, make a copy of it immediately. 
#[no_mangle] pub extern "C" fn pdftract_last_error() -> *const c_char { LAST_ERROR_CSTR.with(|cstr| { let mut guard = cstr.lock().unwrap(); if let Some(ref c) = *guard { return c.as_ptr(); } // Try to get the error string and convert it to CString LAST_ERROR.with(|error| { let err_guard = error.lock().unwrap(); if let Some(ref msg) = *err_guard { if let Ok(c) = CString::new(msg.as_str()) { let ptr = c.as_ptr(); *guard = Some(c); ptr } else { std::ptr::null() } } else { std::ptr::null() } }) }) } /// Get the ABI version of the library. /// /// # Returns /// /// A 32-bit unsigned integer encoding the ABI version. /// Format: MAJOR << 16 | MINOR << 8 | PATCH /// /// For version 0.1.0, this returns 0x00000100 (256 decimal). /// For version 1.2.3, this would return 0x010203 (66051 decimal). /// /// C callers can use this to verify the loaded library matches their /// compiled header's expectations. #[no_mangle] pub extern "C" fn pdftract_abi_version() -> u32 { const MAJOR: u8 = 0; const MINOR: u8 = 1; const PATCH: u8 = 0; (MAJOR as u32) << 16 | (MINOR as u32) << 8 | (PATCH as u32) } /// Verify a visual citation receipt against a PDF file. /// /// # Arguments /// /// * `path` - Path to the PDF file (null-terminated UTF-8 string) /// * `receipt_json` - JSON string containing the receipt to verify /// /// # Returns /// /// An int32_t exit code: /// - 0: receipt verifies successfully /// - 1: extraction failed (PDF unreadable, encrypted, etc.) /// - 10: pdf_fingerprint mismatch /// - 11: bbox mismatch (no span meets 90% IoU threshold) /// - 12: content_hash mismatch (best-IoU span's text differs) /// /// On error, use pdftract_last_error() to get a detailed message. 
#[no_mangle] pub extern "C" fn pdftract_verify_receipt(path: *const c_char, receipt_json: *const c_char) -> i32 { clear_last_error(); let result = catch_unwind(|| unsafe { let pdf_path = match cstr_to_string(path) { Ok(s) => s, Err(_) => { set_last_error(json_error( error_codes::NULL_POINTER, "path pointer is null", )); return exit_code::EXTRACTION_FAILED; } }; let receipt_str = match cstr_to_string(receipt_json) { Ok(s) => s, Err(_) => { set_last_error(json_error( error_codes::NULL_POINTER, "receipt_json pointer is null", )); return exit_code::EXTRACTION_FAILED; } }; // Parse the receipt JSON let receipt: Receipt = match serde_json::from_str(&receipt_str) { Ok(r) => r, Err(e) => { set_last_error(json_error( error_codes::INVALID_JSON, &format!("Invalid receipt JSON: {}", e), )); return exit_code::EXTRACTION_FAILED; } }; // Extract the PDF to get spans and fingerprint let pdf_path_obj = Path::new(&pdf_path); let extraction_result = match extract_pdf(pdf_path_obj, &ExtractionOptions::default()) { Ok(result) => result, Err(e) => { set_last_error(anyhow_to_json_error(e)); return exit_code::EXTRACTION_FAILED; } }; // Get the page specified in the receipt let page = if receipt.page_index < extraction_result.pages.len() { &extraction_result.pages[receipt.page_index] } else { set_last_error(json_error( error_codes::EXTRACTION_ERROR, &format!( "receipt page_index {} out of bounds (PDF has {} pages)", receipt.page_index, extraction_result.pages.len() ), )); return exit_code::EXTRACTION_FAILED; }; // Collect spans from the page let spans: Vec = page .spans .iter() .map(|span| SpanData { text: span.text.clone(), bbox: span.bbox, }) .collect(); // Verify the receipt let verify_result = verify_receipt(&receipt, &spans, &extraction_result.fingerprint); match verify_result { VerificationResult::Ok { .. } => exit_code::SUCCESS, VerificationResult::FingerprintMismatch { .. } => exit_code::FINGERPRINT_MISMATCH, VerificationResult::BboxMismatch { .. 
} => exit_code::BBOX_MISMATCH, VerificationResult::ContentMismatch { .. } => exit_code::CONTENT_MISMATCH, } }); match result { Ok(code) => code, Err(_) => { set_last_error(json_error( error_codes::PANIC, "panic in pdftract_verify_receipt", )); exit_code::EXTRACTION_FAILED } } } #[cfg(test)] mod tests { use super::*; use std::fs; use std::io::Write; /// Create a minimal valid PDF for testing. fn create_minimal_pdf(path: &Path) -> std::io::Result<()> { let pdf_data = br#"%PDF-1.4 1 0 obj<>endobj 2 0 obj<>endobj 3 0 obj<>>>>>>>>>endobj xref 0 4 0000000000 65535 f 0000000009 00000 n 0000000052 00000 n 0000000109 00000 n trailer<> startxref 206 %%EOF "#; let mut file = fs::File::create(path)?; file.write_all(pdf_data)?; Ok(()) } #[test] fn test_json_error() { let err = json_error("TEST_CODE", "test message"); assert!(err.contains(r#""error":"TEST_CODE""#)); assert!(err.contains(r#""message":"test message""#)); } #[test] fn test_escape_json() { let escaped = escape_json("hello\nworld\"test\\"); assert_eq!(escaped, "hello\\nworld\\\"test\\\\"); } #[test] fn test_pdftract_version_not_null() { let version = unsafe { CStr::from_ptr(pdftract_version()).to_str().unwrap() }; assert!(!version.is_empty()); } }