- Add build.rs that generates compile-time std14 metrics from JSON - Add std14.rs module with Std14Metrics struct and get_std14_metrics() - Add build/std14-metrics.json with AFM-derived widths for all 14 fonts - Re-export Std14Metrics, NamedEncoding, get_std14_metrics in lib.rs Acceptance criteria: - All 14 Standard fonts (Courier, Helvetica, Times, Symbol, ZapfDingbats and their variants) return valid metrics from the registry - Subset-prefixed names (ABCDEF+Helvetica) resolve via strip_subset_prefix() - Width tables match Adobe AFM data within rounding tolerance - Binary footprint < 60 KB (generated source: 20 KB, actual data ~8 KB) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
282 lines
7.4 KiB
C
282 lines
7.4 KiB
C
/* Copyright 2026 Jed Cabanino. MIT OR Apache-2.0 */
|
|
|
|
#ifndef PDFTRACT_H
|
|
#define PDFTRACT_H
|
|
|
|
#pragma once
|
|
|
|
#include <stdarg.h>
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
#include <stdlib.h>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif // __cplusplus
|
|
|
|
/**
 * Get the ABI version of the library.
 *
 * # Returns
 *
 * A 32-bit unsigned integer encoding the ABI version.
 * Format: (MAJOR << 16) | (MINOR << 8) | PATCH
 *
 * For version 0.1.0, this returns 0x00000100 (256 decimal).
 * For version 1.2.3, this would return 0x00010203 (66051 decimal).
 *
 * C callers can use this to verify that the loaded library matches the
 * expectations of the header they were compiled against.
 */
uint32_t pdftract_abi_version(void);
|
|
|
|
/**
 * Classify a PDF file by type.
 *
 * # Arguments
 *
 * * `source` - Path to the PDF file (null-terminated UTF-8 string)
 *
 * # Returns
 *
 * A JSON string containing classification information. The caller MUST
 * free this with pdftract_free().
 *
 * # Note
 *
 * This is currently a stub that returns a basic classification.
 * A full implementation requires a trained classifier.
 */
char *pdftract_classify(const char *source);
|
|
|
|
/**
 * Extract text and structure from a PDF file.
 *
 * # Arguments
 *
 * * `source` - Path to the PDF file (null-terminated UTF-8 string)
 * * `options_json` - JSON string with extraction options (can be the empty
 *   object "{}")
 *
 * # Returns
 *
 * A JSON string representing the extraction result. The caller MUST free
 * this with pdftract_free(). On error, returns a JSON object with "error"
 * and "message" fields (which must also be freed with pdftract_free()).
 *
 * # Example
 *
 * ```c
 * char *result = pdftract_extract("document.pdf", "{}");
 * // ... use result ...
 * pdftract_free(result);
 * ```
 */
char *pdftract_extract(const char *source,
                       const char *options_json);
|
|
|
|
/**
 * Extract markdown from a PDF file.
 *
 * # Arguments
 *
 * * `source` - Path to the PDF file (null-terminated UTF-8 string)
 * * `options_json` - JSON string with extraction options (can be the empty
 *   object "{}")
 *
 * # Returns
 *
 * A JSON string containing the extracted markdown. The caller MUST free
 * this with pdftract_free().
 */
char *pdftract_extract_markdown(const char *source,
                                const char *options_json);
|
|
|
|
/**
 * Open a streaming extraction session.
 *
 * Returns an opaque handle that can be used with pdftract_stream_next()
 * to iterate through pages one at a time. When done, call
 * pdftract_stream_close().
 *
 * # Memory Efficiency
 *
 * This function does NOT materialize all pages. It creates a PdfExtractor
 * that will extract each page on demand when pdftract_stream_next() is
 * called. This ensures memory usage stays bounded regardless of document
 * size.
 *
 * # Arguments
 *
 * * `source` - Path to the PDF file (null-terminated UTF-8 string)
 * * `options_json` - JSON string with extraction options (can be the empty
 *   object "{}")
 *
 * # Returns
 *
 * An opaque handle (`void *`) on success, or NULL on error. Callers detect
 * failure by comparing the returned handle against NULL.
 *
 * NOTE(review): presumably pdftract_last_error() carries the failure
 * reason after a NULL return - confirm against the implementation.
 */
void *pdftract_extract_stream_open(const char *source,
                                   const char *options_json);
|
|
|
|
/**
 * Extract plain text from a PDF file.
 *
 * # Arguments
 *
 * * `source` - Path to the PDF file (null-terminated UTF-8 string)
 * * `options_json` - JSON string with extraction options (can be the empty
 *   object "{}")
 *
 * # Returns
 *
 * A JSON string containing the extracted text. The caller MUST free this
 * with pdftract_free().
 */
char *pdftract_extract_text(const char *source,
                            const char *options_json);
|
|
|
|
/**
 * Free a string returned by pdftract_* functions.
 *
 * # Arguments
 *
 * * `ptr` - Pointer to a string returned by a pdftract_* function that
 *   hands ownership to the caller. This excludes pdftract_version()
 *   (static string) and pdftract_last_error() (thread-local string);
 *   neither of those may be freed. Stream handles are released with
 *   pdftract_stream_close(), not with this function.
 *
 * # Safety
 *
 * This function MUST be called to free strings returned by the API.
 * Do NOT call libc free() on these pointers.
 *
 * NOTE(review): behavior for a NULL `ptr` is not documented in this
 * header - confirm before relying on it.
 */
void pdftract_free(char *ptr);
|
|
|
|
/**
 * Get metadata about a PDF file.
 *
 * # Arguments
 *
 * * `source` - Path to the PDF file (null-terminated UTF-8 string)
 * * `options_json` - JSON string with extraction options (can be the empty
 *   object "{}")
 *
 * # Returns
 *
 * A JSON string containing PDF metadata. The caller MUST free this
 * with pdftract_free().
 */
char *pdftract_get_metadata(const char *source,
                            const char *options_json);
|
|
|
|
/**
 * Compute the cryptographic fingerprint of a PDF file.
 *
 * # Arguments
 *
 * * `source` - Path to the PDF file (null-terminated UTF-8 string)
 *
 * # Returns
 *
 * A JSON string containing the fingerprint. The caller MUST free this
 * with pdftract_free().
 */
char *pdftract_hash(const char *source);
|
|
|
|
/**
 * Get the last error message for the current thread.
 *
 * # Returns
 *
 * A pointer to a null-terminated string containing the last error message,
 * or NULL if no error has been set. The caller MUST NOT free this string.
 *
 * # Note
 *
 * The returned pointer refers to thread-local storage and is invalidated
 * by the next API call on the same thread. If you need to retain the error
 * message, make a copy of it immediately.
 */
const char *pdftract_last_error(void);
|
|
|
|
/**
 * Search for text patterns in a PDF file.
 *
 * # Arguments
 *
 * * `source` - Path to the PDF file (null-terminated UTF-8 string)
 * * `pattern` - Search pattern (null-terminated UTF-8 string)
 * * `options_json` - JSON string with extraction options (can be the empty
 *   object "{}")
 *
 * # Returns
 *
 * A JSON string containing search results. The caller MUST free this
 * with pdftract_free().
 */
char *pdftract_search(const char *source,
                      const char *pattern,
                      const char *options_json);
|
|
|
|
/**
 * Close a streaming extraction session and free its resources.
 *
 * # Arguments
 *
 * * `handle` - Opaque handle from pdftract_extract_stream_open()
 */
void pdftract_stream_close(void *handle);
|
|
|
|
/**
 * Get the next page from a streaming extraction session.
 *
 * # Memory Efficiency
 *
 * This function extracts one page at a time, on demand. The page's
 * content streams are decoded, the result is serialized to JSON,
 * and then all page data is dropped before returning. This ensures
 * memory usage stays bounded.
 *
 * # Arguments
 *
 * * `handle` - Opaque handle from pdftract_extract_stream_open()
 *
 * # Returns
 *
 * A JSON string representing one page, or NULL when the stream ends.
 * The caller MUST free non-NULL returns with pdftract_free().
 *
 * NOTE(review): this header does not say how a mid-stream extraction
 * error is reported (NULL vs. an error JSON object) - confirm against
 * the implementation.
 *
 * # Note
 *
 * The handle remains valid after this call and must be closed with
 * pdftract_stream_close() when done.
 */
char *pdftract_stream_next(void *handle);
|
|
|
|
/**
 * Verify a visual citation receipt against a PDF file.
 *
 * # Arguments
 *
 * * `path` - Path to the PDF file (null-terminated UTF-8 string)
 * * `receipt_json` - JSON string containing the receipt to verify
 *
 * # Returns
 *
 * An int32_t exit code:
 * - 0: receipt verifies successfully
 * - 1: extraction failed (PDF unreadable, encrypted, etc.)
 * - 10: pdf_fingerprint mismatch
 * - 11: bbox mismatch (no span meets 90% IoU threshold)
 * - 12: content_hash mismatch (best-IoU span's text differs)
 *
 * On error, use pdftract_last_error() to get a detailed message.
 */
int32_t pdftract_verify_receipt(const char *path,
                                const char *receipt_json);
|
|
|
|
/**
 * Get the pdftract library version string.
 *
 * # Returns
 *
 * A static C string containing the version. Do NOT free this string
 * (neither with pdftract_free() nor with libc free()).
 */
const char *pdftract_version(void);
|
|
|
|
#ifdef __cplusplus
|
|
} // extern "C"
|
|
#endif // __cplusplus
|
|
|
|
#endif /* PDFTRACT_H */
|