From ef9c03095d65393ee873691e5dbc94dd7c7b5fcc Mon Sep 17 00:00:00 2001 From: jedarden Date: Sat, 16 May 2026 14:51:25 -0400 Subject: [PATCH] Add SDK architecture notes covering top 10 languages Covers TypeScript, C#, C++, PHP, and Kotlin gaps with full code examples for both subprocess and HTTP tracks, NuGet RID packaging detail, PHP FFI options, and implementation sequencing. Co-Authored-By: Claude Sonnet 4.6 --- docs/notes/sdk-architecture.md | 579 +++++++++++++++++++++++++++++++++ 1 file changed, 579 insertions(+) create mode 100644 docs/notes/sdk-architecture.md diff --git a/docs/notes/sdk-architecture.md b/docs/notes/sdk-architecture.md new file mode 100644 index 0000000..f7e53a2 --- /dev/null +++ b/docs/notes/sdk-architecture.md @@ -0,0 +1,579 @@ +# SDK Architecture and Language Coverage + +## Top 10 language coverage status + +Based on Stack Overflow 2024 survey rankings. The existing `sdk-invocation.md` covers Python, +JavaScript, Go, Ruby, Java, Rust, and Bash. Gaps: TypeScript, C#, C++, PHP, and Kotlin. + +--- + +## Common infrastructure (required before any SDK ships) + +### Binary distribution + +Every SDK approach — subprocess or native — depends on platform binaries published to GitHub Releases: + +| Target triple | Platform | +|---|---| +| `x86_64-unknown-linux-gnu` | Linux x86_64 | +| `aarch64-unknown-linux-gnu` | Linux ARM64 | +| `x86_64-apple-darwin` | macOS Intel | +| `aarch64-apple-darwin` | macOS Apple Silicon | +| `x86_64-pc-windows-msvc` | Windows x86_64 | + +The CI workflow must cross-compile for all five targets and attach the binaries to a versioned +GitHub Release tag on every release. SDKs pin to a binary version and download the appropriate +artifact at install time. + +### Release format + +``` +https://github.com/jedarden/pdftract/releases/download/v{VERSION}/pdftract-{TARGET}.tar.gz +``` + +Semantic versioning is required before any package is published to a package registry. 
+ +--- + +## Two SDK tracks + +### Track A — Subprocess / HTTP wrappers + +Each SDK ships a thin wrapper that: +1. Downloads and caches the platform binary on first use (or at install time) +2. Invokes it via subprocess for one-off extractions +3. Optionally connects to a `pdftract serve` instance over HTTP for high-throughput use + +**Tradeoffs:** Fast to implement for any language, no FFI complexity, slight per-call overhead +from process spawn. Acceptable for batch and interactive workloads. + +### Track B — Native bindings + +The Rust core exposes a C ABI via `cbindgen`. Each language calls into the compiled shared +library directly, bypassing subprocess entirely. + +**Requires:** +- `cdylib` crate type in `Cargo.toml` +- `cbindgen` generating a `pdftract.h` C header +- A `#[no_mangle] extern "C"` public API surface in the Rust core +- Per-language FFI glue + +**Tradeoffs:** Zero process-spawn overhead, suitable for embedding in long-running services, +but requires per-language binding work and platform-specific shared library distribution. + +**Starting recommendation:** Track A for all languages, Track B for Python first (PyO3 is +mature, Python is the highest-volume use case for RAG and LLM preprocessing pipelines). 
+ +--- + +## Per-language breakdown + +| Language | Package Manager | Track A | Track B | Status | +|---|---|---|---|---| +| Python | PyPI | `subprocess` | PyO3 + maturin | Covered | +| JavaScript | npm | `child_process` | napi-rs | Covered | +| TypeScript | npm | Same as JS | Same + `.d.ts` types | **Gap — types only** | +| Java | Maven Central | `ProcessBuilder` | JNI via `jni` crate | Covered | +| C# | NuGet | `System.Diagnostics.Process` | P/Invoke via cbindgen | **Gap** | +| C++ | vcpkg / conan | `popen` | cbindgen → `.h` + shared lib | **Gap** | +| Go | Go modules | `os/exec` | cgo + cbindgen | Covered | +| PHP | Packagist | `proc_open` | ext-php-rs or PHP FFI | **Gap** | +| Kotlin | Maven Central | `ProcessBuilder` (JVM) | JNI (same as Java) | **Gap** | +| Rust | crates.io | `std::process::Command` | native library crate | Covered | + +--- + +## Gap detail + +### TypeScript + +Minimal work on top of the existing JavaScript notes. The implementation is identical — add a +`pdftract.d.ts` type definition file and publish to npm as `@pdftract/sdk` or alongside the JS +package. 
Types to define: + +```typescript +export interface Span { + text: string; + bbox: [number, number, number, number]; + font: string; + size: number; + confidence: number; +} + +export interface Block { + kind: 'paragraph' | 'heading' | 'table' | 'figure' | 'list'; + text: string; + bbox: [number, number, number, number]; +} + +export interface Page { + page: number; + spans: Span[]; + blocks: Block[]; +} + +export interface ExtractionResult { + pages: Page[]; + metadata: { + title?: string; + author?: string; + page_count: number; + }; +} + +export function extract(filePath: string): Promise<ExtractionResult>; +export function extractText(filePath: string): Promise<string>; +export function extractPage(filePath: string, page: number): Promise<Page>; +export function createClient(baseUrl: string): PdftractClient; + +export class PdftractClient { + extract(filePath: string): Promise<ExtractionResult>; + extractText(filePath: string): Promise<string>; +} +``` + +--- + +### C# / .NET + +**Track A — subprocess:** + +```csharp +using System.Diagnostics; +using System.Text.Json; + +public class PdftractClient +{ + private readonly string _binaryPath; + + public PdftractClient(string binaryPath = "pdftract") + { + _binaryPath = binaryPath; + } + + public async Task<ExtractionResult> ExtractAsync(string pdfPath) + { + using var process = new Process + { + StartInfo = new ProcessStartInfo + { + FileName = _binaryPath, + Arguments = $"extract \"{pdfPath}\"", + RedirectStandardOutput = true, + RedirectStandardError = true, + UseShellExecute = false, + } + }; + + process.Start(); + string stdout = await process.StandardOutput.ReadToEndAsync(); + await process.WaitForExitAsync(); + + if (process.ExitCode != 0) + { + string stderr = await process.StandardError.ReadToEndAsync(); + throw new PdftractException($"pdftract exited {process.ExitCode}: {stderr}"); + } + + return JsonSerializer.Deserialize<ExtractionResult>(stdout) + ?? 
throw new PdftractException("Empty response"); + } + + public async Task<string> ExtractTextAsync(string pdfPath) + { + var result = await ExtractAsync(pdfPath); + return string.Join("\n\n", result.Pages.Select(p => + string.Join("\n", p.Blocks.Select(b => b.Text)))); + } +} +``` + +**Track A — HTTP:** + +```csharp +using System.Net.Http.Headers; + +public class PdftractHttpClient : IDisposable +{ + private readonly HttpClient _http; + private readonly string _baseUrl; + + public PdftractHttpClient(string baseUrl = "http://localhost:8080") + { + _http = new HttpClient(); + _baseUrl = baseUrl; + } + + public async Task<ExtractionResult> ExtractAsync(string pdfPath) + { + using var form = new MultipartFormDataContent(); + var fileBytes = await File.ReadAllBytesAsync(pdfPath); + var fileContent = new ByteArrayContent(fileBytes); + fileContent.Headers.ContentType = MediaTypeHeaderValue.Parse("application/pdf"); + form.Add(fileContent, "file", Path.GetFileName(pdfPath)); + + var response = await _http.PostAsync($"{_baseUrl}/extract", form); + response.EnsureSuccessStatusCode(); + + var json = await response.Content.ReadAsStringAsync(); + return JsonSerializer.Deserialize<ExtractionResult>(json) + ?? 
throw new PdftractException("Empty response"); + } + + public void Dispose() => _http.Dispose(); +} +``` + +**Track B — native (P/Invoke):** + +Requires `cbindgen` to generate `pdftract.h`, then: + +```csharp +using System.Runtime.InteropServices; + +internal static class NativeMethods +{ + private const string LibName = "pdftract"; + + [DllImport(LibName, EntryPoint = "pdftract_extract_file")] + internal static extern IntPtr ExtractFile( + [MarshalAs(UnmanagedType.LPUTF8Str)] string path); + + [DllImport(LibName, EntryPoint = "pdftract_free_result")] + internal static extern void FreeResult(IntPtr result); +} +``` + +**NuGet packaging** — the `.nupkg` must embed the shared library per Runtime Identifier: + +``` +lib/ + net8.0/ + Pdftract.dll +runtimes/ + linux-x64/native/libpdftract.so + linux-arm64/native/libpdftract.so + osx-x64/native/libpdftract.dylib + osx-arm64/native/libpdftract.dylib + win-x64/native/pdftract.dll +``` + +The `.csproj` sets `RuntimeIdentifiers` and uses `<Content>` items with `PackagePath` to map each binary into the +correct runtime folder. This is the primary complexity in C# packaging. 
+ +--- + +### C++ + +**Track A — subprocess:** + +```cpp +#include <array> +#include <cstdio> +#include <memory> +#include <stdexcept> +#include <string> + +std::string pdftract_extract_json(const std::string& pdf_path) { + std::string cmd = "pdftract extract \"" + pdf_path + "\""; + std::array<char, 4096> buf{}; + std::string result; + + std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose); + if (!pipe) throw std::runtime_error("popen failed"); + + while (fgets(buf.data(), buf.size(), pipe.get()) != nullptr) + result += buf.data(); + + return result; +} + +std::string pdftract_extract_text(const std::string& pdf_path) { + std::string cmd = "pdftract extract --text \"" + pdf_path + "\""; + std::array<char, 4096> buf{}; + std::string result; + + std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose); + if (!pipe) throw std::runtime_error("popen failed"); + + while (fgets(buf.data(), buf.size(), pipe.get()) != nullptr) + result += buf.data(); + + return result; +} +``` + +**Track A — HTTP (using libcurl):** + +```cpp +#include <curl/curl.h> +#include <string> + +static size_t write_cb(char* ptr, size_t size, size_t nmemb, std::string* data) { + data->append(ptr, size * nmemb); + return size * nmemb; +} + +std::string pdftract_http_extract(const std::string& pdf_path, + const std::string& base_url = "http://localhost:8080") { + CURL* curl = curl_easy_init(); + if (!curl) throw std::runtime_error("curl_easy_init failed"); + + std::string response; + curl_mime* mime = curl_mime_init(curl); + curl_mimepart* part = curl_mime_addpart(mime); + curl_mime_name(part, "file"); + curl_mime_filedata(part, pdf_path.c_str()); + + curl_easy_setopt(curl, CURLOPT_URL, (base_url + "/extract").c_str()); + curl_easy_setopt(curl, CURLOPT_MIMEPOST, mime); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response); + + CURLcode res = curl_easy_perform(curl); + curl_mime_free(mime); + curl_easy_cleanup(curl); + + if (res != CURLE_OK) + throw std::runtime_error(curl_easy_strerror(res)); + + return response; +} +``` + +**Track B — 
native:** `cbindgen` generates `pdftract.h`; link against `libpdftract.so` / +`pdftract.dll`. Distribute as a vcpkg port or conan recipe with the header and shared library. +C++ has no standard package manager, so provide both options. + +--- + +### PHP + +**Track A — subprocess:** + +```php +<?php + +class Pdftract +{ + public function __construct(private string $binaryPath = 'pdftract') {} + + public function extract(string $pdfPath): array + { + $cmd = escapeshellarg($this->binaryPath) + . ' extract ' + . escapeshellarg($pdfPath); + + $descriptors = [ + 1 => ['pipe', 'w'], + 2 => ['pipe', 'w'], + ]; + + $proc = proc_open($cmd, $descriptors, $pipes); + if (!is_resource($proc)) { + throw new RuntimeException('Failed to start pdftract'); + } + + $stdout = stream_get_contents($pipes[1]); + $stderr = stream_get_contents($pipes[2]); + fclose($pipes[1]); + fclose($pipes[2]); + $exit = proc_close($proc); + + if ($exit !== 0) { + throw new RuntimeException("pdftract exited $exit: $stderr"); + } + + return json_decode($stdout, true, 512, JSON_THROW_ON_ERROR); + } + + public function extractText(string $pdfPath): string + { + $result = $this->extract($pdfPath); + $lines = []; + foreach ($result['pages'] as $page) { + foreach ($page['blocks'] as $block) { + $lines[] = $block['text']; + } + } + return implode("\n\n", $lines); + } + + public function extractPage(string $pdfPath, int $page): array + { + $result = $this->extract($pdfPath); + foreach ($result['pages'] as $p) { + if ($p['page'] === $page) return $p; + } + throw new OutOfRangeException("Page $page not found"); + } +} +``` + +**Track A — HTTP:** + +```php +<?php + +class PdftractHttpClient +{ + public function __construct(private string $baseUrl = 'http://localhost:8080') {} + + public function extract(string $pdfPath): array + { + $ch = curl_init($this->baseUrl . 
'/extract'); + curl_setopt_array($ch, [ + CURLOPT_RETURNTRANSFER => true, + CURLOPT_POST => true, + CURLOPT_POSTFIELDS => ['file' => new CURLFile($pdfPath, 'application/pdf')], + ]); + + $response = curl_exec($ch); + $status = curl_getinfo($ch, CURLINFO_HTTP_CODE); + curl_close($ch); + + if ($status !== 200) { + throw new RuntimeException("HTTP $status from pdftract serve"); + } + + return json_decode($response, true, 512, JSON_THROW_ON_ERROR); + } + + public function extractText(string $pdfPath): string + { + $result = $this->extract($pdfPath); + $lines = array_map( + fn($page) => implode("\n", array_column($page['blocks'], 'text')), + $result['pages'] + ); + return implode("\n\n", $lines); + } +} +``` + +**Track B — native:** `ext-php-rs` compiles a PHP extension in Rust directly. Alternatively, +PHP FFI (`FFI::load`, available since PHP 7.4) can call into a C ABI shared library without writing a C extension. +The FFI approach is easier to distribute but has higher per-call overhead than a compiled +extension. + +**Distribution:** Composer package (`packagist.org`). The package downloads the platform binary +in a post-install script. PHP extension distribution requires `pecl` and per-version compilation, +which is significant maintenance overhead — subprocess Track A is the right starting point. + +--- + +### Kotlin + +The JVM is shared with Java, so the subprocess implementation is the same `ProcessBuilder` +approach; the HTTP example below uses Ktor's `HttpClient` rather than `java.net.http.HttpClient`. The Kotlin wrapper adds idiomatic sugar: coroutines for +async, extension functions, and data classes for the JSON model. 
+ +**Subprocess:** + +```kotlin +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.withContext +import kotlinx.serialization.Serializable +import kotlinx.serialization.json.Json +import java.io.File + +@Serializable +data class Span(val text: String, val bbox: List<Double>, val font: String, + val size: Double, val confidence: Double) + +@Serializable +data class Block(val kind: String, val text: String, val bbox: List<Double>) + +@Serializable +data class Page(val page: Int, val spans: List<Span>, val blocks: List<Block>) + +@Serializable +data class Metadata(val title: String? = null, val author: String? = null, + val page_count: Int) + +@Serializable +data class ExtractionResult(val pages: List<Page>, val metadata: Metadata) + +class Pdftract(private val binaryPath: String = "pdftract") { + + private val json = Json { ignoreUnknownKeys = true } + + suspend fun extract(pdfPath: String): ExtractionResult = withContext(Dispatchers.IO) { + val process = ProcessBuilder(binaryPath, "extract", pdfPath) + .redirectErrorStream(false) + .start() + + val stdout = process.inputStream.bufferedReader().readText() + val stderr = process.errorStream.bufferedReader().readText() + val exit = process.waitFor() + + if (exit != 0) throw RuntimeException("pdftract exited $exit: $stderr") + json.decodeFromString<ExtractionResult>(stdout) + } + + suspend fun extractText(pdfPath: String): String = + extract(pdfPath).pages + .flatMap { it.blocks } + .joinToString("\n\n") { it.text } + + suspend fun extractPage(pdfPath: String, page: Int): Page = + extract(pdfPath).pages.first { it.page == page } +} +``` + +**HTTP:** + +```kotlin +import io.ktor.client.* +import io.ktor.client.request.forms.* +import io.ktor.client.statement.* +import io.ktor.http.* +import java.io.File + +class PdftractHttpClient( + private val baseUrl: String = "http://localhost:8080", + private val client: HttpClient = HttpClient() +) { + private val json = Json { ignoreUnknownKeys = true } + + suspend fun extract(pdfPath: String): ExtractionResult { + val 
file = File(pdfPath) + val response: HttpResponse = client.submitFormWithBinaryData( + url = "$baseUrl/extract", + formData = formData { + append("file", file.readBytes(), Headers.build { + append(HttpHeaders.ContentType, "application/pdf") + append(HttpHeaders.ContentDisposition, "filename=\"${file.name}\"") + }) + } + ) + return json.decodeFromString<ExtractionResult>(response.bodyAsText()) + } + + suspend fun extractText(pdfPath: String): String = + extract(pdfPath).pages + .flatMap { it.blocks } + .joinToString("\n\n") { it.text } +} +``` + +**Distribution:** Maven Central, same artifact group as the Java package (`com.pdftract`). +Separate artifact ID (`pdftract-kotlin`) so Java users don't pull in Kotlin stdlib. + +--- + +## Implementation sequencing + +| Priority | Language | Effort | Rationale | +|---|---|---|---| +| 1 | TypeScript | Half a day | Type definitions on top of existing JS code | +| 2 | Kotlin | Half a day | JVM wrapper on top of existing Java code | +| 3 | C# | 1–2 days | Subprocess is straightforward; NuGet RID packaging is the complexity | +| 4 | PHP | 1 day | Composer subprocess wrapper; avoid extension track initially | +| 5 | C++ | 1–2 days | `popen` + libcurl; no package manager standard, distribute as vcpkg port and conan recipe | + +All five are blocked on the GitHub Releases binary distribution infrastructure being in place first.