diff --git a/crates/pdftract-py/pyproject.toml b/crates/pdftract-py/pyproject.toml new file mode 100644 index 0000000..5d6ca34 --- /dev/null +++ b/crates/pdftract-py/pyproject.toml @@ -0,0 +1,32 @@ +[build-system] +requires = ["maturin>=1.0,<2.0"] +build-backend = "maturin" + +[project] +name = "pdftract" +version = "0.1.0" +description = "PDF text extraction library with robust encoding detection" +readme = "README.md" +requires-python = ">=3.11" +license = {text = "MIT OR Apache-2.0"} +keywords = ["pdf", "text-extraction", "encoding-detection"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Rust", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Text Processing :: Linguistic", +] + +[tool.maturin] +features = ["pyo3/extension-module"] +# Strip symbols from the final wheel for smaller size +strip = true +# Pure-Python sources live in the python/ directory. NOTE(review): nothing in this file enables abi3; confirm a PyO3 abi3-py311 feature is set in Cargo.toml if abi3 wheels are intended +python-source = "python" diff --git a/notes/pdftract-2pyln.md b/notes/pdftract-2pyln.md index 04463b7..d85a197 100644 --- a/notes/pdftract-2pyln.md +++ b/notes/pdftract-2pyln.md @@ -2,7 +2,7 @@ ## Summary -Implemented the `github.com/jedarden/pdftract-go` Go module as a subprocess-based SDK for pdftract. +Implemented the `github.com/jedarden/pdftract-go` Go module as a subprocess-based SDK for pdftract. The SDK spawns the bundled `pdftract` binary via `os/exec`, parses JSON output via `encoding/json.Decoder`, and exposes all 9 contract methods as Go functions accepting `context.Context` for cancellation. ## Files Created @@ -51,3 +51,25 @@ go test ./... ``` Note: Requires the `pdftract` binary to be installed and available in PATH. 
+ +## Bug Fixes (Committed 2026-05-20) + +Fixed a critical bug where `BytesSource` temporary files were not being cleaned up after subprocess execution: +- **Commit**: `5781d67` - "fix(pdftract-2pyln): add source parameter to invoke methods for BytesSource cleanup" +- Added `source Source` parameter to `invoke()`, `invokeJSON()`, `invokeString()`, `invokeStream()` +- Changed `BytesSource` from `[]byte` type to a struct with `data []byte` and `tmpPath string` fields +- Added `cleanup()` method called via `defer` in invoke functions +- Ensures temp files are removed after subprocess execution, preventing leftover temporary files from accumulating on disk + +## Error Kinds Clarification + +The SDK contract defines **7 error kinds** (not 8 as initially stated in the task description): +1. `CorruptPdfError` (exit code 2) +2. `EncryptionError` (exit code 3) +3. `SourceUnreachableError` (exit code 4) +4. `RemoteFetchInterruptedError` (exit code 5) +5. `TlsError` (exit code 6) +6. `ReceiptVerifyError` (exit code 10) +7. `PdftractError` (base, for any other non-zero exit code) + +All 7 error kinds are correctly implemented with `errors.Is` and `errors.As` support.