pdftract/pdftract-go/pdftract.go
jedarden 5781d67d5c fix(pdftract-2pyln): add source parameter to invoke methods for BytesSource cleanup
- Add source Source parameter to invoke, invokeJSON, invokeString, invokeStream
- Change BytesSource from []byte type to struct with data and tmpPath fields
- Add proper cleanup of temporary files after subprocess execution
- Fix source parameter pass-through in Extract, ExtractText, ExtractMarkdown, GetMetadata, Hash, Classify

This ensures BytesSource temporary files are cleaned up after use, preventing
file descriptor leaks. The BytesSource now creates a temp file on demand and
cleans it up automatically via defer in the invoke methods.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-20 19:08:14 -04:00

180 lines
4.6 KiB
Go

package pdftract
import (
	"context"
	"errors"
	"fmt"
	"os"
	"os/exec"
)
// Client represents a pdftract SDK client bound to a specific pdftract binary.
type Client struct {
	// binaryPath is the filesystem path of the pdftract binary.
	binaryPath string
}

// NewClient creates a new Client that uses the pdftract binary at path.
// If path is empty, the binary is looked up in PATH with exec.LookPath.
//
// An error is returned when the PATH lookup fails, when the resolved path
// cannot be stat'ed, or when the resolved path is a directory.
func NewClient(path string) (*Client, error) {
	binaryPath := path
	if binaryPath == "" {
		found, err := exec.LookPath("pdftract")
		if err != nil {
			return nil, fmt.Errorf("pdftract binary not found in PATH: %w", err)
		}
		binaryPath = found
	}

	// Verify that the path exists and is not a directory. os.Stat succeeds
	// for directories, which can never be executed, so reject them here
	// instead of failing later on the first invocation.
	info, err := os.Stat(binaryPath)
	if err != nil {
		return nil, fmt.Errorf("pdftract binary not found at %s: %w", binaryPath, err)
	}
	if info.IsDir() {
		return nil, fmt.Errorf("pdftract binary path %s is a directory", binaryPath)
	}

	return &Client{binaryPath: binaryPath}, nil
}
// MustNewClient is like NewClient but panics with the returned error instead
// of handing it back to the caller. It is intended for short-lived programs
// where the binary path is known.
func MustNewClient(path string) *Client {
	c, err := NewClient(path)
	if err == nil {
		return c
	}
	panic(err)
}
// Extract extracts structured data from a PDF.
//
// The command line is "extract --json", followed by the arguments produced by
// source, followed by the arguments produced by opts when opts is non-nil.
// The result is written into a Document by invokeJSON; on error the returned
// Document is nil.
func (c *Client) Extract(ctx context.Context, source Source, opts *ExtractOptions) (*Document, error) {
	args := append([]string{"extract", "--json"}, source.source()...)
	if opts != nil {
		args = append(args, opts.toArgs()...)
	}

	doc := new(Document)
	if err := c.invokeJSON(ctx, args, doc, source); err != nil {
		return nil, err
	}
	return doc, nil
}
// ExtractText extracts plain text from a PDF.
//
// The command line is "extract --text", followed by the arguments produced by
// source, followed by the arguments produced by opts when opts is non-nil.
// The string returned by invokeString is passed through unchanged.
func (c *Client) ExtractText(ctx context.Context, source Source, opts *ExtractOptions) (string, error) {
	args := append([]string{"extract", "--text"}, source.source()...)
	if opts == nil {
		return c.invokeString(ctx, args, source)
	}
	return c.invokeString(ctx, append(args, opts.toArgs()...), source)
}
// ExtractMarkdown extracts markdown-formatted text from a PDF.
//
// The command line is "extract --md", followed by the arguments produced by
// source, followed by the arguments produced by opts when opts is non-nil.
func (c *Client) ExtractMarkdown(ctx context.Context, source Source, opts *ExtractOptions) (string, error) {
	args := append([]string{"extract", "--md"}, source.source()...)
	if opts != nil {
		args = append(args, opts.toArgs()...)
	}

	markdown, err := c.invokeString(ctx, args, source)
	return markdown, err
}
// ExtractStream extracts pages from a PDF as a stream of PageResult values.
// It is the exported entry point for the unexported extractStream method and
// forwards its arguments and results unchanged.
func (c *Client) ExtractStream(ctx context.Context, source Source, opts *ExtractOptions) (<-chan PageResult, error) {
	pages, err := c.extractStream(ctx, source, opts)
	return pages, err
}
// Search searches for pattern in a PDF and delivers matches on the returned
// channel. It is the exported entry point for the unexported search method
// and forwards its arguments and results unchanged.
func (c *Client) Search(ctx context.Context, source Source, pattern string, opts *SearchOptions) (<-chan MatchResult, error) {
	matches, err := c.search(ctx, source, pattern, opts)
	return matches, err
}
// GetMetadata extracts metadata from a PDF.
//
// The command line is "extract --metadata-only", followed by the arguments
// produced by source, followed by the arguments produced by opts when opts is
// non-nil. The result is read from the "metadata" field of the JSON object
// handed to invokeJSON; on error the returned Metadata is nil.
func (c *Client) GetMetadata(ctx context.Context, source Source, opts *ExtractOptions) (*Metadata, error) {
	args := append([]string{"extract", "--metadata-only"}, source.source()...)
	if opts != nil {
		args = append(args, opts.toArgs()...)
	}

	// The metadata is wrapped in an outer object under the "metadata" key.
	var payload struct {
		Metadata Metadata `json:"metadata"`
	}
	err := c.invokeJSON(ctx, args, &payload, source)
	if err != nil {
		return nil, err
	}
	return &payload.Metadata, nil
}
// Hash computes the fingerprint hash of a PDF.
//
// The command line is "hash", followed by the arguments produced by source,
// followed by the arguments produced by opts when opts is non-nil. The result
// is written into a Fingerprint by invokeJSON; on error the returned
// Fingerprint is nil.
func (c *Client) Hash(ctx context.Context, source Source, opts *HashOptions) (*Fingerprint, error) {
	args := append([]string{"hash"}, source.source()...)
	if opts != nil {
		args = append(args, opts.toArgs()...)
	}

	fingerprint := new(Fingerprint)
	if err := c.invokeJSON(ctx, args, fingerprint, source); err != nil {
		return nil, err
	}
	return fingerprint, nil
}
// Classify classifies a PDF document.
//
// The command line is "classify" followed by the arguments produced by
// source. The result is written into a Classification by invokeJSON; on error
// the returned Classification is nil.
func (c *Client) Classify(ctx context.Context, source Source) (*Classification, error) {
	args := append([]string{"classify"}, source.source()...)

	classification := new(Classification)
	if err := c.invokeJSON(ctx, args, classification, source); err != nil {
		return nil, err
	}
	return classification, nil
}
// VerifyReceipt verifies a cryptographic receipt for the PDF at path.
//
// The receipt file is taken to be path + ".receipt.json" and the CLI is
// invoked with "verify-receipt", path, and that receipt path. The receipt
// argument is currently not used by this method.
//
// It returns (true, nil) when the invocation succeeds, (false, nil) when the
// invocation fails with a *ReceiptVerifyError anywhere in the error chain,
// and (false, err) for any other failure.
func (c *Client) VerifyReceipt(ctx context.Context, path string, receipt *Receipt) (bool, error) {
	receiptPath := path + ".receipt.json"

	// For now, we call the CLI with the receipt path.
	// TODO: Implement proper receipt verification once the CLI supports it
	args := []string{"verify-receipt", path, receiptPath}
	if _, err := c.invoke(ctx, args); err != nil {
		// errors.As also matches a ReceiptVerifyError that has been wrapped
		// with additional context, which a direct type assertion would miss.
		var verifyErr *ReceiptVerifyError
		if errors.As(err, &verifyErr) {
			// Receipt verification failed: a negative result, not an error.
			return false, nil
		}
		return false, err
	}
	return true, nil
}
// BinaryPath reports the filesystem path of the pdftract binary that this
// Client was created with.
func (c *Client) BinaryPath() string {
	return c.binaryPath
}
// Version returns the pdftract binary version.
//
// It invokes the binary with the single argument "--version" and returns the
// output converted to a string, exactly as produced by invoke. On error the
// returned string is empty.
func (c *Client) Version(ctx context.Context) (string, error) {
	out, err := c.invoke(ctx, []string{"--version"})
	if err != nil {
		return "", err
	}

	version := string(out)
	return version, nil
}