- Add source Source parameter to invoke, invokeJSON, invokeString, invokeStream. - Change BytesSource from []byte type to struct with data and tmpPath fields. - Add proper cleanup of temporary files after subprocess execution. - Fix source parameter pass-through in Extract, ExtractText, ExtractMarkdown, GetMetadata, Hash, Classify. This ensures BytesSource temporary files are cleaned up after use, preventing file descriptor leaks. The BytesSource now creates a temp file on demand, and the invoke methods clean it up automatically via defer. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
180 lines
4.6 KiB
Go
180 lines
4.6 KiB
Go
package pdftract
|
|
|
|
import (
	"context"
	"errors"
	"fmt"
	"os"
	"os/exec"
)
|
|
|
|
// Client represents a pdftract SDK client.
type Client struct {
	// binaryPath is the path to the pdftract binary.
	binaryPath string
}

// NewClient creates a new Client with the pdftract binary at the given path.
//
// If path is empty, the binary is located by searching PATH for "pdftract".
// An error is returned when that lookup fails, when the resolved path cannot
// be stat'ed, or when it refers to a directory.
func NewClient(path string) (*Client, error) {
	binaryPath := path
	if binaryPath == "" {
		// Use a distinct name so the lookup result does not shadow the
		// path parameter.
		found, err := exec.LookPath("pdftract")
		if err != nil {
			return nil, fmt.Errorf("pdftract binary not found in PATH: %w", err)
		}
		binaryPath = found
	}

	// Verify the path exists and is not a directory. A directory passes
	// os.Stat but can never be executed, so reject it here instead of
	// failing later on the first invocation.
	info, err := os.Stat(binaryPath)
	if err != nil {
		return nil, fmt.Errorf("pdftract binary not found at %s: %w", binaryPath, err)
	}
	if info.IsDir() {
		return nil, fmt.Errorf("pdftract binary path %s is a directory", binaryPath)
	}

	return &Client{binaryPath: binaryPath}, nil
}
|
|
|
|
// MustNewClient creates a new Client and panics if it fails.
|
|
// Useful for short-lived programs where the binary path is known.
|
|
func MustNewClient(path string) *Client {
|
|
client, err := NewClient(path)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
return client
|
|
}
|
|
|
|
// Extract extracts structured data from a PDF.
|
|
func (c *Client) Extract(ctx context.Context, source Source, opts *ExtractOptions) (*Document, error) {
|
|
args := []string{"extract", "--json"}
|
|
args = append(args, source.source()...)
|
|
|
|
if opts != nil {
|
|
args = append(args, opts.toArgs()...)
|
|
}
|
|
|
|
var doc Document
|
|
if err := c.invokeJSON(ctx, args, &doc, source); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &doc, nil
|
|
}
|
|
|
|
// ExtractText extracts plain text from a PDF.
|
|
func (c *Client) ExtractText(ctx context.Context, source Source, opts *ExtractOptions) (string, error) {
|
|
args := []string{"extract", "--text"}
|
|
args = append(args, source.source()...)
|
|
|
|
if opts != nil {
|
|
args = append(args, opts.toArgs()...)
|
|
}
|
|
|
|
return c.invokeString(ctx, args, source)
|
|
}
|
|
|
|
// ExtractMarkdown extracts markdown-formatted text from a PDF.
|
|
func (c *Client) ExtractMarkdown(ctx context.Context, source Source, opts *ExtractOptions) (string, error) {
|
|
args := []string{"extract", "--md"}
|
|
args = append(args, source.source()...)
|
|
|
|
if opts != nil {
|
|
args = append(args, opts.toArgs()...)
|
|
}
|
|
|
|
return c.invokeString(ctx, args, source)
|
|
}
|
|
|
|
// ExtractStream extracts pages from a PDF as a stream.
|
|
func (c *Client) ExtractStream(ctx context.Context, source Source, opts *ExtractOptions) (<-chan PageResult, error) {
|
|
return c.extractStream(ctx, source, opts)
|
|
}
|
|
|
|
// Search searches for a pattern in a PDF.
|
|
func (c *Client) Search(ctx context.Context, source Source, pattern string, opts *SearchOptions) (<-chan MatchResult, error) {
|
|
return c.search(ctx, source, pattern, opts)
|
|
}
|
|
|
|
// GetMetadata extracts metadata from a PDF.
|
|
func (c *Client) GetMetadata(ctx context.Context, source Source, opts *ExtractOptions) (*Metadata, error) {
|
|
args := []string{"extract", "--metadata-only"}
|
|
args = append(args, source.source()...)
|
|
|
|
if opts != nil {
|
|
args = append(args, opts.toArgs()...)
|
|
}
|
|
|
|
var result struct {
|
|
Metadata Metadata `json:"metadata"`
|
|
}
|
|
if err := c.invokeJSON(ctx, args, &result, source); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &result.Metadata, nil
|
|
}
|
|
|
|
// Hash computes the fingerprint hash of a PDF.
|
|
func (c *Client) Hash(ctx context.Context, source Source, opts *HashOptions) (*Fingerprint, error) {
|
|
args := []string{"hash"}
|
|
args = append(args, source.source()...)
|
|
|
|
if opts != nil {
|
|
args = append(args, opts.toArgs()...)
|
|
}
|
|
|
|
var fp Fingerprint
|
|
if err := c.invokeJSON(ctx, args, &fp, source); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &fp, nil
|
|
}
|
|
|
|
// Classify classifies a PDF document.
|
|
func (c *Client) Classify(ctx context.Context, source Source) (*Classification, error) {
|
|
args := []string{"classify"}
|
|
args = append(args, source.source()...)
|
|
|
|
var cls Classification
|
|
if err := c.invokeJSON(ctx, args, &cls, source); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &cls, nil
|
|
}
|
|
|
|
// VerifyReceipt verifies a cryptographic receipt for a PDF.
|
|
func (c *Client) VerifyReceipt(ctx context.Context, path string, receipt *Receipt) (bool, error) {
|
|
receiptPath := path + ".receipt.json"
|
|
|
|
// For now, we'll call the CLI with the receipt path
|
|
// TODO: Implement proper receipt verification once the CLI supports it
|
|
args := []string{"verify-receipt", path, receiptPath}
|
|
|
|
_, err := c.invoke(ctx, args)
|
|
if err != nil {
|
|
if _, ok := err.(*ReceiptVerifyError); ok {
|
|
// Receipt verification failed
|
|
return false, nil
|
|
}
|
|
return false, err
|
|
}
|
|
|
|
return true, nil
|
|
}
|
|
|
|
// BinaryPath returns the path to the pdftract binary.
|
|
func (c *Client) BinaryPath() string {
|
|
return c.binaryPath
|
|
}
|
|
|
|
// Version returns the pdftract binary version.
|
|
func (c *Client) Version(ctx context.Context) (string, error) {
|
|
args := []string{"--version"}
|
|
output, err := c.invoke(ctx, args)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return string(output), nil
|
|
}
|