- Add source Source parameter to invoke, invokeJSON, invokeString, invokeStream
- Change BytesSource from []byte type to struct with data and tmpPath fields
- Add proper cleanup of temporary files after subprocess execution
- Fix source parameter pass-through in Extract, ExtractText, ExtractMarkdown, GetMetadata, Hash, Classify

This ensures BytesSource temporary files are cleaned up after use, preventing file descriptor leaks. The BytesSource now creates a temp file on demand and cleans it up automatically via defer in the invoke methods.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
173 lines
4 KiB
Go
173 lines
4 KiB
Go
package pdftract
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"os/exec"
|
|
"sync"
|
|
)
|
|
|
|
// subprocessResult pairs the output bytes of a single subprocess execution
// with the error, if any, that the execution produced.
type subprocessResult struct {
	output []byte // bytes produced by the subprocess
	err    error  // error associated with the execution; nil when there is none
}
|
|
|
|
// invoke executes the pdftract binary with the given arguments and context.
|
|
// It returns the combined stdout/stderr output and any error that occurred.
|
|
func (c *Client) invoke(ctx context.Context, args []string, source Source) ([]byte, error) {
|
|
cmd := exec.CommandContext(ctx, c.binaryPath, args...)
|
|
|
|
// Set up cancellation to kill the process
|
|
if ctx.Done() != nil {
|
|
var once sync.Once
|
|
cmd.Cancel = func() error {
|
|
once.Do(func() {
|
|
if cmd.Process != nil {
|
|
cmd.Process.Kill()
|
|
}
|
|
})
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// Ensure cleanup of BytesSource temp files
|
|
if bs, ok := source.(*BytesSource); ok {
|
|
defer bs.cleanup()
|
|
}
|
|
|
|
output, err := cmd.CombinedOutput()
|
|
if err != nil {
|
|
if ctx.Err() != nil {
|
|
return nil, fmt.Errorf("pdftract cancelled: %w", ctx.Err())
|
|
}
|
|
|
|
// Map exit codes to specific error types
|
|
if exitErr, ok := err.(*exec.ExitError); ok {
|
|
exitCode := exitErr.ExitCode()
|
|
stderr := string(output)
|
|
return nil, mapExitCodeToError(exitCode, stderr)
|
|
}
|
|
|
|
return nil, fmt.Errorf("pdftract execution failed: %w", err)
|
|
}
|
|
|
|
return output, nil
|
|
}
|
|
|
|
// invokeJSON executes the pdftract binary and parses the output as JSON.
|
|
func (c *Client) invokeJSON(ctx context.Context, args []string, result interface{}, source Source) error {
|
|
output, err := c.invoke(ctx, args, source)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := json.Unmarshal(output, result); err != nil {
|
|
return &PdftractError{
|
|
Kind: ErrKindUnknown,
|
|
Message: fmt.Sprintf("failed to parse JSON output: %v", err),
|
|
ExitCode: -1,
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// invokeString executes the pdftract binary and returns the output as a string.
|
|
func (c *Client) invokeString(ctx context.Context, args []string, source Source) (string, error) {
|
|
output, err := c.invoke(ctx, args, source)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return string(output), nil
|
|
}
|
|
|
|
// invokeStream executes the pdftract binary and streams JSONL output to a channel.
|
|
func (c *Client) invokeStream(ctx context.Context, args []string, source Source) (<-chan json.RawMessage, <-chan error) {
|
|
resultChan := make(chan json.RawMessage, 16)
|
|
errChan := make(chan error, 1)
|
|
|
|
go func() {
|
|
defer close(resultChan)
|
|
defer close(errChan)
|
|
|
|
// Ensure cleanup of BytesSource temp files
|
|
if bs, ok := source.(*BytesSource); ok {
|
|
defer bs.cleanup()
|
|
}
|
|
|
|
cmd := exec.CommandContext(ctx, c.binaryPath, args...)
|
|
|
|
// Set up cancellation to kill the process
|
|
if ctx.Done() != nil {
|
|
var once sync.Once
|
|
cmd.Cancel = func() error {
|
|
once.Do(func() {
|
|
if cmd.Process != nil {
|
|
cmd.Process.Kill()
|
|
}
|
|
})
|
|
return nil
|
|
}
|
|
}
|
|
|
|
stdout, err := cmd.StdoutPipe()
|
|
if err != nil {
|
|
errChan <- fmt.Errorf("failed to create stdout pipe: %w", err)
|
|
return
|
|
}
|
|
|
|
stderr := &bytes.Buffer{}
|
|
cmd.Stderr = stderr
|
|
|
|
if err := cmd.Start(); err != nil {
|
|
errChan <- fmt.Errorf("failed to start process: %w", err)
|
|
return
|
|
}
|
|
|
|
decoder := json.NewDecoder(stdout)
|
|
for {
|
|
var raw json.RawMessage
|
|
if err := decoder.Decode(&raw); err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
if ctx.Err() != nil {
|
|
errChan <- fmt.Errorf("pdftract cancelled: %w", ctx.Err())
|
|
return
|
|
}
|
|
errChan <- fmt.Errorf("failed to decode JSON: %w", err)
|
|
return
|
|
}
|
|
select {
|
|
case resultChan <- raw:
|
|
case <-ctx.Done():
|
|
errChan <- fmt.Errorf("pdftract cancelled: %w", ctx.Err())
|
|
cmd.Process.Kill()
|
|
return
|
|
}
|
|
}
|
|
|
|
if err := cmd.Wait(); err != nil {
|
|
if ctx.Err() != nil {
|
|
errChan <- fmt.Errorf("pdftract cancelled: %w", ctx.Err())
|
|
return
|
|
}
|
|
|
|
if exitErr, ok := err.(*exec.ExitError); ok {
|
|
exitCode := exitErr.ExitCode()
|
|
stderrStr := stderr.String()
|
|
errChan <- mapExitCodeToError(exitCode, stderrStr)
|
|
return
|
|
}
|
|
|
|
errChan <- fmt.Errorf("pdftract execution failed: %w", err)
|
|
return
|
|
}
|
|
}()
|
|
|
|
return resultChan, errChan
|
|
}
|