pdftract/pdftract-go/subprocess.go
jedarden 5781d67d5c fix(pdftract-2pyln): add source parameter to invoke methods for BytesSource cleanup
- Add source Source parameter to invoke, invokeJSON, invokeString, invokeStream
- Change BytesSource from []byte type to struct with data and tmpPath fields
- Add proper cleanup of temporary files after subprocess execution
- Fix source parameter pass-through in Extract, ExtractText, ExtractMarkdown, GetMetadata, Hash, Classify

This ensures BytesSource temporary files are cleaned up after use, preventing
file descriptor leaks. The BytesSource now creates a temp file on demand and
cleans it up automatically via defer in the invoke methods.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-20 19:08:14 -04:00

173 lines
4 KiB
Go

package pdftract
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"os/exec"
"sync"
)
// subprocessResult holds the result of a subprocess execution.
type subprocessResult struct {
output []byte
err error
}
// invoke executes the pdftract binary with the given arguments and context.
// It returns the combined stdout/stderr output and any error that occurred.
func (c *Client) invoke(ctx context.Context, args []string, source Source) ([]byte, error) {
cmd := exec.CommandContext(ctx, c.binaryPath, args...)
// Set up cancellation to kill the process
if ctx.Done() != nil {
var once sync.Once
cmd.Cancel = func() error {
once.Do(func() {
if cmd.Process != nil {
cmd.Process.Kill()
}
})
return nil
}
}
// Ensure cleanup of BytesSource temp files
if bs, ok := source.(*BytesSource); ok {
defer bs.cleanup()
}
output, err := cmd.CombinedOutput()
if err != nil {
if ctx.Err() != nil {
return nil, fmt.Errorf("pdftract cancelled: %w", ctx.Err())
}
// Map exit codes to specific error types
if exitErr, ok := err.(*exec.ExitError); ok {
exitCode := exitErr.ExitCode()
stderr := string(output)
return nil, mapExitCodeToError(exitCode, stderr)
}
return nil, fmt.Errorf("pdftract execution failed: %w", err)
}
return output, nil
}
// invokeJSON executes the pdftract binary and parses the output as JSON.
func (c *Client) invokeJSON(ctx context.Context, args []string, result interface{}, source Source) error {
output, err := c.invoke(ctx, args, source)
if err != nil {
return err
}
if err := json.Unmarshal(output, result); err != nil {
return &PdftractError{
Kind: ErrKindUnknown,
Message: fmt.Sprintf("failed to parse JSON output: %v", err),
ExitCode: -1,
}
}
return nil
}
// invokeString executes the pdftract binary and returns the output as a string.
func (c *Client) invokeString(ctx context.Context, args []string, source Source) (string, error) {
output, err := c.invoke(ctx, args, source)
if err != nil {
return "", err
}
return string(output), nil
}
// invokeStream executes the pdftract binary and streams JSONL output to a channel.
func (c *Client) invokeStream(ctx context.Context, args []string, source Source) (<-chan json.RawMessage, <-chan error) {
resultChan := make(chan json.RawMessage, 16)
errChan := make(chan error, 1)
go func() {
defer close(resultChan)
defer close(errChan)
// Ensure cleanup of BytesSource temp files
if bs, ok := source.(*BytesSource); ok {
defer bs.cleanup()
}
cmd := exec.CommandContext(ctx, c.binaryPath, args...)
// Set up cancellation to kill the process
if ctx.Done() != nil {
var once sync.Once
cmd.Cancel = func() error {
once.Do(func() {
if cmd.Process != nil {
cmd.Process.Kill()
}
})
return nil
}
}
stdout, err := cmd.StdoutPipe()
if err != nil {
errChan <- fmt.Errorf("failed to create stdout pipe: %w", err)
return
}
stderr := &bytes.Buffer{}
cmd.Stderr = stderr
if err := cmd.Start(); err != nil {
errChan <- fmt.Errorf("failed to start process: %w", err)
return
}
decoder := json.NewDecoder(stdout)
for {
var raw json.RawMessage
if err := decoder.Decode(&raw); err != nil {
if err == io.EOF {
break
}
if ctx.Err() != nil {
errChan <- fmt.Errorf("pdftract cancelled: %w", ctx.Err())
return
}
errChan <- fmt.Errorf("failed to decode JSON: %w", err)
return
}
select {
case resultChan <- raw:
case <-ctx.Done():
errChan <- fmt.Errorf("pdftract cancelled: %w", ctx.Err())
cmd.Process.Kill()
return
}
}
if err := cmd.Wait(); err != nil {
if ctx.Err() != nil {
errChan <- fmt.Errorf("pdftract cancelled: %w", ctx.Err())
return
}
if exitErr, ok := err.(*exec.ExitError); ok {
exitCode := exitErr.ExitCode()
stderrStr := stderr.String()
errChan <- mapExitCodeToError(exitCode, stderrStr)
return
}
errChan <- fmt.Errorf("pdftract execution failed: %w", err)
return
}
}()
return resultChan, errChan
}