fix(pdftract-2pyln): add source parameter to invoke methods for BytesSource cleanup

- Add source Source parameter to invoke, invokeJSON, invokeString, invokeStream
- Change BytesSource from []byte type to struct with data and tmpPath fields
- Add proper cleanup of temporary files after subprocess execution
- Fix source parameter pass-through in Extract, ExtractText, ExtractMarkdown, GetMetadata, Hash, Classify

This ensures BytesSource temporary files are removed after use, so they no
longer accumulate in the system temp directory. BytesSource now writes its data
to a temp file on demand, and the invoke methods delete that file via defer.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-05-20 19:08:14 -04:00
parent e0dea12849
commit 5781d67d5c
3 changed files with 59 additions and 20 deletions

View file

@ -54,7 +54,7 @@ func (c *Client) Extract(ctx context.Context, source Source, opts *ExtractOption
}
var doc Document
if err := c.invokeJSON(ctx, args, &doc); err != nil {
if err := c.invokeJSON(ctx, args, &doc, source); err != nil {
return nil, err
}
@ -70,7 +70,7 @@ func (c *Client) ExtractText(ctx context.Context, source Source, opts *ExtractOp
args = append(args, opts.toArgs()...)
}
return c.invokeString(ctx, args)
return c.invokeString(ctx, args, source)
}
// ExtractMarkdown extracts markdown-formatted text from a PDF.
@ -82,7 +82,7 @@ func (c *Client) ExtractMarkdown(ctx context.Context, source Source, opts *Extra
args = append(args, opts.toArgs()...)
}
return c.invokeString(ctx, args)
return c.invokeString(ctx, args, source)
}
// ExtractStream extracts pages from a PDF as a stream.
@ -107,7 +107,7 @@ func (c *Client) GetMetadata(ctx context.Context, source Source, opts *ExtractOp
var result struct {
Metadata Metadata `json:"metadata"`
}
if err := c.invokeJSON(ctx, args, &result); err != nil {
if err := c.invokeJSON(ctx, args, &result, source); err != nil {
return nil, err
}
@ -124,7 +124,7 @@ func (c *Client) Hash(ctx context.Context, source Source, opts *HashOptions) (*F
}
var fp Fingerprint
if err := c.invokeJSON(ctx, args, &fp); err != nil {
if err := c.invokeJSON(ctx, args, &fp, source); err != nil {
return nil, err
}
@ -137,7 +137,7 @@ func (c *Client) Classify(ctx context.Context, source Source) (*Classification,
args = append(args, source.source()...)
var cls Classification
if err := c.invokeJSON(ctx, args, &cls); err != nil {
if err := c.invokeJSON(ctx, args, &cls, source); err != nil {
return nil, err
}

View file

@ -34,10 +34,43 @@ func (u URLSource) source() []string {
}
// BytesSource represents in-memory PDF bytes.
type BytesSource []byte
// The temporary file created for subprocess consumption is cleaned up after use.
type BytesSource struct {
// data holds the raw PDF bytes; source writes them to a temporary file.
data []byte
// tmpPath is the path of that temporary file. It is empty until source
// creates the file, and cleanup resets it to empty after removing the file.
tmpPath string
}
func (b BytesSource) source() []string {
return []string{"--bytes-data", string(b)}
// MemorySource wraps an in-memory PDF byte slice in a *BytesSource and returns
// it as a Source. The slice is stored as given, without copying.
func MemorySource(data []byte) Source {
	src := BytesSource{data: data}
	return &src
}
// source returns the command-line arguments that point the pdftract subprocess
// at this PDF: a single-element slice holding the path of a temporary file
// containing b.data.
//
// The file is created on the first call and its path is cached in b.tmpPath,
// so later calls return the same path until cleanup resets it.
//
// The method has no error return, so a failure to create, write, or close the
// temporary file panics. A partially written file is removed before panicking
// so that a failed call leaves nothing behind on disk.
func (b *BytesSource) source() []string {
	if b.tmpPath != "" {
		return []string{b.tmpPath}
	}

	// Write to a temporary file for subprocess consumption.
	tmpFile, err := os.CreateTemp("", "pdftract-*.pdf")
	if err != nil {
		panic(fmt.Sprintf("failed to create temp file for BytesSource: %v", err))
	}

	_, writeErr := tmpFile.Write(b.data)
	// Close explicitly rather than via defer: a close error can mean the data
	// never reached the file, and the subprocess must not read a truncated PDF.
	closeErr := tmpFile.Close()
	if writeErr != nil || closeErr != nil {
		// b.tmpPath is still empty here, so cleanup would never find this file.
		_ = os.Remove(tmpFile.Name()) // best effort; we are about to panic anyway
		if writeErr != nil {
			panic(fmt.Sprintf("failed to write data to temp file: %v", writeErr))
		}
		panic(fmt.Sprintf("failed to close temp file for BytesSource: %v", closeErr))
	}

	b.tmpPath = tmpFile.Name()
	return []string{b.tmpPath}
}
// cleanup deletes the temporary file recorded in b.tmpPath, if any, and then
// clears the field. Paths beginning with "--error:" are left untouched. The
// result of os.Remove is discarded.
func (b *BytesSource) cleanup() {
	if b.tmpPath == "" {
		return
	}
	if strings.HasPrefix(b.tmpPath, "--error:") {
		return
	}
	os.Remove(b.tmpPath)
	b.tmpPath = ""
}
// FileSource is a convenience constructor that creates a PathSource from a string.
@ -53,10 +86,6 @@ func RemoteSource(url string) Source {
return URLSource(url)
}
// MemorySource is a convenience constructor that creates a BytesSource from a byte slice.
func MemorySource(data []byte) Source {
return BytesSource(data)
}
// ReadFileSource reads a file and returns a BytesSource.
func ReadFileSource(path string) (Source, error) {
@ -64,5 +93,5 @@ func ReadFileSource(path string) (Source, error) {
if err != nil {
return nil, fmt.Errorf("failed to read file: %w", err)
}
return BytesSource(data), nil
return &BytesSource{data: data}, nil
}

View file

@ -18,7 +18,7 @@ type subprocessResult struct {
// invoke executes the pdftract binary with the given arguments and context.
// It returns the combined stdout/stderr output and any error that occurred.
func (c *Client) invoke(ctx context.Context, args []string) ([]byte, error) {
func (c *Client) invoke(ctx context.Context, args []string, source Source) ([]byte, error) {
cmd := exec.CommandContext(ctx, c.binaryPath, args...)
// Set up cancellation to kill the process
@ -34,6 +34,11 @@ func (c *Client) invoke(ctx context.Context, args []string) ([]byte, error) {
}
}
// Ensure cleanup of BytesSource temp files
if bs, ok := source.(*BytesSource); ok {
defer bs.cleanup()
}
output, err := cmd.CombinedOutput()
if err != nil {
if ctx.Err() != nil {
@ -54,8 +59,8 @@ func (c *Client) invoke(ctx context.Context, args []string) ([]byte, error) {
}
// invokeJSON executes the pdftract binary and parses the output as JSON.
func (c *Client) invokeJSON(ctx context.Context, args []string, result interface{}) error {
output, err := c.invoke(ctx, args)
func (c *Client) invokeJSON(ctx context.Context, args []string, result interface{}, source Source) error {
output, err := c.invoke(ctx, args, source)
if err != nil {
return err
}
@ -72,8 +77,8 @@ func (c *Client) invokeJSON(ctx context.Context, args []string, result interface
}
// invokeString executes the pdftract binary and returns the output as a string.
func (c *Client) invokeString(ctx context.Context, args []string) (string, error) {
output, err := c.invoke(ctx, args)
func (c *Client) invokeString(ctx context.Context, args []string, source Source) (string, error) {
output, err := c.invoke(ctx, args, source)
if err != nil {
return "", err
}
@ -81,7 +86,7 @@ func (c *Client) invokeString(ctx context.Context, args []string) (string, error
}
// invokeStream executes the pdftract binary and streams JSONL output to a channel.
func (c *Client) invokeStream(ctx context.Context, args []string) (<-chan json.RawMessage, <-chan error) {
func (c *Client) invokeStream(ctx context.Context, args []string, source Source) (<-chan json.RawMessage, <-chan error) {
resultChan := make(chan json.RawMessage, 16)
errChan := make(chan error, 1)
@ -89,6 +94,11 @@ func (c *Client) invokeStream(ctx context.Context, args []string) (<-chan json.R
defer close(resultChan)
defer close(errChan)
// Ensure cleanup of BytesSource temp files
if bs, ok := source.(*BytesSource); ok {
defer bs.cleanup()
}
cmd := exec.CommandContext(ctx, c.binaryPath, args...)
// Set up cancellation to kill the process