diff --git a/pdftract-go/pdftract.go b/pdftract-go/pdftract.go index 8ad0905..b5fd005 100644 --- a/pdftract-go/pdftract.go +++ b/pdftract-go/pdftract.go @@ -54,7 +54,7 @@ func (c *Client) Extract(ctx context.Context, source Source, opts *ExtractOption } var doc Document - if err := c.invokeJSON(ctx, args, &doc); err != nil { + if err := c.invokeJSON(ctx, args, &doc, source); err != nil { return nil, err } @@ -70,7 +70,7 @@ func (c *Client) ExtractText(ctx context.Context, source Source, opts *ExtractOp args = append(args, opts.toArgs()...) } - return c.invokeString(ctx, args) + return c.invokeString(ctx, args, source) } // ExtractMarkdown extracts markdown-formatted text from a PDF. @@ -82,7 +82,7 @@ func (c *Client) ExtractMarkdown(ctx context.Context, source Source, opts *Extra args = append(args, opts.toArgs()...) } - return c.invokeString(ctx, args) + return c.invokeString(ctx, args, source) } // ExtractStream extracts pages from a PDF as a stream. @@ -107,7 +107,7 @@ func (c *Client) GetMetadata(ctx context.Context, source Source, opts *ExtractOp var result struct { Metadata Metadata `json:"metadata"` } - if err := c.invokeJSON(ctx, args, &result); err != nil { + if err := c.invokeJSON(ctx, args, &result, source); err != nil { return nil, err } @@ -124,7 +124,7 @@ func (c *Client) Hash(ctx context.Context, source Source, opts *HashOptions) (*F } var fp Fingerprint - if err := c.invokeJSON(ctx, args, &fp); err != nil { + if err := c.invokeJSON(ctx, args, &fp, source); err != nil { return nil, err } @@ -137,7 +137,7 @@ func (c *Client) Classify(ctx context.Context, source Source) (*Classification, args = append(args, source.source()...) var cls Classification - if err := c.invokeJSON(ctx, args, &cls); err != nil { + if err := c.invokeJSON(ctx, args, &cls, source); err != nil { return nil, err } diff --git a/pdftract-go/source.go b/pdftract-go/source.go index e0da7c3..0d74b55 100644 --- a/pdftract-go/source.go +++ b/pdftract-go/source.go @@ -34,10 +34,43 @@ func (u URLSource) source() []string { } // BytesSource represents in-memory PDF bytes. -type BytesSource []byte +// The temporary file created for subprocess consumption is cleaned up after use. +type BytesSource struct { + data []byte + tmpPath string +} -func (b BytesSource) source() []string { - return []string{"--bytes-data", string(b)} +// MemorySource is a convenience constructor that creates a BytesSource from a byte slice. +func MemorySource(data []byte) Source { + return &BytesSource{data: data} +} + +func (b *BytesSource) source() []string { + if b.tmpPath != "" { + return []string{b.tmpPath} + } + + // Write to a temporary file for subprocess consumption + tmpFile, err := os.CreateTemp("", "pdftract-*.pdf") + if err != nil { + panic(fmt.Sprintf("failed to create temp file for BytesSource: %v", err)) + } + defer tmpFile.Close() + + if _, err := tmpFile.Write(b.data); err != nil { + panic(fmt.Sprintf("failed to write data to temp file: %v", err)) + } + + b.tmpPath = tmpFile.Name() + return []string{b.tmpPath} +} + +// cleanup removes the temporary file if it was created. +func (b *BytesSource) cleanup() { + if b.tmpPath != "" && !strings.HasPrefix(b.tmpPath, "--error:") { + os.Remove(b.tmpPath) + b.tmpPath = "" + } } // FileSource is a convenience constructor that creates a PathSource from a string. @@ -53,10 +86,6 @@ func RemoteSource(url string) Source { return URLSource(url) } -// MemorySource is a convenience constructor that creates a BytesSource from a byte slice. -func MemorySource(data []byte) Source { - return BytesSource(data) -} // ReadFileSource reads a file and returns a BytesSource. func ReadFileSource(path string) (Source, error) { @@ -64,5 +93,5 @@ func ReadFileSource(path string) (Source, error) { if err != nil { return nil, fmt.Errorf("failed to read file: %w", err) } - return BytesSource(data), nil + return &BytesSource{data: data}, nil } diff --git a/pdftract-go/subprocess.go b/pdftract-go/subprocess.go index 8105cb7..660dbbd 100644 --- a/pdftract-go/subprocess.go +++ b/pdftract-go/subprocess.go @@ -18,7 +18,7 @@ type subprocessResult struct { // invoke executes the pdftract binary with the given arguments and context. // It returns the combined stdout/stderr output and any error that occurred. -func (c *Client) invoke(ctx context.Context, args []string) ([]byte, error) { +func (c *Client) invoke(ctx context.Context, args []string, source Source) ([]byte, error) { cmd := exec.CommandContext(ctx, c.binaryPath, args...) // Set up cancellation to kill the process @@ -34,6 +34,11 @@ func (c *Client) invoke(ctx context.Context, args []string) ([]byte, error) { } } + // Ensure cleanup of BytesSource temp files + if bs, ok := source.(*BytesSource); ok { + defer bs.cleanup() + } + output, err := cmd.CombinedOutput() if err != nil { if ctx.Err() != nil { @@ -54,8 +59,8 @@ func (c *Client) invoke(ctx context.Context, args []string) ([]byte, error) { } // invokeJSON executes the pdftract binary and parses the output as JSON. -func (c *Client) invokeJSON(ctx context.Context, args []string, result interface{}) error { - output, err := c.invoke(ctx, args) +func (c *Client) invokeJSON(ctx context.Context, args []string, result interface{}, source Source) error { + output, err := c.invoke(ctx, args, source) if err != nil { return err } @@ -72,8 +77,8 @@ func (c *Client) invokeJSON(ctx context.Context, args []string, result interface } // invokeString executes the pdftract binary and returns the output as a string. -func (c *Client) invokeString(ctx context.Context, args []string) (string, error) { - output, err := c.invoke(ctx, args) +func (c *Client) invokeString(ctx context.Context, args []string, source Source) (string, error) { + output, err := c.invoke(ctx, args, source) if err != nil { return "", err } @@ -81,7 +86,7 @@ func (c *Client) invokeString(ctx context.Context, args []string) (string, error } // invokeStream executes the pdftract binary and streams JSONL output to a channel. -func (c *Client) invokeStream(ctx context.Context, args []string) (<-chan json.RawMessage, <-chan error) { +func (c *Client) invokeStream(ctx context.Context, args []string, source Source) (<-chan json.RawMessage, <-chan error) { resultChan := make(chan json.RawMessage, 16) errChan := make(chan error, 1) @@ -89,6 +94,11 @@ func (c *Client) invokeStream(ctx context.Context, args []string) (<-chan json.R defer close(resultChan) defer close(errChan) + // Ensure cleanup of BytesSource temp files + if bs, ok := source.(*BytesSource); ok { + defer bs.cleanup() + } + cmd := exec.CommandContext(ctx, c.binaryPath, args...) // Set up cancellation to kill the process