fix(pdftract-2pyln): add source parameter to invoke methods for BytesSource cleanup
- Add a `source Source` parameter to invoke, invokeJSON, invokeString, and invokeStream
- Change BytesSource from a `[]byte` type to a struct with `data` and `tmpPath` fields
- Clean up the temporary file after the subprocess has finished
- Pass the source through from Extract, ExtractText, ExtractMarkdown, GetMetadata, Hash, and Classify

This ensures that BytesSource temporary files are removed after use, so they no longer accumulate on disk. BytesSource now creates its temp file on demand, and the invoke methods remove it automatically via a deferred cleanup call.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
e0dea12849
commit
5781d67d5c
3 changed files with 59 additions and 20 deletions
|
|
@ -54,7 +54,7 @@ func (c *Client) Extract(ctx context.Context, source Source, opts *ExtractOption
|
|||
}
|
||||
|
||||
var doc Document
|
||||
if err := c.invokeJSON(ctx, args, &doc); err != nil {
|
||||
if err := c.invokeJSON(ctx, args, &doc, source); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
|
@ -70,7 +70,7 @@ func (c *Client) ExtractText(ctx context.Context, source Source, opts *ExtractOp
|
|||
args = append(args, opts.toArgs()...)
|
||||
}
|
||||
|
||||
return c.invokeString(ctx, args)
|
||||
return c.invokeString(ctx, args, source)
|
||||
}
|
||||
|
||||
// ExtractMarkdown extracts markdown-formatted text from a PDF.
|
||||
|
|
@ -82,7 +82,7 @@ func (c *Client) ExtractMarkdown(ctx context.Context, source Source, opts *Extra
|
|||
args = append(args, opts.toArgs()...)
|
||||
}
|
||||
|
||||
return c.invokeString(ctx, args)
|
||||
return c.invokeString(ctx, args, source)
|
||||
}
|
||||
|
||||
// ExtractStream extracts pages from a PDF as a stream.
|
||||
|
|
@ -107,7 +107,7 @@ func (c *Client) GetMetadata(ctx context.Context, source Source, opts *ExtractOp
|
|||
var result struct {
|
||||
Metadata Metadata `json:"metadata"`
|
||||
}
|
||||
if err := c.invokeJSON(ctx, args, &result); err != nil {
|
||||
if err := c.invokeJSON(ctx, args, &result, source); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
|
@ -124,7 +124,7 @@ func (c *Client) Hash(ctx context.Context, source Source, opts *HashOptions) (*F
|
|||
}
|
||||
|
||||
var fp Fingerprint
|
||||
if err := c.invokeJSON(ctx, args, &fp); err != nil {
|
||||
if err := c.invokeJSON(ctx, args, &fp, source); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
|
@ -137,7 +137,7 @@ func (c *Client) Classify(ctx context.Context, source Source) (*Classification,
|
|||
args = append(args, source.source()...)
|
||||
|
||||
var cls Classification
|
||||
if err := c.invokeJSON(ctx, args, &cls); err != nil {
|
||||
if err := c.invokeJSON(ctx, args, &cls, source); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -34,10 +34,43 @@ func (u URLSource) source() []string {
|
|||
}
|
||||
|
||||
// BytesSource represents in-memory PDF bytes.
|
||||
type BytesSource []byte
|
||||
// The temporary file created for subprocess consumption is cleaned up after use.
|
||||
type BytesSource struct {
|
||||
// data holds the PDF bytes supplied by the constructor (MemorySource or
// ReadFileSource); source() writes them to the temporary file.
data []byte
|
||||
// tmpPath is the path of the temporary file written by source(). It is
// empty until source() has run, and cleanup() resets it to empty after
// removing the file.
tmpPath string
|
||||
}
|
||||
|
||||
func (b BytesSource) source() []string {
|
||||
return []string{"--bytes-data", string(b)}
|
||||
// MemorySource is a convenience constructor that creates a BytesSource from a byte slice.
|
||||
func MemorySource(data []byte) Source {
|
||||
return &BytesSource{data: data}
|
||||
}
|
||||
|
||||
func (b *BytesSource) source() []string {
|
||||
if b.tmpPath != "" {
|
||||
return []string{b.tmpPath}
|
||||
}
|
||||
|
||||
// Write to a temporary file for subprocess consumption
|
||||
tmpFile, err := os.CreateTemp("", "pdftract-*.pdf")
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("failed to create temp file for BytesSource: %v", err))
|
||||
}
|
||||
defer tmpFile.Close()
|
||||
|
||||
if _, err := tmpFile.Write(b.data); err != nil {
|
||||
panic(fmt.Sprintf("failed to write data to temp file: %v", err))
|
||||
}
|
||||
|
||||
b.tmpPath = tmpFile.Name()
|
||||
return []string{b.tmpPath}
|
||||
}
|
||||
|
||||
// cleanup removes the temporary file if it was created.
|
||||
func (b *BytesSource) cleanup() {
|
||||
if b.tmpPath != "" && !strings.HasPrefix(b.tmpPath, "--error:") {
|
||||
os.Remove(b.tmpPath)
|
||||
b.tmpPath = ""
|
||||
}
|
||||
}
|
||||
|
||||
// FileSource is a convenience constructor that creates a PathSource from a string.
|
||||
|
|
@ -53,10 +86,6 @@ func RemoteSource(url string) Source {
|
|||
return URLSource(url)
|
||||
}
|
||||
|
||||
// MemorySource is a convenience constructor that creates a BytesSource from a byte slice.
|
||||
func MemorySource(data []byte) Source {
|
||||
return BytesSource(data)
|
||||
}
|
||||
|
||||
// ReadFileSource reads a file and returns a BytesSource.
|
||||
func ReadFileSource(path string) (Source, error) {
|
||||
|
|
@ -64,5 +93,5 @@ func ReadFileSource(path string) (Source, error) {
|
|||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read file: %w", err)
|
||||
}
|
||||
return BytesSource(data), nil
|
||||
return &BytesSource{data: data}, nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ type subprocessResult struct {
|
|||
|
||||
// invoke executes the pdftract binary with the given arguments and context.
|
||||
// It returns the combined stdout/stderr output and any error that occurred.
|
||||
func (c *Client) invoke(ctx context.Context, args []string) ([]byte, error) {
|
||||
func (c *Client) invoke(ctx context.Context, args []string, source Source) ([]byte, error) {
|
||||
cmd := exec.CommandContext(ctx, c.binaryPath, args...)
|
||||
|
||||
// Set up cancellation to kill the process
|
||||
|
|
@ -34,6 +34,11 @@ func (c *Client) invoke(ctx context.Context, args []string) ([]byte, error) {
|
|||
}
|
||||
}
|
||||
|
||||
// Ensure cleanup of BytesSource temp files
|
||||
if bs, ok := source.(*BytesSource); ok {
|
||||
defer bs.cleanup()
|
||||
}
|
||||
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
|
|
@ -54,8 +59,8 @@ func (c *Client) invoke(ctx context.Context, args []string) ([]byte, error) {
|
|||
}
|
||||
|
||||
// invokeJSON executes the pdftract binary and parses the output as JSON.
|
||||
func (c *Client) invokeJSON(ctx context.Context, args []string, result interface{}) error {
|
||||
output, err := c.invoke(ctx, args)
|
||||
func (c *Client) invokeJSON(ctx context.Context, args []string, result interface{}, source Source) error {
|
||||
output, err := c.invoke(ctx, args, source)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
@ -72,8 +77,8 @@ func (c *Client) invokeJSON(ctx context.Context, args []string, result interface
|
|||
}
|
||||
|
||||
// invokeString executes the pdftract binary and returns the output as a string.
|
||||
func (c *Client) invokeString(ctx context.Context, args []string) (string, error) {
|
||||
output, err := c.invoke(ctx, args)
|
||||
func (c *Client) invokeString(ctx context.Context, args []string, source Source) (string, error) {
|
||||
output, err := c.invoke(ctx, args, source)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
|
@ -81,7 +86,7 @@ func (c *Client) invokeString(ctx context.Context, args []string) (string, error
|
|||
}
|
||||
|
||||
// invokeStream executes the pdftract binary and streams JSONL output to a channel.
|
||||
func (c *Client) invokeStream(ctx context.Context, args []string) (<-chan json.RawMessage, <-chan error) {
|
||||
func (c *Client) invokeStream(ctx context.Context, args []string, source Source) (<-chan json.RawMessage, <-chan error) {
|
||||
resultChan := make(chan json.RawMessage, 16)
|
||||
errChan := make(chan error, 1)
|
||||
|
||||
|
|
@ -89,6 +94,11 @@ func (c *Client) invokeStream(ctx context.Context, args []string) (<-chan json.R
|
|||
defer close(resultChan)
|
||||
defer close(errChan)
|
||||
|
||||
// Ensure cleanup of BytesSource temp files
|
||||
if bs, ok := source.(*BytesSource); ok {
|
||||
defer bs.cleanup()
|
||||
}
|
||||
|
||||
cmd := exec.CommandContext(ctx, c.binaryPath, args...)
|
||||
|
||||
// Set up cancellation to kill the process
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue