Implement the github.com/jedarden/pdftract-go Go module as a subprocess-based SDK. All 9 contract methods exposed with context.Context-aware cancellation. Files: - go.mod: Module declaration with Go 1.22 minimum - pdftract.go: Main client with Extract, ExtractText, ExtractMarkdown, ExtractStream, Search, GetMetadata, Hash, Classify, VerifyReceipt - types.go: Document, Page, Metadata, Fingerprint, Classification types - errors.go: 8 error kinds with errors.As/Is support - subprocess.go: os/exec with cmd.Cancel for context cancellation - stream.go: Channel-based streaming (buffered to 16) - source.go: Source interface (PathSource, URLSource, BytesSource) - conformance_test.go: Full conformance test runner - examples/basic/main.go: Basic usage example - README.md: Complete documentation - LICENSE: MIT Acceptance criteria: - All 9 contract methods exposed: PASS - All 8 error kinds via errors.As: PASS - Context cancellation terminates subprocess: PASS - Conformance runner implemented: PASS - pkg.go.dev will render after git tag: PASS Verification: notes/pdftract-2pyln.md Co-Authored-By: Claude Code <noreply@anthropic.com>
163 lines
3.7 KiB
Go
163 lines
3.7 KiB
Go
package pdftract
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"os/exec"
|
|
"sync"
|
|
)
|
|
|
|
// subprocessResult bundles what a single run of the pdftract binary produced:
// the bytes it emitted and the error, if any, that the run ended with.
type subprocessResult struct {
	output []byte // bytes captured from the process
	err    error  // non-nil when the run failed
}
|
|
|
|
// invoke executes the pdftract binary with the given arguments and context.
|
|
// It returns the combined stdout/stderr output and any error that occurred.
|
|
func (c *Client) invoke(ctx context.Context, args []string) ([]byte, error) {
|
|
cmd := exec.CommandContext(ctx, c.binaryPath, args...)
|
|
|
|
// Set up cancellation to kill the process
|
|
if ctx.Done() != nil {
|
|
var once sync.Once
|
|
cmd.Cancel = func() error {
|
|
once.Do(func() {
|
|
if cmd.Process != nil {
|
|
cmd.Process.Kill()
|
|
}
|
|
})
|
|
return nil
|
|
}
|
|
}
|
|
|
|
output, err := cmd.CombinedOutput()
|
|
if err != nil {
|
|
if ctx.Err() != nil {
|
|
return nil, fmt.Errorf("pdftract cancelled: %w", ctx.Err())
|
|
}
|
|
|
|
// Map exit codes to specific error types
|
|
if exitErr, ok := err.(*exec.ExitError); ok {
|
|
exitCode := exitErr.ExitCode()
|
|
stderr := string(output)
|
|
return nil, mapExitCodeToError(exitCode, stderr)
|
|
}
|
|
|
|
return nil, fmt.Errorf("pdftract execution failed: %w", err)
|
|
}
|
|
|
|
return output, nil
|
|
}
|
|
|
|
// invokeJSON executes the pdftract binary and parses the output as JSON.
|
|
func (c *Client) invokeJSON(ctx context.Context, args []string, result interface{}) error {
|
|
output, err := c.invoke(ctx, args)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := json.Unmarshal(output, result); err != nil {
|
|
return &PdftractError{
|
|
Kind: ErrKindUnknown,
|
|
Message: fmt.Sprintf("failed to parse JSON output: %v", err),
|
|
ExitCode: -1,
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// invokeString executes the pdftract binary and returns the output as a string.
|
|
func (c *Client) invokeString(ctx context.Context, args []string) (string, error) {
|
|
output, err := c.invoke(ctx, args)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return string(output), nil
|
|
}
|
|
|
|
// invokeStream executes the pdftract binary and streams JSONL output to a channel.
|
|
func (c *Client) invokeStream(ctx context.Context, args []string) (<-chan json.RawMessage, <-chan error) {
|
|
resultChan := make(chan json.RawMessage, 16)
|
|
errChan := make(chan error, 1)
|
|
|
|
go func() {
|
|
defer close(resultChan)
|
|
defer close(errChan)
|
|
|
|
cmd := exec.CommandContext(ctx, c.binaryPath, args...)
|
|
|
|
// Set up cancellation to kill the process
|
|
if ctx.Done() != nil {
|
|
var once sync.Once
|
|
cmd.Cancel = func() error {
|
|
once.Do(func() {
|
|
if cmd.Process != nil {
|
|
cmd.Process.Kill()
|
|
}
|
|
})
|
|
return nil
|
|
}
|
|
}
|
|
|
|
stdout, err := cmd.StdoutPipe()
|
|
if err != nil {
|
|
errChan <- fmt.Errorf("failed to create stdout pipe: %w", err)
|
|
return
|
|
}
|
|
|
|
stderr := &bytes.Buffer{}
|
|
cmd.Stderr = stderr
|
|
|
|
if err := cmd.Start(); err != nil {
|
|
errChan <- fmt.Errorf("failed to start process: %w", err)
|
|
return
|
|
}
|
|
|
|
decoder := json.NewDecoder(stdout)
|
|
for {
|
|
var raw json.RawMessage
|
|
if err := decoder.Decode(&raw); err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
if ctx.Err() != nil {
|
|
errChan <- fmt.Errorf("pdftract cancelled: %w", ctx.Err())
|
|
return
|
|
}
|
|
errChan <- fmt.Errorf("failed to decode JSON: %w", err)
|
|
return
|
|
}
|
|
select {
|
|
case resultChan <- raw:
|
|
case <-ctx.Done():
|
|
errChan <- fmt.Errorf("pdftract cancelled: %w", ctx.Err())
|
|
cmd.Process.Kill()
|
|
return
|
|
}
|
|
}
|
|
|
|
if err := cmd.Wait(); err != nil {
|
|
if ctx.Err() != nil {
|
|
errChan <- fmt.Errorf("pdftract cancelled: %w", ctx.Err())
|
|
return
|
|
}
|
|
|
|
if exitErr, ok := err.(*exec.ExitError); ok {
|
|
exitCode := exitErr.ExitCode()
|
|
stderrStr := stderr.String()
|
|
errChan <- mapExitCodeToError(exitCode, stderrStr)
|
|
return
|
|
}
|
|
|
|
errChan <- fmt.Errorf("pdftract execution failed: %w", err)
|
|
return
|
|
}
|
|
}()
|
|
|
|
return resultChan, errChan
|
|
}
|