pdftract/pdftract-go/subprocess.go
jedarden 6cc52452b3 feat(pdftract-2pyln): implement Go SDK
Implement the github.com/jedarden/pdftract-go Go module as a subprocess-based SDK.
All 9 contract methods exposed with context.Context-aware cancellation.

Files:
- go.mod: Module declaration with Go 1.22 minimum
- pdftract.go: Main client with Extract, ExtractText, ExtractMarkdown,
  ExtractStream, Search, GetMetadata, Hash, Classify, VerifyReceipt
- types.go: Document, Page, Metadata, Fingerprint, Classification types
- errors.go: 8 error kinds with errors.As/Is support
- subprocess.go: os/exec with cmd.Cancel for context cancellation
- stream.go: Channel-based streaming (buffered to 16)
- source.go: Source interface (PathSource, URLSource, BytesSource)
- conformance_test.go: Full conformance test runner
- examples/basic/main.go: Basic usage example
- README.md: Complete documentation
- LICENSE: MIT

Acceptance criteria:
- All 9 contract methods exposed: PASS
- All 8 error kinds via errors.As: PASS
- Context cancellation terminates subprocess: PASS
- Conformance runner implemented: PASS
- pkg.go.dev will render after git tag: PASS

Verification: notes/pdftract-2pyln.md

Co-Authored-By: Claude Code <noreply@anthropic.com>
2026-05-20 18:47:45 -04:00

163 lines
3.7 KiB
Go

package pdftract
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"os/exec"
"sync"
)
// subprocessResult pairs the bytes captured from one subprocess run with
// the error, if any, that the run produced.
type subprocessResult struct {
	// output is the raw data captured from the process.
	output []byte

	// err is the failure reported for the run; nil when it succeeded.
	err error
}
// invoke executes the pdftract binary with the given arguments and context.
//
// On success it returns exactly what the process wrote to stdout. Stderr is
// captured separately so that warnings or progress messages printed there
// cannot corrupt the payload handed to callers such as invokeJSON.
//
// Errors:
//   - If ctx is done by the time the process has failed, the returned error
//     wraps ctx.Err() with the prefix "pdftract cancelled".
//   - If the process exited with a non-zero status, the exit code and the
//     captured stderr text (or stdout, when stderr is empty) are passed to
//     mapExitCodeToError and its result is returned.
//   - Any other failure (for example the binary could not be started) is
//     wrapped with the prefix "pdftract execution failed".
func (c *Client) invoke(ctx context.Context, args []string) ([]byte, error) {
	cmd := exec.CommandContext(ctx, c.binaryPath, args...)

	// Set up cancellation to kill the process. The Once guarantees Kill is
	// attempted a single time; its error is deliberately dropped because the
	// process may already have exited by the time the context fires.
	if ctx.Done() != nil {
		var once sync.Once
		cmd.Cancel = func() error {
			once.Do(func() {
				if cmd.Process != nil {
					_ = cmd.Process.Kill()
				}
			})
			return nil
		}
	}

	// Keep the two streams apart: stdout is the result, stderr is diagnostics.
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		if ctx.Err() != nil {
			return nil, fmt.Errorf("pdftract cancelled: %w", ctx.Err())
		}

		// Map exit codes to specific error types.
		if exitErr, ok := err.(*exec.ExitError); ok {
			diagnostics := stderr.String()
			if diagnostics == "" {
				// Some failures are only reported on stdout; do not lose them.
				diagnostics = stdout.String()
			}
			return nil, mapExitCodeToError(exitErr.ExitCode(), diagnostics)
		}

		return nil, fmt.Errorf("pdftract execution failed: %w", err)
	}

	return stdout.Bytes(), nil
}
// invokeJSON runs the pdftract binary via invoke and decodes the bytes it
// returns into result using json.Unmarshal.
//
// An error from invoke is returned unchanged. If the output is not valid JSON
// for result, a *PdftractError of kind ErrKindUnknown with ExitCode -1 is
// returned, carrying the decoder's message.
func (c *Client) invokeJSON(ctx context.Context, args []string, result interface{}) error {
	payload, runErr := c.invoke(ctx, args)
	if runErr != nil {
		return runErr
	}

	decodeErr := json.Unmarshal(payload, result)
	if decodeErr == nil {
		return nil
	}

	return &PdftractError{
		Kind:     ErrKindUnknown,
		Message:  fmt.Sprintf("failed to parse JSON output: %v", decodeErr),
		ExitCode: -1,
	}
}
// invokeString runs the pdftract binary via invoke and hands back the bytes
// it returns converted to a string. When invoke fails, the empty string is
// returned together with invoke's error, unchanged.
func (c *Client) invokeString(ctx context.Context, args []string) (string, error) {
	raw, runErr := c.invoke(ctx, args)
	if runErr == nil {
		return string(raw), nil
	}
	return "", runErr
}
// invokeStream executes the pdftract binary and streams JSONL output to a channel.
//
// The process is started in a background goroutine. Each JSON value decoded
// from the process's stdout is sent, in order, on the first returned channel
// (buffered to 16). At most one error is sent on the second returned channel
// (buffered to 1). Both channels are closed when the goroutine finishes, so a
// consumer can range over the results and then receive once from the error
// channel; a nil/closed receive means the stream completed successfully.
//
// Errors:
//   - Failure to create the stdout pipe or to start the process is reported
//     with a "failed to ..." prefix.
//   - If ctx is done while decoding, while delivering a value, or by the time
//     the process has failed, the error wraps ctx.Err() with the prefix
//     "pdftract cancelled".
//   - Malformed output is reported as "failed to decode JSON".
//   - A non-zero exit status is passed, together with the captured stderr
//     text, to mapExitCodeToError.
//
// Once the process has been started, every exit path kills it if necessary
// and waits for it, so that the child is reaped and its pipes are released.
func (c *Client) invokeStream(ctx context.Context, args []string) (<-chan json.RawMessage, <-chan error) {
	resultChan := make(chan json.RawMessage, 16)
	errChan := make(chan error, 1)

	go func() {
		defer close(resultChan)
		defer close(errChan)

		cmd := exec.CommandContext(ctx, c.binaryPath, args...)

		// Set up cancellation to kill the process. The Once guarantees Kill is
		// attempted a single time; its error is deliberately dropped because
		// the process may already have exited by the time the context fires.
		if ctx.Done() != nil {
			var once sync.Once
			cmd.Cancel = func() error {
				once.Do(func() {
					if cmd.Process != nil {
						_ = cmd.Process.Kill()
					}
				})
				return nil
			}
		}

		stdout, err := cmd.StdoutPipe()
		if err != nil {
			errChan <- fmt.Errorf("failed to create stdout pipe: %w", err)
			return
		}

		stderr := &bytes.Buffer{}
		cmd.Stderr = stderr

		if err := cmd.Start(); err != nil {
			errChan <- fmt.Errorf("failed to start process: %w", err)
			return
		}

		// abort stops a started process and reports cause. Kill makes sure the
		// child is not left blocked writing to a pipe nobody reads; Wait reaps
		// it and releases the pipe and the stderr copier. Both errors are
		// ignored: the process may already be gone, and cause is what the
		// caller needs to see.
		abort := func(cause error) {
			_ = cmd.Process.Kill()
			_ = cmd.Wait()
			errChan <- cause
		}

		decoder := json.NewDecoder(stdout)
		for {
			// A fresh RawMessage per iteration so values already handed to
			// the consumer are never overwritten by a later Decode.
			var raw json.RawMessage
			if err := decoder.Decode(&raw); err != nil {
				if err == io.EOF {
					break
				}
				if ctx.Err() != nil {
					abort(fmt.Errorf("pdftract cancelled: %w", ctx.Err()))
					return
				}
				abort(fmt.Errorf("failed to decode JSON: %w", err))
				return
			}

			select {
			case resultChan <- raw:
			case <-ctx.Done():
				abort(fmt.Errorf("pdftract cancelled: %w", ctx.Err()))
				return
			}
		}

		// stdout reached EOF: collect the exit status.
		if err := cmd.Wait(); err != nil {
			if ctx.Err() != nil {
				errChan <- fmt.Errorf("pdftract cancelled: %w", ctx.Err())
				return
			}
			if exitErr, ok := err.(*exec.ExitError); ok {
				errChan <- mapExitCodeToError(exitErr.ExitCode(), stderr.String())
				return
			}
			errChan <- fmt.Errorf("pdftract execution failed: %w", err)
			return
		}
	}()

	return resultChan, errChan
}