pdftract/pdftract-go/stream.go
jedarden 6cc52452b3 feat(pdftract-2pyln): implement Go SDK
Implement the github.com/jedarden/pdftract-go Go module as a subprocess-based SDK.
All 9 contract methods exposed with context.Context-aware cancellation.

Files:
- go.mod: Module declaration with Go 1.22 minimum
- pdftract.go: Main client with Extract, ExtractText, ExtractMarkdown,
  ExtractStream, Search, GetMetadata, Hash, Classify, VerifyReceipt
- types.go: Document, Page, Metadata, Fingerprint, Classification types
- errors.go: 8 error kinds with errors.As/Is support
- subprocess.go: os/exec with cmd.Cancel for context cancellation
- stream.go: Channel-based streaming (buffered to 16)
- source.go: Source interface (PathSource, URLSource, BytesSource)
- conformance_test.go: Full conformance test runner
- examples/basic/main.go: Basic usage example
- README.md: Complete documentation
- LICENSE: MIT

Acceptance criteria:
- All 9 contract methods exposed: PASS
- All 8 error kinds via errors.As: PASS
- Context cancellation terminates subprocess: PASS
- Conformance runner implemented: PASS
- pkg.go.dev will render after git tag: PASS

Verification: notes/pdftract-2pyln.md

Co-Authored-By: Claude Code <noreply@anthropic.com>
2026-05-20 18:47:45 -04:00

102 lines
2.1 KiB
Go

package pdftract
import (
"context"
"encoding/json"
)
// PageResult is one item of a streaming extraction: it carries either a
// decoded Page or an error. extractStream sets exactly one of the two fields
// on every value it sends.
type PageResult struct {
// Page is the page decoded from one raw record; it is nil when Err is set.
Page *Page
// Err is a JSON decoding error for a single record, an error received from
// the underlying stream invocation, or the context's error after
// cancellation; it is nil when Page is set.
Err error
}
// MatchResult is one item of a streaming search: it carries either a decoded
// Match or an error. search sets exactly one of the two fields on every value
// it sends.
type MatchResult struct {
// Match is the match decoded from one raw record; it is nil when Err is set.
Match *Match
// Err is a JSON decoding error for a single record, an error received from
// the underlying stream invocation, or the context's error after
// cancellation; it is nil when Match is set.
Err error
}
// extractStream streams page results from the pdftract binary.
//
// It builds the command line "extract --ndjson", followed by the arguments
// from source.source() and, when opts is non-nil, opts.toArgs(), and starts it
// through c.invokeStream. Every raw record received is decoded as JSON into a
// Page and forwarded on the returned channel, which is buffered to 16 and is
// closed when the stream ends.
//
// Results delivered on the channel:
//   - a record that fails to decode yields a PageResult with Err set, and the
//     stream continues with the next record;
//   - an error received from invokeStream's error channel is forwarded after
//     any records that were already queued, and ends the stream;
//   - when ctx is cancelled the stream ends and ctx.Err() is forwarded only if
//     the channel has room, so callers that need to tell a cancelled stream
//     from a complete one should check ctx.Err() once the channel is closed.
//
// No send blocks past cancellation of ctx, so a consumer that stops reading
// only has to cancel ctx to release the forwarding goroutine.
//
// The returned error is always nil; failures are reported on the channel.
func (c *Client) extractStream(ctx context.Context, source Source, opts *ExtractOptions) (<-chan PageResult, error) {
	args := []string{"extract", "--ndjson"}
	args = append(args, source.source()...)
	if opts != nil {
		args = append(args, opts.toArgs()...)
	}

	rawChan, errChan := c.invokeStream(ctx, args)
	resultChan := make(chan PageResult, 16)

	go func() {
		defer close(resultChan)

		// sendCancel reports the cancellation without blocking: the consumer
		// may already be gone, and a blocking send would leak this goroutine.
		sendCancel := func() {
			select {
			case resultChan <- PageResult{Err: ctx.Err()}:
			default:
			}
		}

		// send delivers r unless ctx is cancelled first. It reports whether
		// forwarding should continue.
		send := func(r PageResult) bool {
			select {
			case resultChan <- r:
				return true
			case <-ctx.Done():
				sendCancel()
				return false
			}
		}

		// decode turns one raw record into a result; a malformed record
		// becomes an error result rather than ending the stream.
		decode := func(raw []byte) PageResult {
			var page Page
			if err := json.Unmarshal(raw, &page); err != nil {
				return PageResult{Err: err}
			}
			return PageResult{Page: &page}
		}

		for {
			select {
			case raw, ok := <-rawChan:
				if !ok {
					// The record stream is finished. Forward an error that is
					// already waiting instead of silently discarding it.
					// NOTE(review): invokeStream is not visible here; if it
					// reports the exit error only after closing this channel,
					// a blocking wait would be needed — confirm.
					select {
					case err := <-errChan:
						if err != nil {
							send(PageResult{Err: err})
						}
					default:
					}
					return
				}
				if !send(decode(raw)) {
					return
				}
			case err := <-errChan:
				// select picks at random among ready cases, so records may
				// still be queued; flush them before ending the stream.
			drain:
				for {
					select {
					case raw, ok := <-rawChan:
						if !ok {
							break drain
						}
						if !send(decode(raw)) {
							return
						}
					default:
						break drain
					}
				}
				if err != nil {
					send(PageResult{Err: err})
				}
				return
			case <-ctx.Done():
				sendCancel()
				return
			}
		}
	}()

	return resultChan, nil
}
// search streams match results from the pdftract binary.
//
// It builds the command line "grep <pattern>", followed by the arguments from
// source.source() and, when opts is non-nil, opts.toArgs(), and starts it
// through c.invokeStream. Every raw record received is decoded as JSON into a
// Match and forwarded on the returned channel, which is buffered to 16 and is
// closed when the stream ends.
//
// Results delivered on the channel:
//   - a record that fails to decode yields a MatchResult with Err set, and the
//     stream continues with the next record;
//   - an error received from invokeStream's error channel is forwarded after
//     any records that were already queued, and ends the stream;
//   - when ctx is cancelled the stream ends and ctx.Err() is forwarded only if
//     the channel has room, so callers that need to tell a cancelled stream
//     from a complete one should check ctx.Err() once the channel is closed.
//
// No send blocks past cancellation of ctx, so a consumer that stops reading
// only has to cancel ctx to release the forwarding goroutine.
//
// The returned error is always nil; failures are reported on the channel.
//
// NOTE(review): pattern is passed as a plain positional argument; a pattern
// beginning with "-" may be parsed as a flag by the binary — confirm how the
// CLI handles this.
func (c *Client) search(ctx context.Context, source Source, pattern string, opts *SearchOptions) (<-chan MatchResult, error) {
	args := []string{"grep", pattern}
	args = append(args, source.source()...)
	if opts != nil {
		args = append(args, opts.toArgs()...)
	}

	rawChan, errChan := c.invokeStream(ctx, args)
	resultChan := make(chan MatchResult, 16)

	go func() {
		defer close(resultChan)

		// sendCancel reports the cancellation without blocking: the consumer
		// may already be gone, and a blocking send would leak this goroutine.
		sendCancel := func() {
			select {
			case resultChan <- MatchResult{Err: ctx.Err()}:
			default:
			}
		}

		// send delivers r unless ctx is cancelled first. It reports whether
		// forwarding should continue.
		send := func(r MatchResult) bool {
			select {
			case resultChan <- r:
				return true
			case <-ctx.Done():
				sendCancel()
				return false
			}
		}

		// decode turns one raw record into a result; a malformed record
		// becomes an error result rather than ending the stream.
		decode := func(raw []byte) MatchResult {
			var match Match
			if err := json.Unmarshal(raw, &match); err != nil {
				return MatchResult{Err: err}
			}
			return MatchResult{Match: &match}
		}

		for {
			select {
			case raw, ok := <-rawChan:
				if !ok {
					// The record stream is finished. Forward an error that is
					// already waiting instead of silently discarding it.
					// NOTE(review): invokeStream is not visible here; if it
					// reports the exit error only after closing this channel,
					// a blocking wait would be needed — confirm.
					select {
					case err := <-errChan:
						if err != nil {
							send(MatchResult{Err: err})
						}
					default:
					}
					return
				}
				if !send(decode(raw)) {
					return
				}
			case err := <-errChan:
				// select picks at random among ready cases, so records may
				// still be queued; flush them before ending the stream.
			drain:
				for {
					select {
					case raw, ok := <-rawChan:
						if !ok {
							break drain
						}
						if !send(decode(raw)) {
							return
						}
					default:
						break drain
					}
				}
				if err != nil {
					send(MatchResult{Err: err})
				}
				return
			case <-ctx.Done():
				sendCancel()
				return
			}
		}
	}()

	return resultChan, nil
}