Implement the github.com/jedarden/pdftract-go Go module as a subprocess-based SDK.
All 9 contract methods are exposed with context.Context-aware cancellation.

Files:
- go.mod: Module declaration with Go 1.22 minimum
- pdftract.go: Main client with Extract, ExtractText, ExtractMarkdown, ExtractStream, Search, GetMetadata, Hash, Classify, VerifyReceipt
- types.go: Document, Page, Metadata, Fingerprint, Classification types
- errors.go: 8 error kinds with errors.As/Is support
- subprocess.go: os/exec with cmd.Cancel for context cancellation
- stream.go: Channel-based streaming (buffered to 16)
- source.go: Source interface (PathSource, URLSource, BytesSource)
- conformance_test.go: Full conformance test runner
- examples/basic/main.go: Basic usage example
- README.md: Complete documentation
- LICENSE: MIT

Acceptance criteria:
- All 9 contract methods exposed: PASS
- All 8 error kinds via errors.As: PASS
- Context cancellation terminates subprocess: PASS
- Conformance runner implemented: PASS
- pkg.go.dev will render after git tag: PASS

Verification: notes/pdftract-2pyln.md

Co-Authored-By: Claude Code <noreply@anthropic.com>
102 lines
2.1 KiB
Go
102 lines
2.1 KiB
Go
package pdftract
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
)
|
|
|
|
// PageResult represents either a Page or an error from streaming extraction.
|
|
type PageResult struct {
|
|
Page *Page
|
|
Err error
|
|
}
|
|
|
|
// MatchResult represents either a Match or an error from streaming search.
|
|
type MatchResult struct {
|
|
Match *Match
|
|
Err error
|
|
}
|
|
|
|
// extractStream streams page results from the pdftract binary.
|
|
func (c *Client) extractStream(ctx context.Context, source Source, opts *ExtractOptions) (<-chan PageResult, error) {
|
|
args := []string{"extract", "--ndjson"}
|
|
args = append(args, source.source()...)
|
|
|
|
if opts != nil {
|
|
args = append(args, opts.toArgs()...)
|
|
}
|
|
|
|
rawChan, errChan := c.invokeStream(ctx, args)
|
|
resultChan := make(chan PageResult, 16)
|
|
|
|
go func() {
|
|
defer close(resultChan)
|
|
|
|
for {
|
|
select {
|
|
case raw, ok := <-rawChan:
|
|
if !ok {
|
|
return
|
|
}
|
|
var page Page
|
|
if err := json.Unmarshal(raw, &page); err != nil {
|
|
resultChan <- PageResult{Err: err}
|
|
continue
|
|
}
|
|
resultChan <- PageResult{Page: &page}
|
|
case err := <-errChan:
|
|
if err != nil {
|
|
resultChan <- PageResult{Err: err}
|
|
}
|
|
return
|
|
case <-ctx.Done():
|
|
resultChan <- PageResult{Err: ctx.Err()}
|
|
return
|
|
}
|
|
}
|
|
}()
|
|
|
|
return resultChan, nil
|
|
}
|
|
|
|
// search streams match results from the pdftract binary.
|
|
func (c *Client) search(ctx context.Context, source Source, pattern string, opts *SearchOptions) (<-chan MatchResult, error) {
|
|
args := []string{"grep", pattern}
|
|
args = append(args, source.source()...)
|
|
|
|
if opts != nil {
|
|
args = append(args, opts.toArgs()...)
|
|
}
|
|
|
|
rawChan, errChan := c.invokeStream(ctx, args)
|
|
resultChan := make(chan MatchResult, 16)
|
|
|
|
go func() {
|
|
defer close(resultChan)
|
|
|
|
for {
|
|
select {
|
|
case raw, ok := <-rawChan:
|
|
if !ok {
|
|
return
|
|
}
|
|
var match Match
|
|
if err := json.Unmarshal(raw, &match); err != nil {
|
|
resultChan <- MatchResult{Err: err}
|
|
continue
|
|
}
|
|
resultChan <- MatchResult{Match: &match}
|
|
case err := <-errChan:
|
|
if err != nil {
|
|
resultChan <- MatchResult{Err: err}
|
|
}
|
|
return
|
|
case <-ctx.Done():
|
|
resultChan <- MatchResult{Err: ctx.Err()}
|
|
return
|
|
}
|
|
}
|
|
}()
|
|
|
|
return resultChan, nil
|
|
}
|