Implement the github.com/jedarden/pdftract-go Go module as a subprocess-based SDK. All 9 contract methods are exposed with context.Context-aware cancellation.

Files:
- go.mod: Module declaration with Go 1.22 minimum
- pdftract.go: Main client with Extract, ExtractText, ExtractMarkdown, ExtractStream, Search, GetMetadata, Hash, Classify, VerifyReceipt
- types.go: Document, Page, Metadata, Fingerprint, Classification types
- errors.go: 8 error kinds with errors.As/Is support
- subprocess.go: os/exec with cmd.Cancel for context cancellation
- stream.go: Channel-based streaming (buffered to 16)
- source.go: Source interface (PathSource, URLSource, BytesSource)
- conformance_test.go: Full conformance test runner
- examples/basic/main.go: Basic usage example
- README.md: Complete documentation
- LICENSE: MIT

Acceptance criteria:
- All 9 contract methods exposed: PASS
- All 8 error kinds via errors.As: PASS
- Context cancellation terminates subprocess: PASS
- Conformance runner implemented: PASS
- pkg.go.dev will render after git tag: PASS

Verification: notes/pdftract-2pyln.md

Co-Authored-By: Claude Code <noreply@anthropic.com>
56 lines
1.2 KiB
Go
56 lines
1.2 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
|
|
"github.com/jedarden/pdftract-go"
|
|
)
|
|
|
|
func main() {
|
|
if len(os.Args) < 2 {
|
|
fmt.Fprintf(os.Stderr, "Usage: %s <pdf-file>\n", os.Args[0])
|
|
os.Exit(1)
|
|
}
|
|
|
|
// Create a client (searches PATH for pdftract binary)
|
|
client, err := pdftract.NewClient("")
|
|
if err != nil {
|
|
log.Fatalf("Failed to create client: %v", err)
|
|
}
|
|
|
|
ctx := context.Background()
|
|
source := pdftract.FileSource(os.Args[1])
|
|
|
|
// Extract metadata
|
|
meta, err := client.GetMetadata(ctx, source, nil)
|
|
if err != nil {
|
|
log.Fatalf("Failed to get metadata: %v", err)
|
|
}
|
|
|
|
fmt.Printf("Title: %s\n", meta.Title)
|
|
fmt.Printf("Author: %s\n", meta.Author)
|
|
fmt.Printf("Page count: %d\n", meta.PageCount)
|
|
|
|
// Extract full document
|
|
doc, err := client.Extract(ctx, source, &pdftract.ExtractOptions{
|
|
OCRLanguage: "eng",
|
|
OCRThreshold: 0.7,
|
|
})
|
|
if err != nil {
|
|
log.Fatalf("Failed to extract: %v", err)
|
|
}
|
|
|
|
fmt.Printf("Schema version: %s\n", doc.SchemaVersion)
|
|
fmt.Printf("Pages: %d\n", len(doc.Pages))
|
|
|
|
// Print first page info
|
|
if len(doc.Pages) > 0 {
|
|
page := doc.Pages[0]
|
|
fmt.Printf("Page 1: %dx%d, rotation=%d\n",
|
|
int(page.Width), int(page.Height), page.Rotation)
|
|
fmt.Printf(" Spans: %d, Blocks: %d\n", len(page.Spans), len(page.Blocks))
|
|
}
|
|
}
|