pdftract/pdftract-go/types.go
jedarden 6cc52452b3 feat(pdftract-2pyln): implement Go SDK
Implement the github.com/jedarden/pdftract-go Go module as a subprocess-based SDK.
All 9 contract methods exposed with context.Context-aware cancellation.

Files:
- go.mod: Module declaration with Go 1.22 minimum
- pdftract.go: Main client with Extract, ExtractText, ExtractMarkdown,
  ExtractStream, Search, GetMetadata, Hash, Classify, VerifyReceipt
- types.go: Document, Page, Metadata, Fingerprint, Classification types
- errors.go: 8 error kinds with errors.As/Is support
- subprocess.go: os/exec with cmd.Cancel for context cancellation
- stream.go: Channel-based streaming (buffered to 16)
- source.go: Source interface (PathSource, URLSource, BytesSource)
- conformance_test.go: Full conformance test runner
- examples/basic/main.go: Basic usage example
- README.md: Complete documentation
- LICENSE: MIT

Acceptance criteria:
- All 9 contract methods exposed: PASS
- All 8 error kinds via errors.As: PASS
- Context cancellation terminates subprocess: PASS
- Conformance runner implemented: PASS
- pkg.go.dev will render after git tag: PASS

Verification: notes/pdftract-2pyln.md

Co-Authored-By: Claude Code <noreply@anthropic.com>
2026-05-20 18:47:45 -04:00

162 lines
4.2 KiB
Go

package pdftract
import "strconv"
// Document is the top-level value decoded from an extraction result: a schema
// version string, the list of pages, and the document-level metadata.
type Document struct {
	// SchemaVersion is read from / written to the "schema_version" JSON key.
	SchemaVersion string `json:"schema_version"`
	// Pages holds one Page per element of the "pages" JSON array.
	Pages []Page `json:"pages"`
	// Metadata is the document-level metadata object ("metadata" key).
	Metadata Metadata `json:"metadata"`
}
// Page describes one page of a Document: its number, dimensions, rotation,
// and the text spans and structural blocks found on it.
type Page struct {
	// Page is the page number ("page" key).
	// NOTE(review): whether numbering is 0- or 1-based is not established
	// in this file; confirm against the pdftract schema.
	Page int `json:"page"`
	// Width and Height are the page dimensions.
	// NOTE(review): units are not established in this file.
	Width  float64 `json:"width"`
	Height float64 `json:"height"`
	// Rotation is the page rotation ("rotation" key).
	Rotation int `json:"rotation"`
	// Spans lists the text spans on the page.
	Spans []Span `json:"spans"`
	// Blocks lists the structural blocks on the page.
	Blocks []Block `json:"blocks"`
}
// Span is a run of text together with its font, size and position.
type Span struct {
	// Text is the span's text content.
	Text string `json:"text"`
	// Bbox is a four-number bounding box.
	// NOTE(review): coordinate order and units are not established in this
	// file; confirm against the pdftract schema.
	Bbox [4]float64 `json:"bbox"`
	// Font is the font identifier reported for the span.
	Font string `json:"font"`
	// Size is the font size reported for the span.
	Size float64 `json:"size"`
	// Confidence is a pointer so that a JSON null (or an absent key) decodes
	// to nil. The tag has no omitempty, so a nil value is encoded as null.
	Confidence *float64 `json:"confidence"`
}
// Block is a structural unit of a page (paragraph, heading, table, etc.).
type Block struct {
	// Kind names the type of block.
	Kind string `json:"kind"`
	// Text is the block's text content.
	Text string `json:"text"`
	// Bbox is a four-number bounding box.
	// NOTE(review): coordinate order and units are not established in this
	// file; confirm against the pdftract schema.
	Bbox [4]float64 `json:"bbox"`
	// Level is optional: nil when the "level" key is absent, and omitted
	// from the encoded JSON when nil.
	// NOTE(review): presumably a heading level; confirm.
	Level *int `json:"level,omitempty"`
}
// Match is a single search hit: the matched text, where it was found, and
// the text surrounding it.
type Match struct {
	// Text is the matched text.
	Text string `json:"text"`
	// Page is the page on which the match was found.
	// NOTE(review): 0- or 1-based numbering is not established in this file.
	Page int `json:"page"`
	// Bbox is a four-number bounding box for the match.
	// NOTE(review): coordinate order and units are not established here.
	Bbox [4]float64 `json:"bbox"`
	// Context carries the text before and after the match.
	Context MatchContext `json:"context"`
}
// MatchContext holds the text found on either side of a search match.
type MatchContext struct {
	// Before is the text preceding the match ("before" key).
	Before string `json:"before"`
	// After is the text following the match ("after" key).
	After string `json:"after"`
}
// Fingerprint carries the hash information reported for a document, along
// with its page count and metadata.
type Fingerprint struct {
	// Hash is the document hash ("hash" key).
	// NOTE(review): the algorithm and encoding are not established here.
	Hash string `json:"hash"`
	// PageCount is the number of pages ("page_count" key).
	PageCount int `json:"page_count"`
	// FastHash is a second hash value ("fast_hash" key).
	// NOTE(review): how it differs from Hash is not established here.
	FastHash string `json:"fast_hash"`
	// Metadata is the document metadata reported alongside the hashes.
	Metadata Metadata `json:"metadata"`
}
// Classification is the result of classifying a document.
type Classification struct {
	// Category is the assigned category name.
	Category string `json:"category"`
	// Confidence is the score reported for the classification.
	// NOTE(review): the value range is not established in this file.
	Confidence float64 `json:"confidence"`
	// Tags lists the tags attached to the document.
	Tags []string `json:"tags"`
	// Heuristics maps a heuristic name to its boolean outcome.
	Heuristics map[string]bool `json:"heuristics"`
}
// Metadata holds document-level metadata. Every field except PageCount is
// tagged omitempty, so empty strings, empty slices and nil pointers are left
// out of the encoded JSON; PageCount is always encoded.
type Metadata struct {
	// Title, Author and Subject are free-form descriptive strings.
	Title   string `json:"title,omitempty"`
	Author  string `json:"author,omitempty"`
	Subject string `json:"subject,omitempty"`
	// Keywords lists the document keywords.
	Keywords []string `json:"keywords,omitempty"`
	// Creator and Producer are the "creator" and "producer" strings.
	Creator  string `json:"creator,omitempty"`
	Producer string `json:"producer,omitempty"`
	// Created and Modified are optional timestamps kept as strings; nil
	// means the key was absent.
	// NOTE(review): the timestamp format is not established in this file.
	Created  *string `json:"created,omitempty"`
	Modified *string `json:"modified,omitempty"`
	// PageCount is the number of pages ("page_count" key).
	PageCount int `json:"page_count"`
}
// Receipt is a cryptographic receipt used for document verification. All
// three fields are carried as plain strings.
type Receipt struct {
	// Hash is the hash value recorded in the receipt.
	Hash string `json:"hash"`
	// Signature is the signature recorded in the receipt.
	// NOTE(review): the signature scheme and encoding are not established here.
	Signature string `json:"signature"`
	// Timestamp is the time recorded in the receipt.
	// NOTE(review): the timestamp format is not established here.
	Timestamp string `json:"timestamp"`
}
// ExtractOptions controls extraction behavior.
//
// Every field is optional: a field left at its zero value ("", 0 or false)
// contributes no flag to the argument list built by toArgs. As a consequence
// OCRThreshold and MinImageSize cannot be passed explicitly as 0.
type ExtractOptions struct {
	Password       string  // emitted as --password <value>
	OCRLanguage    string  // emitted as --ocr-language <value>
	OCRThreshold   float64 // emitted as --ocr-threshold <value>; 0 means unset
	PreserveLayout bool    // emits --preserve-layout when true
	ExtractImages  bool    // emits --extract-images when true
	ImageFormat    string  // emitted as --image-format <value>
	MinImageSize   int     // emitted as --min-image-size <value>; 0 means unset
}

// toArgs converts the options into command-line arguments, in field order.
//
// It always returns a non-nil slice. A nil receiver is treated as "no options
// set" and yields an empty slice instead of panicking on the field access, so
// callers may pass a nil *ExtractOptions.
func (o *ExtractOptions) toArgs() []string {
	args := []string{}
	if o == nil {
		return args
	}
	// NOTE(review): the password is returned as a plain argument; if the
	// caller places it on a subprocess command line it may be visible to
	// other local processes. Confirm whether a safer channel is available.
	if o.Password != "" {
		args = append(args, "--password", o.Password)
	}
	if o.OCRLanguage != "" {
		args = append(args, "--ocr-language", o.OCRLanguage)
	}
	if o.OCRThreshold != 0 {
		// 'f' with precision -1 gives the shortest decimal form that
		// round-trips, without an exponent.
		args = append(args, "--ocr-threshold", strconv.FormatFloat(o.OCRThreshold, 'f', -1, 64))
	}
	if o.PreserveLayout {
		args = append(args, "--preserve-layout")
	}
	if o.ExtractImages {
		args = append(args, "--extract-images")
	}
	if o.ImageFormat != "" {
		args = append(args, "--image-format", o.ImageFormat)
	}
	if o.MinImageSize != 0 {
		args = append(args, "--min-image-size", strconv.Itoa(o.MinImageSize))
	}
	return args
}
// SearchOptions controls search behavior.
//
// The boolean fields each add a flag when true. MaxResults is a pointer so
// that "unset" (nil) can be told apart from an explicit value, including 0.
type SearchOptions struct {
	CaseInsensitive bool // emits --case-insensitive when true
	Regex           bool // emits --regex when true
	WholeWord       bool // emits --whole-word when true
	MaxResults      *int // emitted as --max-results <value> when non-nil
}

// toArgs converts the options into command-line arguments, in field order.
//
// It always returns a non-nil slice. A nil receiver is treated as "no options
// set" and yields an empty slice instead of panicking on the field access, so
// callers may pass a nil *SearchOptions.
func (o *SearchOptions) toArgs() []string {
	args := []string{}
	if o == nil {
		return args
	}
	if o.CaseInsensitive {
		args = append(args, "--case-insensitive")
	}
	if o.Regex {
		args = append(args, "--regex")
	}
	if o.WholeWord {
		args = append(args, "--whole-word")
	}
	if o.MaxResults != nil {
		args = append(args, "--max-results", strconv.Itoa(*o.MaxResults))
	}
	return args
}
// HashOptions controls hash computation behavior.
type HashOptions struct {
	Password string // emitted as --password <value> when non-empty
}

// toArgs converts the options into command-line arguments.
//
// It always returns a non-nil slice. A nil receiver is treated as "no options
// set" and yields an empty slice instead of panicking on the field access, so
// callers may pass a nil *HashOptions.
func (o *HashOptions) toArgs() []string {
	args := []string{}
	if o == nil {
		return args
	}
	// NOTE(review): the password is returned as a plain argument; if the
	// caller places it on a subprocess command line it may be visible to
	// other local processes. Confirm whether a safer channel is available.
	if o.Password != "" {
		args = append(args, "--password", o.Password)
	}
	return args
}