Implement the github.com/jedarden/pdftract-go Go module as a subprocess-based SDK.
All 9 contract methods are exposed with context.Context-aware cancellation.

Files:
- go.mod: Module declaration with Go 1.22 minimum
- pdftract.go: Main client with Extract, ExtractText, ExtractMarkdown, ExtractStream, Search, GetMetadata, Hash, Classify, VerifyReceipt
- types.go: Document, Page, Metadata, Fingerprint, Classification types
- errors.go: 8 error kinds with errors.As/Is support
- subprocess.go: os/exec with cmd.Cancel for context cancellation
- stream.go: Channel-based streaming (buffered to 16)
- source.go: Source interface (PathSource, URLSource, BytesSource)
- conformance_test.go: Full conformance test runner
- examples/basic/main.go: Basic usage example
- README.md: Complete documentation
- LICENSE: MIT

Acceptance criteria:
- All 9 contract methods exposed: PASS
- All 8 error kinds via errors.As: PASS
- Context cancellation terminates subprocess: PASS
- Conformance runner implemented: PASS
- pkg.go.dev will render after git tag: PASS

Verification: notes/pdftract-2pyln.md

Co-Authored-By: Claude Code <noreply@anthropic.com>
162 lines
4.2 KiB
Go
162 lines
4.2 KiB
Go
package pdftract
|
|
|
|
import "strconv"
|
|
|
|
// Document represents a PDF document with pages and metadata.
|
|
type Document struct {
|
|
SchemaVersion string `json:"schema_version"`
|
|
Pages []Page `json:"pages"`
|
|
Metadata Metadata `json:"metadata"`
|
|
}
|
|
|
|
// Page represents a single page in the document.
|
|
type Page struct {
|
|
Page int `json:"page"`
|
|
Width float64 `json:"width"`
|
|
Height float64 `json:"height"`
|
|
Rotation int `json:"rotation"`
|
|
Spans []Span `json:"spans"`
|
|
Blocks []Block `json:"blocks"`
|
|
}
|
|
|
|
// Span represents a text span with font and position information.
type Span struct {
	// Text is the span's text content ("text" JSON key).
	Text string `json:"text"`
	// Bbox is the span's bounding box as four numbers ("bbox" JSON key).
	// NOTE(review): coordinate order and units are not visible here — confirm.
	Bbox [4]float64 `json:"bbox"`
	// Font is the font identifier ("font" JSON key).
	Font string `json:"font"`
	// Size is the font size ("size" JSON key).
	Size float64 `json:"size"`
	// Confidence is optional: a JSON null (or absent key) decodes to nil.
	// The tag has no omitempty, so a nil value is encoded as an explicit null.
	Confidence *float64 `json:"confidence"`
}
|
|
|
|
// Block represents a structural block (paragraph, heading, table, etc.).
type Block struct {
	// Kind names the block type ("kind" JSON key).
	Kind string `json:"kind"`
	// Text is the block's text content ("text" JSON key).
	Text string `json:"text"`
	// Bbox is the block's bounding box as four numbers ("bbox" JSON key).
	// NOTE(review): coordinate order and units are not visible here — confirm.
	Bbox [4]float64 `json:"bbox"`
	// Level is optional ("level" JSON key); a nil pointer is omitted when encoding.
	// NOTE(review): presumably the heading depth — confirm which kinds set it.
	Level *int `json:"level,omitempty"`
}
|
|
|
|
// Match represents a search match result.
|
|
type Match struct {
|
|
Text string `json:"text"`
|
|
Page int `json:"page"`
|
|
Bbox [4]float64 `json:"bbox"`
|
|
Context MatchContext `json:"context"`
|
|
}
|
|
|
|
// MatchContext provides surrounding text for a match.
type MatchContext struct {
	// Before is the text preceding the match ("before" JSON key).
	Before string `json:"before"`
	// After is the text following the match ("after" JSON key).
	After string `json:"after"`
}
|
|
|
|
// Fingerprint represents document hash information.
|
|
type Fingerprint struct {
|
|
Hash string `json:"hash"`
|
|
PageCount int `json:"page_count"`
|
|
FastHash string `json:"fast_hash"`
|
|
Metadata Metadata `json:"metadata"`
|
|
}
|
|
|
|
// Classification represents document classification results.
type Classification struct {
	// Category is the assigned category ("category" JSON key).
	Category string `json:"category"`
	// Confidence is the score attached to Category ("confidence" JSON key).
	// NOTE(review): value range is not visible here — confirm.
	Confidence float64 `json:"confidence"`
	// Tags lists the tags attached to the document ("tags" JSON key).
	Tags []string `json:"tags"`
	// Heuristics maps a heuristic name to a boolean result ("heuristics" JSON key).
	Heuristics map[string]bool `json:"heuristics"`
}
|
|
|
|
// Metadata represents document metadata.
//
// Every field except PageCount is tagged omitempty, so empty strings, empty
// slices and nil pointers are left out of the encoded JSON.
type Metadata struct {
	// Title is the document title ("title" JSON key).
	Title string `json:"title,omitempty"`
	// Author is the document author ("author" JSON key).
	Author string `json:"author,omitempty"`
	// Subject is the document subject ("subject" JSON key).
	Subject string `json:"subject,omitempty"`
	// Keywords lists the document keywords ("keywords" JSON key).
	Keywords []string `json:"keywords,omitempty"`
	// Creator is the "creator" metadata entry.
	Creator string `json:"creator,omitempty"`
	// Producer is the "producer" metadata entry.
	Producer string `json:"producer,omitempty"`
	// Created is the optional creation timestamp, kept as a string ("created" JSON key).
	// NOTE(review): timestamp format is not visible here — confirm.
	Created *string `json:"created,omitempty"`
	// Modified is the optional modification timestamp, kept as a string ("modified" JSON key).
	// NOTE(review): timestamp format is not visible here — confirm.
	Modified *string `json:"modified,omitempty"`
	// PageCount is the number of pages ("page_count" JSON key); always encoded.
	PageCount int `json:"page_count"`
}
|
|
|
|
// Receipt represents a cryptographic receipt for document verification.
type Receipt struct {
	// Hash is the hash recorded in the receipt ("hash" JSON key).
	Hash string `json:"hash"`
	// Signature is the receipt's signature ("signature" JSON key).
	// NOTE(review): signature scheme and encoding are not visible here — confirm.
	Signature string `json:"signature"`
	// Timestamp is the receipt's timestamp, kept as a string ("timestamp" JSON key).
	// NOTE(review): timestamp format is not visible here — confirm.
	Timestamp string `json:"timestamp"`
}
|
|
|
|
// ExtractOptions controls extraction behavior.
//
// A field left at its zero value is treated as "not set" and contributes no
// command-line flag; as a consequence an explicit 0 for OCRThreshold or
// MinImageSize cannot be expressed.
type ExtractOptions struct {
	// Password, when non-empty, is emitted as "--password <value>".
	Password string
	// OCRLanguage, when non-empty, is emitted as "--ocr-language <value>".
	OCRLanguage string
	// OCRThreshold, when non-zero, is emitted as "--ocr-threshold <value>".
	OCRThreshold float64
	// PreserveLayout, when true, adds the "--preserve-layout" flag.
	PreserveLayout bool
	// ExtractImages, when true, adds the "--extract-images" flag.
	ExtractImages bool
	// ImageFormat, when non-empty, is emitted as "--image-format <value>".
	ImageFormat string
	// MinImageSize, when non-zero, is emitted as "--min-image-size <value>".
	MinImageSize int
}

// toArgs renders the options as a list of command-line arguments, in field
// declaration order. Unset (zero-valued) fields are skipped.
//
// A nil receiver is treated as "no options" and yields an empty slice rather
// than panicking. The returned slice is always non-nil.
//
// NOTE(review): if these arguments are placed on a subprocess command line,
// the password may be observable through process listings — confirm whether
// the CLI offers a safer channel.
func (o *ExtractOptions) toArgs() []string {
	args := []string{}
	if o == nil {
		return args
	}
	if o.Password != "" {
		args = append(args, "--password", o.Password)
	}
	if o.OCRLanguage != "" {
		args = append(args, "--ocr-language", o.OCRLanguage)
	}
	if o.OCRThreshold != 0 {
		// 'f' with precision -1 gives the shortest decimal that round-trips.
		args = append(args, "--ocr-threshold", strconv.FormatFloat(o.OCRThreshold, 'f', -1, 64))
	}
	if o.PreserveLayout {
		args = append(args, "--preserve-layout")
	}
	if o.ExtractImages {
		args = append(args, "--extract-images")
	}
	if o.ImageFormat != "" {
		args = append(args, "--image-format", o.ImageFormat)
	}
	if o.MinImageSize != 0 {
		args = append(args, "--min-image-size", strconv.Itoa(o.MinImageSize))
	}
	return args
}
|
|
|
|
// SearchOptions controls search behavior.
type SearchOptions struct {
	// CaseInsensitive, when true, adds the "--case-insensitive" flag.
	CaseInsensitive bool
	// Regex, when true, adds the "--regex" flag.
	Regex bool
	// WholeWord, when true, adds the "--whole-word" flag.
	WholeWord bool
	// MaxResults, when non-nil, is emitted as "--max-results <value>".
	// It is a pointer so that an explicit 0 can be told apart from "not set".
	MaxResults *int
}

// toArgs renders the options as a list of command-line arguments, in field
// declaration order. False flags and a nil MaxResults are skipped.
//
// A nil receiver is treated as "no options" and yields an empty slice rather
// than panicking. The returned slice is always non-nil.
func (o *SearchOptions) toArgs() []string {
	args := []string{}
	if o == nil {
		return args
	}
	if o.CaseInsensitive {
		args = append(args, "--case-insensitive")
	}
	if o.Regex {
		args = append(args, "--regex")
	}
	if o.WholeWord {
		args = append(args, "--whole-word")
	}
	if o.MaxResults != nil {
		args = append(args, "--max-results", strconv.Itoa(*o.MaxResults))
	}
	return args
}
|
|
|
|
// HashOptions controls hash computation behavior.
type HashOptions struct {
	// Password, when non-empty, is emitted as "--password <value>".
	Password string
}

// toArgs renders the options as a list of command-line arguments. An empty
// Password contributes nothing.
//
// A nil receiver is treated as "no options" and yields an empty slice rather
// than panicking. The returned slice is always non-nil.
//
// NOTE(review): if these arguments are placed on a subprocess command line,
// the password may be observable through process listings — confirm whether
// the CLI offers a safer channel.
func (o *HashOptions) toArgs() []string {
	args := []string{}
	if o == nil {
		return args
	}
	if o.Password != "" {
		args = append(args, "--password", o.Password)
	}
	return args
}
|