Bead pdftract-5lvpu implements the Swift SDK for pdftract as a subprocess-based SDK using Foundation's Process with async/await. It targets macOS 13+ and Linux only, and explicitly excludes iOS due to Apple's subprocess restrictions.

Acceptance criteria status:
- PASS: SPM package structure (Package.swift configured)
- PASS: All 9 contract methods exposed in Methods.swift
- PASS: All 8 error cases defined in Error.swift
- PASS: iOS documented as unsupported in README.md
- PASS: CI workflow configured (pdftract-swift-publish.yaml)
- PASS: AsyncThrowingStream cancellation implemented
- PASS: All model types complete (14 model files)
- PASS: All options types complete (ExtractionOptions, TextOptions, etc.)
- PASS: Conformance test suite defined (ConformanceTests.swift)
- PASS: Cross-platform Process support (ProcessRunner actor)

Files updated:
- swift-sdk/README.md: Fixed GitHub URL from placeholder to jedarden/pdftract-swift

Verification note: notes/pdftract-5lvpu.md

References:
- Plan: SDK Architecture / The Ten SDKs, line 3480
- Plan: SDK Architecture / Per-SDK Release Channels, line 3577
- Plan: SDK Acceptance Criteria, lines 3581-3589
- ADR-009: Argo Workflows on iad-ci only
196 lines
6.1 KiB
Swift
196 lines
6.1 KiB
Swift
//
|
|
// Document.swift
|
|
// Pdftract
|
|
//
|
|
// Core document model representing a fully extracted PDF.
|
|
//
|
|
|
|
import Foundation
|
|
|
|
/// Root value produced by a PDF extraction.
///
/// This mirrors the canonical JSON output format: document-level metadata,
/// an array of page objects, and the document-scoped collections declared
/// below. Encoding and decoding are compiler-synthesized from `CodingKeys`.
///
/// NOTE(review): `Thread` is expected to resolve to this SDK's own article
/// thread model rather than `Foundation.Thread` (which is neither `Codable`
/// nor `Equatable`) — confirm a module-level `Thread` type is declared.
public struct Document: Codable, Equatable {

    // MARK: - Stored properties

    /// Fingerprint of the PDF (used for receipt generation).
    public let fingerprint: String

    /// The extracted pages; each one carries its spans and blocks.
    public var pages: [Page]

    /// Information about the extraction run itself.
    public let metadata: ExtractionMetadata

    /// Digital signatures found in the document.
    public var signatures: [Signature]

    /// Interactive form fields found in the document.
    public var formFields: [FormField]

    /// Hyperlinks scoped to the document as a whole.
    public var links: [Link]

    /// Files embedded in the document as attachments.
    public var attachments: [Attachment]

    /// Article thread chains found in the document.
    public var threads: [Thread]

    /// JavaScript actions detected in the document.
    public var javascriptActions: [JavascriptAction]

    // MARK: - Serialization

    /// Serialized key names. Only the two multi-word properties are renamed
    /// (to snake_case); every other key equals its property name.
    enum CodingKeys: String, CodingKey {
        case fingerprint, pages, metadata, signatures
        case formFields = "form_fields"
        case links, attachments, threads
        case javascriptActions = "javascript_actions"
    }

    // MARK: - Initialization

    /// Creates a document from its parts.
    ///
    /// Only `fingerprint` and `metadata` are required; every collection
    /// defaults to an empty array.
    ///
    /// - Parameters:
    ///   - fingerprint: Fingerprint of the PDF.
    ///   - pages: Extracted pages.
    ///   - metadata: Information about the extraction run.
    ///   - signatures: Digital signatures.
    ///   - formFields: Interactive form fields.
    ///   - links: Document-scoped hyperlinks.
    ///   - attachments: Embedded file attachments.
    ///   - threads: Article thread chains.
    ///   - javascriptActions: Detected JavaScript actions.
    public init(
        fingerprint: String,
        pages: [Page] = [],
        metadata: ExtractionMetadata,
        signatures: [Signature] = [],
        formFields: [FormField] = [],
        links: [Link] = [],
        attachments: [Attachment] = [],
        threads: [Thread] = [],
        javascriptActions: [JavascriptAction] = []
    ) {
        // Required, immutable identity of the extraction.
        self.fingerprint = fingerprint
        self.metadata = metadata

        // Page content.
        self.pages = pages

        // Document-scoped collections.
        self.signatures = signatures
        self.formFields = formFields
        self.links = links
        self.attachments = attachments
        self.threads = threads
        self.javascriptActions = javascriptActions
    }
}
|
|
|
|
/// Describes one extraction run: its counters, diagnostics, cache outcome
/// and any profile that was applied.
///
/// Encoding and decoding are compiler-synthesized; `CodingKeys` maps the
/// camelCase property names to snake_case serialized keys.
public struct ExtractionMetadata: Codable, Equatable {

    // MARK: - Counters

    /// Total number of pages in the document.
    public let pageCount: UInt

    /// Receipts mode that was in effect for this extraction.
    public let receiptsMode: ReceiptsMode

    /// How many spans were extracted.
    public let spanCount: UInt

    /// How many blocks were extracted.
    public let blockCount: UInt

    /// How many pages failed to extract.
    public let errorCount: UInt

    // MARK: - Diagnostics and cache

    /// Diagnostics emitted while extracting (coverage warnings, etc.).
    public var diagnostics: [String]

    /// Cache status: "hit", "miss", or "skipped".
    public var cacheStatus: String?

    /// Age of the cache entry in seconds (only present when cache_status == "hit").
    public var cacheAgeSeconds: UInt64?

    /// Name of the reading order algorithm used for this extraction.
    public var readingOrderAlgorithm: String?

    // MARK: - Profile (Phase 7.10)

    /// Name of the applied profile, if any (Phase 7.10).
    public var profileName: String?

    /// Version of the applied profile, if any (Phase 7.10).
    public var profileVersion: String?

    /// Fields extracted via the applied profile, if any (Phase 7.10).
    public var profileFields: [String: String]?

    // MARK: - Serialization

    /// Serialized key names. `diagnostics` is the only key that matches its
    /// property name; all others are the snake_case form of the property.
    enum CodingKeys: String, CodingKey {
        case pageCount = "page_count"
        case receiptsMode = "receipts_mode"
        case spanCount = "span_count"
        case blockCount = "block_count"
        case errorCount = "error_count"
        case diagnostics
        case cacheStatus = "cache_status"
        case cacheAgeSeconds = "cache_age_seconds"
        case readingOrderAlgorithm = "reading_order_algorithm"
        case profileName = "profile_name"
        case profileVersion = "profile_version"
        case profileFields = "profile_fields"
    }

    // MARK: - Initialization

    /// Creates extraction metadata.
    ///
    /// The five counters/mode values are required. `diagnostics` defaults to
    /// an empty array and every optional value defaults to `nil`.
    ///
    /// - Parameters:
    ///   - pageCount: Total number of pages in the document.
    ///   - receiptsMode: Receipts mode used for the extraction.
    ///   - spanCount: Number of spans extracted.
    ///   - blockCount: Number of blocks extracted.
    ///   - errorCount: Number of pages that failed to extract.
    ///   - diagnostics: Diagnostics emitted during extraction.
    ///   - cacheStatus: Cache status string.
    ///   - cacheAgeSeconds: Cache entry age in seconds.
    ///   - readingOrderAlgorithm: Reading order algorithm used.
    ///   - profileName: Name of the applied profile.
    ///   - profileVersion: Version of the applied profile.
    ///   - profileFields: Fields extracted via the applied profile.
    public init(
        pageCount: UInt,
        receiptsMode: ReceiptsMode,
        spanCount: UInt,
        blockCount: UInt,
        errorCount: UInt,
        diagnostics: [String] = [],
        cacheStatus: String? = nil,
        cacheAgeSeconds: UInt64? = nil,
        readingOrderAlgorithm: String? = nil,
        profileName: String? = nil,
        profileVersion: String? = nil,
        profileFields: [String: String]? = nil
    ) {
        // Required values.
        self.receiptsMode = receiptsMode
        self.pageCount = pageCount
        self.spanCount = spanCount
        self.blockCount = blockCount
        self.errorCount = errorCount

        // Diagnostics, cache and algorithm details.
        self.diagnostics = diagnostics
        self.cacheStatus = cacheStatus
        self.cacheAgeSeconds = cacheAgeSeconds
        self.readingOrderAlgorithm = readingOrderAlgorithm

        // Profile details.
        self.profileName = profileName
        self.profileVersion = profileVersion
        self.profileFields = profileFields
    }
}
|
|
|
|
/// Receipt generation mode.
///
/// The `String` raw value of each case is what the synthesized `Codable`
/// conformance reads and writes, so the raw values are spelled out
/// explicitly instead of being left implicit.
///
/// The enum has no associated values and is declared `Sendable` explicitly:
/// public types do not receive an implicit `Sendable` conformance, and
/// without it clients cannot pass values across actor or task boundaries
/// under strict concurrency checking.
public enum ReceiptsMode: String, Codable, Equatable, Sendable {
    /// No receipts generated (default).
    case off = "off"

    /// Lite mode: minimal receipts (~120 bytes each) with fingerprint, page index, bbox, and content hash.
    case lite = "lite"

    /// SVG mode: extended receipts that include an SVG clip rendering the glyphs.
    case svg = "svg"
}
|
|
|
|
/// JavaScript action found in a PDF.
///
/// Both stored properties are immutable `String`s, so the type is declared
/// `Sendable` explicitly: public types do not receive an implicit `Sendable`
/// conformance, and without it clients cannot pass values across actor or
/// task boundaries under strict concurrency checking.
public struct JavascriptAction: Codable, Equatable, Sendable {
    /// Location of the JavaScript action in the PDF structure.
    /// Examples: "catalog.openaction", "page.0.aa.O", "page.1.annot.0.A".
    public let location: String

    /// Truncated excerpt of the JavaScript code (first 200 characters).
    ///
    /// This type stores the excerpt exactly as given; it performs no
    /// truncation of its own.
    public let codeExcerpt: String

    /// Serialized key names; `codeExcerpt` is stored as `code_excerpt`.
    enum CodingKeys: String, CodingKey {
        case location
        case codeExcerpt = "code_excerpt"
    }

    /// Creates a new JavascriptAction structure.
    ///
    /// - Parameters:
    ///   - location: Location of the action in the PDF structure.
    ///   - codeExcerpt: Excerpt of the JavaScript code.
    public init(
        location: String,
        codeExcerpt: String
    ) {
        self.location = location
        self.codeExcerpt = codeExcerpt
    }
}
|