// pdftract/pdftract-swift/Sources/Pdftract/Models/Document.swift

// swiftlint:disable all
// Auto-generated from pdftract schema v1.0 - do not edit manually

import Foundation

/// Top-level model for a PDF: its pages plus document-wide information.
///
/// Swift property names are camelCase; `CodingKeys` maps the multi-word ones
/// to the snake_case keys `schema_version` and `extraction_quality`.
public struct Document: Codable, Sendable {

    // MARK: Stored properties

    /// Version of the schema this value follows (for example "1.0").
    public let schemaVersion: String
    /// The pages that make up the document.
    public let pages: [Page]
    /// Metadata describing the document.
    public let metadata: Metadata
    /// File attachments embedded in the document.
    public let attachments: [Attachment]
    /// Diagnostics that were emitted while extracting.
    public let errors: [Diagnostic]
    /// Metrics describing extraction quality; optional.
    public let extractionQuality: ExtractionQuality?
    /// The document's outlines (bookmarks); optional.
    public let outlines: [OutlineNode]?

    // MARK: Key mapping

    enum CodingKeys: String, CodingKey {
        case schemaVersion = "schema_version"
        case pages, metadata, attachments, errors
        case extractionQuality = "extraction_quality"
        case outlines
    }
}

/// One page of a document, with its geometry and extracted content.
///
/// `CodingKeys` maps `pageIndex`, `pageNumber` and `pageLabel` to the
/// snake_case keys `page_index`, `page_number` and `page_label`.
public struct Page: Codable, Sendable {

    // MARK: Identity

    /// Index of the page counted from zero; the canonical value for programmatic use.
    public let pageIndex: Int
    /// Number of the page counted from one (= pageIndex + 1).
    public let pageNumber: Int
    /// Human-readable label taken from the PDF /PageLabels (for example "iv" or "A-3"); optional.
    public let pageLabel: String?

    // MARK: Geometry

    /// Width of the page, in points (1/72 inch).
    public let width: Double
    /// Height of the page, in points (1/72 inch).
    public let height: Double
    /// Clockwise rotation of the page in degrees (0, 90, 180, or 270).
    public let rotation: Int

    // MARK: Content

    /// Classification of the page: "text", "scanned", "mixed", "broken_vector", "blank", "figure_only".
    public let type: String
    /// Text spans: atomic units with consistent font and styling.
    public let spans: [Span]
    /// Semantic blocks such as paragraphs, headings, lists and tables.
    public let blocks: [Block]
    /// Table structures found on the page.
    public let tables: [Table]
    /// Annotations at page level (highlights, stamps, notes, links).
    public let annotations: [Annotation]

    // MARK: Key mapping

    enum CodingKeys: String, CodingKey {
        case pageIndex = "page_index"
        case pageNumber = "page_number"
        case pageLabel = "page_label"
        case width, height, rotation, type
        case spans, blocks, tables, annotations
    }
}

/// A run of extracted text together with its font and position.
///
/// `CodingKeys` maps `renderingMode` and `confidenceSource` to the
/// snake_case keys `rendering_mode` and `confidence_source`.
public struct Span: Codable, Sendable {

    // MARK: Text and placement

    /// Text content that was extracted for this span.
    public let text: String
    /// Bounding box [x0, y0, x1, y1] in PDF user-space points.
    public let bbox: [Double]

    // MARK: Appearance

    /// Name or identifier of the font.
    public let font: String
    /// Size of the font, in points.
    public let size: Double
    /// Fill color as a CSS hex string (e.g., "#1a1a1a"); null if not expressible as RGB.
    public let color: String?
    /// Value of the PDF Tr operator (0-7), which indicates the text rendering mode; optional.
    public let renderingMode: Int?

    // MARK: Confidence and language

    /// Confidence score between 0.0 and 1.0; optional.
    public let confidence: Double?
    /// Where the confidence/text extraction came from: "native", "heuristic", "ocr"; optional.
    public let confidenceSource: String?
    /// Detected language as a BCP-47 tag (e.g., "en", "en-US", "zh-Hans"); optional.
    public let lang: String?

    // MARK: Styling and extras

    /// Style flags that apply: "bold", "italic", "smallcaps", "subscript", "superscript".
    public let flags: [String]
    /// Cryptographic receipt usable for verification; optional.
    public let receipt: Receipt?
    /// Zero-based column index assigned by column detection; optional.
    public let column: Int?

    // MARK: Key mapping

    enum CodingKeys: String, CodingKey {
        case text, bbox, font, size, color
        case renderingMode = "rendering_mode"
        case confidence
        case confidenceSource = "confidence_source"
        case lang, flags, receipt, column
    }
}

/// A structural unit of a page: paragraph, heading, list, table, or figure.
///
/// `CodingKeys` maps `tableIndex` to the snake_case key `table_index`.
public struct Block: Codable, Sendable {

    // MARK: Stored properties

    /// Kind of block: "paragraph", "heading", "list", "table", "figure".
    public let kind: String
    /// Text of all spans in the block, concatenated.
    public let text: String
    /// Bounding box [x0, y0, x1, y1] in PDF user-space points.
    public let bbox: [Double]
    /// Heading level (1-6), for heading blocks; optional.
    public let level: Int?
    /// References into the spans array of the page.
    public let spans: [Int]
    /// Table index, for table blocks; optional.
    public let tableIndex: Int?
    /// Cryptographic receipt usable for verification; optional.
    public let receipt: Receipt?

    // MARK: Key mapping

    enum CodingKeys: String, CodingKey {
        case kind, text, bbox, level, spans
        case tableIndex = "table_index"
        case receipt
    }
}

/// A single hit produced by a search operation.
public struct Match: Codable, Sendable {

    // MARK: Stored properties

    /// Text that was matched.
    public let text: String
    /// Number of the page on which the match occurred.
    public let page: Int
    /// Where the match is located, as [x0, y0, x1, y1].
    public let bbox: [Double]
    /// Text surrounding the match (50 chars before/after).
    public let context: MatchContext

    // MARK: Key mapping

    // Every key equals its property name; spelled out explicitly.
    enum CodingKeys: String, CodingKey {
        case text, page, bbox, context
    }
}

/// The text on either side of a search match.
public struct MatchContext: Codable, Sendable {

    // MARK: Stored properties

    /// Text that precedes the match.
    public let before: String
    /// Text that follows the match.
    public let after: String

    // MARK: Key mapping

    // Every key equals its property name; spelled out explicitly.
    enum CodingKeys: String, CodingKey {
        case before, after
    }
}

/// Hash-based fingerprint of a document.
///
/// `CodingKeys` maps `pageCount` and `fastHash` to the snake_case keys
/// `page_count` and `fast_hash`.
public struct Fingerprint: Codable, Sendable {

    // MARK: Stored properties

    /// Hex-encoded SHA-256 of the document content.
    public let hash: String
    /// How many pages the document has.
    public let pageCount: Int
    /// Hex-encoded BLAKE3 of the first 10KB.
    public let fastHash: String
    /// Metadata of the document.
    public let metadata: Metadata

    // MARK: Key mapping

    enum CodingKeys: String, CodingKey {
        case hash
        case pageCount = "page_count", fastHash = "fast_hash"
        case metadata
    }
}

/// The outcome of classifying a document.
public struct Classification: Codable, Sendable {

    // MARK: Stored properties

    /// The primary category.
    public let category: String
    /// Confidence score in the range 0-1.
    public let confidence: Double
    /// Tags that are associated with the document.
    public let tags: [String]
    /// Individual feature detections, as named boolean flags.
    public let heuristics: [String: Bool]

    // MARK: Key mapping

    // Every key equals its property name; spelled out explicitly.
    enum CodingKeys: String, CodingKey {
        case category, confidence, tags, heuristics
    }
}

/// Descriptive metadata for a document.
///
/// Only `pageCount` is non-optional. `CodingKeys` maps `pageCount` and
/// `isEncrypted` to the snake_case keys `page_count` and `is_encrypted`.
public struct Metadata: Codable, Sendable {

    // MARK: Descriptive fields

    /// Title of the document; optional.
    public let title: String?
    /// Author of the document; optional.
    public let author: String?
    /// Subject of the document; optional.
    public let subject: String?
    /// Keywords; optional.
    public let keywords: [String]?

    // MARK: Provenance

    /// Application recorded as the creator; optional.
    public let creator: String?
    /// Application recorded as the producer; optional.
    public let producer: String?
    /// Date of creation (ISO 8601); optional.
    public let created: String?
    /// Date of modification (ISO 8601); optional.
    public let modified: String?

    // MARK: Document facts

    /// How many pages the document has.
    public let pageCount: Int
    /// Whether the PDF is encrypted; optional.
    public let isEncrypted: Bool?

    // MARK: Key mapping

    enum CodingKeys: String, CodingKey {
        case title, author, subject, keywords
        case creator, producer, created, modified
        case pageCount = "page_count"
        case isEncrypted = "is_encrypted"
    }
}