The indent trigger used .abs(), so it fired on both increased indent (non-indented → indented) and decreased indent (indented → non-indented). As a result, drop-cap style paragraphs (indented first line, flush-left continuation) were incorrectly split into two blocks. Per plan Phase 4.4 heuristic #2, an indent change should trigger only when the current line is MORE indented (further right, larger x0) than the block average, i.e., when a new paragraph starts after non-indented text. It should NOT trigger for decreased indent (first line indented, rest flush-left).
Fix: remove .abs() and check only whether line_x0 - block_avg_x0 > threshold.
Tests:
- test_indented_first_line_new_block: PASS (non-indented → indented splits)
- test_indented_first_line_of_paragraph_not_split: PASS (drop cap stays together)
- All 179 line module tests: PASS
283 lines
7.1 KiB
Swift
283 lines
7.1 KiB
Swift
// swiftlint:disable all
|
|
// Auto-generated from pdftract schema v1.0 - do not edit manually
|
|
|
|
import Foundation
|
|
|
|
/// A PDF document: its pages together with document-level metadata.
public struct Document: Codable, Sendable {
    /// Version of the schema this payload follows (e.g., "1.0").
    public let schemaVersion: String
    /// The document's pages.
    public let pages: [Page]
    /// Metadata describing the document.
    public let metadata: Metadata
    /// File attachments embedded in the document.
    public let attachments: [Attachment]
    /// Diagnostics that were emitted while extracting.
    public let errors: [Diagnostic]
    /// Metrics describing extraction quality, when present.
    public let extractionQuality: ExtractionQuality?
    /// Outline (bookmark) nodes of the document, when present.
    public let outlines: [OutlineNode]?

    /// JSON keys; only the multi-word properties are renamed to snake_case.
    enum CodingKeys: String, CodingKey {
        case schemaVersion = "schema_version"
        case pages, metadata, attachments, errors
        case extractionQuality = "extraction_quality"
        case outlines
    }
}
|
|
|
|
/// One page of a document.
public struct Page: Codable, Sendable {
    /// Page index counted from zero (canonical for programmatic use).
    public let pageIndex: Int
    /// Page number counted from one (= pageIndex + 1).
    public let pageNumber: Int
    /// Human-readable label taken from PDF /PageLabels (e.g., "iv", "A-3").
    public let pageLabel: String?
    /// Width of the page in points (1/72 inch).
    public let width: Double
    /// Height of the page in points (1/72 inch).
    public let height: Double
    /// Clockwise page rotation in degrees (0, 90, 180, or 270).
    public let rotation: Int
    /// Classification of the page: "text", "scanned", "mixed", "broken_vector", "blank", "figure_only".
    public let type: String
    /// Text spans (atomic units sharing consistent font and styling).
    public let spans: [Span]
    /// Semantic blocks (paragraphs, headings, lists, tables, etc.).
    public let blocks: [Block]
    /// Table structures found on the page.
    public let tables: [Table]
    /// Annotations at page level (highlights, stamps, notes, links).
    public let annotations: [Annotation]

    /// JSON keys; only the multi-word properties are renamed to snake_case.
    enum CodingKeys: String, CodingKey {
        case pageIndex = "page_index"
        case pageNumber = "page_number"
        case pageLabel = "page_label"
        case width, height, rotation, type
        case spans, blocks, tables, annotations
    }
}
|
|
|
|
/// A run of text together with its font and position information.
public struct Span: Codable, Sendable {
    /// Extracted text content of the span.
    public let text: String
    /// Bounding box in PDF user-space points, as [x0, y0, x1, y1].
    public let bbox: [Double]
    /// Name or identifier of the font.
    public let font: String
    /// Size of the font in points.
    public let size: Double
    /// Fill color as a CSS hex string (e.g., "#1a1a1a"); null if not expressible as RGB.
    public let color: String?
    /// Value of the PDF Tr operator (0-7), indicating the text rendering mode.
    public let renderingMode: Int?
    /// Confidence score (0.0 to 1.0), when provided.
    public let confidence: Double?
    /// Where the confidence/text extraction came from: "native", "heuristic", "ocr".
    public let confidenceSource: String?
    /// Detected BCP-47 language tag, if any (e.g., "en", "en-US", "zh-Hans").
    public let lang: String?
    /// Style flags: "bold", "italic", "smallcaps", "subscript", "superscript".
    public let flags: [String]
    /// Cryptographic receipt for verification, when provided.
    public let receipt: Receipt?
    /// Zero-based column index assigned by column detection.
    public let column: Int?

    /// JSON keys; only the multi-word properties are renamed to snake_case.
    enum CodingKeys: String, CodingKey {
        case text, bbox, font, size, color
        case renderingMode = "rendering_mode"
        case confidence
        case confidenceSource = "confidence_source"
        case lang, flags, receipt, column
    }
}
|
|
|
|
/// A structural block of a page (paragraph, heading, list, table, figure).
public struct Block: Codable, Sendable {
    /// Kind of block: "paragraph", "heading", "list", "table", "figure".
    public let kind: String
    /// Concatenated text content of every span in the block.
    public let text: String
    /// Bounding box in PDF user-space points, as [x0, y0, x1, y1].
    public let bbox: [Double]
    /// Heading level (1-6); optional, for heading blocks.
    public let level: Int?
    /// References to entries of the page's spans array.
    public let spans: [Int]
    /// Table index; optional, for table blocks.
    public let tableIndex: Int?
    /// Cryptographic receipt for verification, when provided.
    public let receipt: Receipt?

    /// JSON keys; only `tableIndex` is renamed to snake_case.
    enum CodingKeys: String, CodingKey {
        case kind, text, bbox, level, spans
        case tableIndex = "table_index"
        case receipt
    }
}
|
|
|
|
/// A single hit returned by a search operation.
public struct Match: Codable, Sendable {
    /// Text that was matched.
    public let text: String
    /// Number of the page on which the match occurred.
    public let page: Int
    /// Where the match is located, as [x0, y0, x1, y1].
    public let bbox: [Double]
    /// Text surrounding the match (50 chars before/after).
    public let context: MatchContext
}
|
|
|
|
/// The text surrounding a search match.
public struct MatchContext: Codable, Sendable {
    /// Text that precedes the match.
    public let before: String
    /// Text that follows the match.
    public let after: String
}
|
|
|
|
/// Hash information that fingerprints a document.
public struct Fingerprint: Codable, Sendable {
    /// SHA-256 of the document content, as hex.
    public let hash: String
    /// How many pages the document has.
    public let pageCount: Int
    /// BLAKE3 of the first 10KB, as hex.
    public let fastHash: String
    /// Metadata of the document.
    public let metadata: Metadata

    /// JSON keys; the multi-word properties are renamed to snake_case.
    enum CodingKeys: String, CodingKey {
        case hash
        case pageCount = "page_count"
        case fastHash = "fast_hash"
        case metadata
    }
}
|
|
|
|
/// The outcome of classifying a document.
public struct Classification: Codable, Sendable {
    /// The primary category.
    public let category: String
    /// Confidence score in the range 0-1.
    public let confidence: Double
    /// Tags associated with the document.
    public let tags: [String]
    /// Individual feature detections.
    public let heuristics: [String: Bool]
}
|
|
|
|
/// Metadata describing a document.
public struct Metadata: Codable, Sendable {
    /// Title of the document.
    public let title: String?
    /// Author of the document.
    public let author: String?
    /// Subject of the document.
    public let subject: String?
    /// Keywords attached to the document.
    public let keywords: [String]?
    /// Application that created the document.
    public let creator: String?
    /// Application that produced the document.
    public let producer: String?
    /// Date of creation (ISO 8601).
    public let created: String?
    /// Date of last modification (ISO 8601).
    public let modified: String?
    /// How many pages the document has.
    public let pageCount: Int
    /// Whether the PDF is encrypted.
    public let isEncrypted: Bool?

    /// JSON keys; only the multi-word properties are renamed to snake_case.
    enum CodingKeys: String, CodingKey {
        case title, author, subject, keywords
        case creator, producer, created, modified
        case pageCount = "page_count"
        case isEncrypted = "is_encrypted"
    }
}
|