pdftract/pdftract-swift/Sources/Pdftract/Models/Document.swift
jedarden d0f52751ce fix(pdftract-39gey): fix indent trigger to not split drop-cap paragraphs
The indent trigger was using .abs() which fired on both increased indent
(non-indented → indented) AND decreased indent (indented → non-indented).
This caused drop-cap style paragraphs (indented first line, flush-left
continuation) to incorrectly split into two blocks.

Per plan Phase 4.4 heuristic #2, indent change should only trigger when the
current line is MORE indented (to the right, larger x0) than the block
average - i.e., a new paragraph starting after non-indented text. It should
NOT trigger for decreased indent (first line indented, rest flush-left).

Fix: Remove .abs() and only check if line_x0 - block_avg_x0 > threshold.

Tests:
- test_indented_first_line_new_block: PASS (non-indented → indented splits)
- test_indented_first_line_of_paragraph_not_split: PASS (drop cap stays together)
- All 179 line module tests: PASS
2026-06-07 13:43:19 -04:00

283 lines
7.1 KiB
Swift

// swiftlint:disable all
// Auto-generated from pdftract schema v1.0 - do not edit manually
import Foundation
/// Top-level extraction result for one PDF: its pages plus document-wide data.
///
/// Conforms to `Codable`; `schemaVersion` and `extractionQuality` are stored
/// under the snake_case keys declared in `CodingKeys`.
public struct Document: Codable, Sendable {
    /// Version of the schema this value follows (for example `"1.0"`).
    public let schemaVersion: String

    /// The document's pages.
    public let pages: [Page]

    /// Metadata describing the document as a whole.
    public let metadata: Metadata

    /// File attachments embedded in the document.
    public let attachments: [Attachment]

    /// Diagnostics that were emitted while extracting the document.
    public let errors: [Diagnostic]

    /// Metrics describing extraction quality; optional.
    public let extractionQuality: ExtractionQuality?

    /// Outline (bookmark) nodes of the document; optional.
    public let outlines: [OutlineNode]?

    /// Serialized key names; only the camelCase properties are renamed.
    enum CodingKeys: String, CodingKey {
        case schemaVersion = "schema_version"
        case pages, metadata, attachments, errors
        case extractionQuality = "extraction_quality"
        case outlines
    }
}
/// One page of a document, with its geometry and extracted content.
public struct Page: Codable, Sendable {
    /// Index of the page counted from zero; the canonical value for programmatic use.
    public let pageIndex: Int

    /// Page number counted from one (equal to `pageIndex + 1`).
    public let pageNumber: Int

    /// Human-readable label taken from the PDF `/PageLabels` entry
    /// (for example `"iv"` or `"A-3"`); optional.
    public let pageLabel: String?

    /// Width of the page, in points (1/72 inch).
    public let width: Double

    /// Height of the page, in points (1/72 inch).
    public let height: Double

    /// Clockwise rotation of the page in degrees (0, 90, 180, or 270).
    public let rotation: Int

    /// Classification of the page: `"text"`, `"scanned"`, `"mixed"`,
    /// `"broken_vector"`, `"blank"`, or `"figure_only"`.
    public let type: String

    /// Text spans: atomic units with consistent font and styling.
    public let spans: [Span]

    /// Semantic blocks such as paragraphs, headings, lists, and tables.
    public let blocks: [Block]

    /// Table structures found on the page.
    public let tables: [Table]

    /// Annotations attached to the page (highlights, stamps, notes, links).
    public let annotations: [Annotation]

    /// Serialized key names; the three `page*` properties use snake_case.
    enum CodingKeys: String, CodingKey {
        case pageIndex = "page_index"
        case pageNumber = "page_number"
        case pageLabel = "page_label"
        case width, height, rotation, type
        case spans, blocks, tables, annotations
    }
}
/// A run of extracted text together with its font and position information.
public struct Span: Codable, Sendable {
    /// Extracted text content of the span.
    public let text: String

    /// Bounding box `[x0, y0, x1, y1]` in PDF user-space points.
    public let bbox: [Double]

    /// Name or identifier of the font.
    public let font: String

    /// Font size, in points.
    public let size: Double

    /// Fill color as a CSS hex string (for example `"#1a1a1a"`);
    /// `nil` when the color cannot be expressed as RGB.
    public let color: String?

    /// Value of the PDF `Tr` operator (0-7), i.e. the text rendering mode; optional.
    public let renderingMode: Int?

    /// Confidence score between 0.0 and 1.0; optional.
    public let confidence: Double?

    /// Origin of the confidence/text extraction: `"native"`, `"heuristic"`, or `"ocr"`; optional.
    public let confidenceSource: String?

    /// BCP-47 language tag when one was detected (for example `"en"`, `"en-US"`, `"zh-Hans"`).
    public let lang: String?

    /// Style flags: `"bold"`, `"italic"`, `"smallcaps"`, `"subscript"`, `"superscript"`.
    public let flags: [String]

    /// Cryptographic receipt used for verification; optional.
    public let receipt: Receipt?

    /// Zero-based column index assigned by column detection; optional.
    public let column: Int?

    /// Serialized key names; `renderingMode` and `confidenceSource` use snake_case.
    enum CodingKeys: String, CodingKey {
        case text, bbox, font, size, color
        case renderingMode = "rendering_mode"
        case confidence
        case confidenceSource = "confidence_source"
        case lang, flags, receipt, column
    }
}
/// A structural unit of a page: paragraph, heading, list, table, or figure.
public struct Block: Codable, Sendable {
    /// Kind of block: `"paragraph"`, `"heading"`, `"list"`, `"table"`, or `"figure"`.
    public let kind: String

    /// Concatenated text of every span that belongs to the block.
    public let text: String

    /// Bounding box `[x0, y0, x1, y1]` in PDF user-space points.
    public let bbox: [Double]

    /// Heading level (1-6); present only for heading blocks.
    public let level: Int?

    /// References into the owning page's `spans` array.
    public let spans: [Int]

    /// Table index; present only for table blocks.
    public let tableIndex: Int?

    /// Cryptographic receipt used for verification; optional.
    public let receipt: Receipt?

    /// Serialized key names; only `tableIndex` is renamed.
    enum CodingKeys: String, CodingKey {
        case kind, text, bbox, level, spans
        case tableIndex = "table_index"
        case receipt
    }
}
/// A single hit produced by a search operation.
public struct Match: Codable, Sendable {
    /// Text that matched.
    public let text: String

    /// Number of the page on which the match occurred.
    public let page: Int

    /// Location of the match as `[x0, y0, x1, y1]`.
    public let bbox: [Double]

    /// Text surrounding the match (50 characters before and after).
    public let context: MatchContext
}
/// Text surrounding a search match.
public struct MatchContext: Codable, Sendable {
    /// Text that precedes the match.
    public let before: String

    /// Text that follows the match.
    public let after: String
}
/// Hash-based fingerprint of a document.
public struct Fingerprint: Codable, Sendable {
    /// SHA-256 of the document content, as a hex string.
    public let hash: String

    /// Number of pages in the document.
    public let pageCount: Int

    /// BLAKE3 of the first 10KB, as a hex string.
    public let fastHash: String

    /// Metadata of the document.
    public let metadata: Metadata

    /// Serialized key names; `pageCount` and `fastHash` use snake_case.
    enum CodingKeys: String, CodingKey {
        case hash
        case pageCount = "page_count"
        case fastHash = "fast_hash"
        case metadata
    }
}
/// Outcome of classifying a document.
public struct Classification: Codable, Sendable {
    /// Primary category assigned to the document.
    public let category: String

    /// Confidence score in the range 0-1.
    public let confidence: Double

    /// Tags associated with the document.
    public let tags: [String]

    /// Individual feature detections, keyed by name.
    public let heuristics: [String: Bool]
}
/// Descriptive metadata of a document.
///
/// Every field except `pageCount` is optional.
public struct Metadata: Codable, Sendable {
    /// Title of the document.
    public let title: String?

    /// Author of the document.
    public let author: String?

    /// Subject of the document.
    public let subject: String?

    /// Keywords attached to the document.
    public let keywords: [String]?

    /// Application recorded as the creator.
    public let creator: String?

    /// Application recorded as the producer.
    public let producer: String?

    /// Creation date (ISO 8601).
    public let created: String?

    /// Modification date (ISO 8601).
    public let modified: String?

    /// Number of pages.
    public let pageCount: Int

    /// Whether the PDF is encrypted.
    public let isEncrypted: Bool?

    /// Serialized key names; `pageCount` and `isEncrypted` use snake_case.
    enum CodingKeys: String, CodingKey {
        case title, author, subject, keywords
        case creator, producer, created, modified
        case pageCount = "page_count"
        case isEncrypted = "is_encrypted"
    }
}