pdftract/pdftract-swift/Sources/PdftractCodegen/Methods.swift
jedarden d0f52751ce fix(pdftract-39gey): fix indent trigger to not split drop-cap paragraphs
The indent trigger was using .abs() which fired on both increased indent
(non-indented → indented) AND decreased indent (indented → non-indented).
This caused drop-cap style paragraphs (indented first line, flush-left
continuation) to incorrectly split into two blocks.

Per plan Phase 4.4 heuristic #2, indent change should only trigger when the
current line is MORE indented (to the right, larger x0) than the block
average - i.e., a new paragraph starting after non-indented text. It should
NOT trigger for decreased indent (first line indented, rest flush-left).

Fix: Remove .abs() and only check if line_x0 - block_avg_x0 > threshold.

Tests:
- test_indented_first_line_new_block: PASS (non-indented → indented splits)
- test_indented_first_line_of_paragraph_not_split: PASS (drop cap stays together)
- All 179 line module tests: PASS
2026-06-07 13:43:19 -04:00

557 lines
18 KiB
Swift

//
// This file is auto-generated. Do not edit manually.
//
#if os(Linux)
import Foundation
#else
import Foundation
#endif
/// Main Pdftract client for extracting data from PDFs.
/// Uses the bundled pdftract binary via Process spawning.
public struct Pdftract {
private let binaryPath: String
/// Creates a new Pdftract client.
///
/// - Parameter binaryPath: Path to the pdftract binary. When `nil`, the
///   directories listed in `PATH` are searched via `findBinary()`; if that
///   search finds nothing, the bare name `"pdftract"` is stored instead.
public init(binaryPath: String? = nil) {
    // `??` short-circuits, so `PATH` is only scanned when no explicit path was supplied.
    self.binaryPath = binaryPath ?? Self.findBinary() ?? "pdftract"
}
/// Locates the pdftract binary by scanning the directories listed in `PATH`.
///
/// - Returns: The path of the first non-directory file named `pdftract`
///   (or `pdftract.exe` on Windows) found in a `PATH` directory, or `nil`
///   when `PATH` is unset/empty or no directory contains one.
private static func findBinary() -> String? {
    // `PATH` entries are ":"-separated on every platform except Windows,
    // which uses ";". Keying the check on Windows (rather than on Linux)
    // keeps macOS and the other Unix-like platforms on the ":" branch.
    #if os(Windows)
    let separator: Character = ";"
    let candidateNames = ["pdftract.exe", "pdftract"]
    #else
    let separator: Character = ":"
    let candidateNames = ["pdftract"]
    #endif

    let searchPath = ProcessInfo.processInfo.environment["PATH"] ?? ""
    for directory in searchPath.split(separator: separator) {
        for name in candidateNames {
            let candidate = NSString.path(withComponents: [String(directory), name])
            // A directory that happens to be called "pdftract" is not the binary.
            var isDirectory: ObjCBool = false
            if FileManager.default.fileExists(atPath: candidate, isDirectory: &isDirectory),
               !isDirectory.boolValue {
                return candidate
            }
        }
    }
    return nil
}
/// Executes the pdftract binary with the given arguments and collects its output.
///
/// Both output pipes are drained *before* waiting for the child to exit.
/// Waiting first would deadlock as soon as the child writes more than the
/// pipe buffer can hold, because it would block on `write` while this
/// method blocks in `waitUntilExit()`.
///
/// - Parameter args: Command-line arguments to pass.
/// - Returns: The child's stdout decoded as UTF-8 (an empty string if the
///   bytes are not valid UTF-8).
/// - Throws: `PdftractError` with code `-1` if the process cannot be
///   launched, or the error produced by `mapError` when the process exits
///   with a non-zero status.
private func exec(_ args: [String]) async throws -> String {
    let process = Process()
    process.executableURL = URL(fileURLWithPath: binaryPath)
    let outPipe = Pipe()
    let errPipe = Pipe()
    process.standardOutput = outPipe
    process.standardError = errPipe
    process.arguments = args

    do {
        try process.run()
    } catch {
        throw PdftractError("Failed to execute pdftract: \(error.localizedDescription)", -1)
    }

    // Drain stderr concurrently so a chatty child cannot stall on a full
    // stderr pipe while stdout is still being read below.
    let errHandle = errPipe.fileHandleForReading
    let stderrReader = Task.detached { errHandle.readDataToEndOfFile() }

    let outData = outPipe.fileHandleForReading.readDataToEndOfFile()
    let errData = await stderrReader.value
    // Both pipes have reached EOF, so this only reaps the exit status.
    process.waitUntilExit()

    guard process.terminationStatus == 0 else {
        // Decode stderr lossily: a diagnostic with a stray invalid byte is
        // still more useful than an empty message.
        let stderr = String(decoding: errData, as: UTF8.self)
        throw mapError(stderr, Int(process.terminationStatus))
    }
    return String(data: outData, encoding: .utf8) ?? ""
}
/// Maps a CLI exit code to the matching Swift error.
///
/// Recognized codes: 2 → `CorruptPdfError`, 3 → `EncryptionError`,
/// 4 → `SourceUnreachableError`, 5 → `RemoteFetchInterruptedError`,
/// 6 → `TlsError`, 10 → `ReceiptVerifyError`. Every other code yields a
/// plain `PdftractError`.
///
/// - Parameters:
///   - stderr: The stderr output from the process, used as the error message.
///   - exitCode: The process exit code.
/// - Returns: A `PdftractError` (or one of its more specific variants)
///   carrying `stderr` and `exitCode`.
private func mapError(_ stderr: String, _ exitCode: Int) -> PdftractError {
    // `exitCode` is a non-optional `Int`, so it is switched on directly;
    // optional binding on it would not compile.
    switch exitCode {
    case 2:
        return CorruptPdfError(stderr, exitCode)
    case 3:
        return EncryptionError(stderr, exitCode)
    case 4:
        return SourceUnreachableError(stderr, exitCode)
    case 5:
        return RemoteFetchInterruptedError(stderr, exitCode)
    case 6:
        return TlsError(stderr, exitCode)
    case 10:
        return ReceiptVerifyError(stderr, exitCode)
    default:
        return PdftractError(stderr, exitCode)
    }
}
/// Extracts structured data from a PDF.
///
/// Runs `pdftract extract --json` followed by the source and option
/// arguments, then decodes the JSON written to stdout.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - options: Extraction options.
/// - Returns: The complete document structure.
/// - Throws: `PdftractError` if the process fails, or a decoding error if
///   the output is not a valid `Document`.
public func extract(
    _ source: Source,
    options: ExtractOptions = ExtractOptions()
) async throws -> Document {
    let sourceArguments = try source.toArgs()
    let optionArguments = options.toArgs()

    var arguments = ["extract", "--json"]
    arguments.append(contentsOf: sourceArguments)
    arguments.append(contentsOf: optionArguments)

    let rawOutput = try await exec(arguments)
    guard let payload = rawOutput.data(using: .utf8) else {
        throw PdftractError("Failed to decode output", -1)
    }

    let decoder = JSONDecoder()
    return try decoder.decode(Document.self, from: payload)
}
/// Extracts plain text from a PDF.
///
/// Runs `pdftract extract <source> <options> --text --json`, decodes the
/// output as a `Document`, and rebuilds the text from its blocks: blocks
/// within a page are joined with a newline, pages with a blank line.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - options: Extraction options.
/// - Returns: The extracted text.
/// - Throws: `PdftractError` if the process fails or its output cannot be
///   decoded as a `Document`.
public func extractText(
    _ source: Source,
    options: ExtractOptions = ExtractOptions()
) async throws -> String {
    var arguments = ["extract"]
    let sourceArguments = try source.toArgs()
    arguments.append(contentsOf: sourceArguments)
    arguments.append(contentsOf: options.toArgs())
    arguments.append(contentsOf: ["--text", "--json"])

    let rawOutput = try await exec(arguments)

    // Any decoding failure is reported as one generic error.
    guard let payload = rawOutput.data(using: .utf8),
          let document = try? JSONDecoder().decode(Document.self, from: payload) else {
        throw PdftractError("Failed to decode JSON output", -1)
    }

    // One string per page, each made of that page's block texts.
    var pageTexts: [String] = []
    for page in document.pages {
        let blockTexts = page.blocks.map { block in block.text }
        pageTexts.append(blockTexts.joined(separator: "\n"))
    }
    return pageTexts.joined(separator: "\n\n")
}
/// Extracts Markdown-formatted text from a PDF.
///
/// Runs `pdftract extract <source> <options> --md --json`, decodes the
/// output as a `Document`, and joins the block texts: blocks within a page
/// are separated by a newline, pages by a blank line.
///
/// NOTE(review): apart from the `--md` flag this is the same
/// reconstruction as `extractText`; whether the block texts actually carry
/// Markdown depends on the CLI — confirm.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - options: Extraction options.
/// - Returns: The extracted text.
/// - Throws: `PdftractError` if the process fails or its output cannot be
///   decoded as a `Document`.
public func extractMarkdown(
    _ source: Source,
    options: ExtractOptions = ExtractOptions()
) async throws -> String {
    var arguments = ["extract"]
    let sourceArguments = try source.toArgs()
    arguments.append(contentsOf: sourceArguments)
    arguments.append(contentsOf: options.toArgs())
    arguments.append(contentsOf: ["--md", "--json"])

    let rawOutput = try await exec(arguments)

    // Any decoding failure is reported as one generic error.
    guard let payload = rawOutput.data(using: .utf8),
          let document = try? JSONDecoder().decode(Document.self, from: payload) else {
        throw PdftractError("Failed to decode JSON output", -1)
    }

    // One string per page, each made of that page's block texts.
    var pageTexts: [String] = []
    for page in document.pages {
        let blockTexts = page.blocks.map { block in block.text }
        pageTexts.append(blockTexts.joined(separator: "\n"))
    }
    return pageTexts.joined(separator: "\n\n")
}
/// Extracts pages from a PDF as an async stream.
///
/// Runs `pdftract extract --ndjson` and decodes each newline-delimited
/// line of stdout as a `Page`, yielding pages as they arrive. Lines that
/// fail to decode are skipped.
///
/// This function itself does not throw. Failures (invalid source
/// arguments, launch errors, or a non-zero exit status mapped through
/// `mapError`) are delivered by the stream finishing with an error.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - options: Extraction options.
/// - Returns: An `AsyncThrowingStream` that yields `Page` values.
public func extractStream(
    _ source: Source,
    options: ExtractOptions = ExtractOptions()
) -> AsyncThrowingStream<Page, Error> {
    return AsyncThrowingStream { continuation in
        Task {
            var args = ["extract", "--ndjson"]
            do {
                args.append(contentsOf: try source.toArgs())
                args.append(contentsOf: options.toArgs())
            } catch {
                continuation.finish(throwing: error)
                return
            }

            let process = Process()
            process.executableURL = URL(fileURLWithPath: binaryPath)
            let outPipe = Pipe()
            let errPipe = Pipe()
            process.standardOutput = outPipe
            process.standardError = errPipe
            process.arguments = args

            // Stop the child when the consumer cancels or drops the stream.
            // This handler also runs after a normal `finish` and after a
            // failed launch, so only signal a process that is running:
            // Foundation raises on `terminate()` for a never-launched one.
            // The reading loop below reaps the process, so no wait here.
            continuation.onTermination = { @Sendable _ in
                if process.isRunning {
                    process.terminate()
                }
            }

            do {
                try process.run()
                let outHandle = outPipe.fileHandleForReading
                let errHandle = errPipe.fileHandleForReading

                // Drain stderr concurrently so the child cannot stall on a
                // full stderr pipe while stdout is being streamed.
                let stderrReader = Task.detached { errHandle.readDataToEndOfFile() }

                let decoder = JSONDecoder()
                let newline = UInt8(ascii: "\n")
                let readSize = 4096

                // Decodes one NDJSON line and yields it. Empty and malformed
                // lines are skipped on purpose: a failing run is reported
                // through the exit status instead.
                func emit(_ line: Data) {
                    guard !line.isEmpty,
                          let page = try? decoder.decode(Page.self, from: line) else {
                        return
                    }
                    continuation.yield(page)
                }

                // Read until EOF rather than until the process exits: a child
                // that has already exited can still have unread output in
                // the pipe.
                var buffer = [UInt8]()
                while true {
                    let chunk = outHandle.readData(ofLength: readSize)
                    if chunk.isEmpty {
                        break
                    }
                    buffer.append(contentsOf: chunk)

                    // Emit every complete line, then drop the consumed prefix once.
                    var lineStart = buffer.startIndex
                    while let newlineIndex = buffer[lineStart...].firstIndex(of: newline) {
                        emit(Data(buffer[lineStart..<newlineIndex]))
                        lineStart = newlineIndex + 1
                    }
                    buffer.removeSubrange(..<lineStart)
                }

                // A final line without a trailing newline.
                emit(Data(buffer))

                process.waitUntilExit()
                let errData = await stderrReader.value
                if process.terminationStatus != 0 {
                    let stderr = String(decoding: errData, as: UTF8.self)
                    continuation.finish(throwing: mapError(stderr, Int(process.terminationStatus)))
                } else {
                    continuation.finish()
                }
            } catch {
                continuation.finish(throwing: error)
            }
        }
    }
}
/// Searches for text in a PDF.
///
/// Runs `pdftract grep <pattern>` followed by the source and option
/// arguments, and decodes each newline-delimited line of stdout as a
/// `Match`, yielding matches as they arrive. Lines that fail to decode are
/// skipped.
///
/// This function itself does not throw. Failures (invalid source
/// arguments, launch errors, or a non-zero exit status mapped through
/// `mapError`) are delivered by the stream finishing with an error.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - pattern: The text pattern to search for.
///   - options: Search options.
/// - Returns: An `AsyncThrowingStream` that yields `Match` values.
public func search(
    _ source: Source,
    _ pattern: String,
    options: SearchOptions = SearchOptions()
) -> AsyncThrowingStream<Match, Error> {
    return AsyncThrowingStream { continuation in
        Task {
            var args = ["grep", pattern]
            do {
                args.append(contentsOf: try source.toArgs())
                args.append(contentsOf: options.toArgs())
            } catch {
                continuation.finish(throwing: error)
                return
            }

            let process = Process()
            process.executableURL = URL(fileURLWithPath: binaryPath)
            let outPipe = Pipe()
            let errPipe = Pipe()
            process.standardOutput = outPipe
            process.standardError = errPipe
            process.arguments = args

            // Stop the child when the consumer cancels or drops the stream.
            // This handler also runs after a normal `finish` and after a
            // failed launch, so only signal a process that is running:
            // Foundation raises on `terminate()` for a never-launched one.
            // The reading loop below reaps the process, so no wait here.
            continuation.onTermination = { @Sendable _ in
                if process.isRunning {
                    process.terminate()
                }
            }

            do {
                try process.run()
                let outHandle = outPipe.fileHandleForReading
                let errHandle = errPipe.fileHandleForReading

                // Drain stderr concurrently so the child cannot stall on a
                // full stderr pipe while stdout is being streamed.
                let stderrReader = Task.detached { errHandle.readDataToEndOfFile() }

                let decoder = JSONDecoder()
                let newline = UInt8(ascii: "\n")
                let readSize = 4096

                // Decodes one NDJSON line and yields it. Empty and malformed
                // lines are skipped on purpose: a failing run is reported
                // through the exit status instead.
                func emit(_ line: Data) {
                    guard !line.isEmpty,
                          let match = try? decoder.decode(Match.self, from: line) else {
                        return
                    }
                    continuation.yield(match)
                }

                // Read until EOF rather than until the process exits: a child
                // that has already exited can still have unread output in
                // the pipe.
                var buffer = [UInt8]()
                while true {
                    let chunk = outHandle.readData(ofLength: readSize)
                    if chunk.isEmpty {
                        break
                    }
                    buffer.append(contentsOf: chunk)

                    // Emit every complete line, then drop the consumed prefix once.
                    var lineStart = buffer.startIndex
                    while let newlineIndex = buffer[lineStart...].firstIndex(of: newline) {
                        emit(Data(buffer[lineStart..<newlineIndex]))
                        lineStart = newlineIndex + 1
                    }
                    buffer.removeSubrange(..<lineStart)
                }

                // A final line without a trailing newline.
                emit(Data(buffer))

                process.waitUntilExit()
                let errData = await stderrReader.value
                if process.terminationStatus != 0 {
                    let stderr = String(decoding: errData, as: UTF8.self)
                    continuation.finish(throwing: mapError(stderr, Int(process.terminationStatus)))
                } else {
                    continuation.finish()
                }
            } catch {
                continuation.finish(throwing: error)
            }
        }
    }
}
/// Gets metadata from a PDF.
///
/// Runs `pdftract extract --metadata-only --json` followed by the source
/// and option arguments, then decodes the JSON written to stdout.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - options: Base options.
/// - Returns: The document metadata.
/// - Throws: `PdftractError` if the process fails, or a decoding error if
///   the output is not valid `Metadata`.
public func getMetadata(
    _ source: Source,
    options: BaseOptions = BaseOptions()
) async throws -> Metadata {
    let sourceArguments = try source.toArgs()
    let optionArguments = options.toArgs()

    var arguments = ["extract", "--metadata-only", "--json"]
    arguments.append(contentsOf: sourceArguments)
    arguments.append(contentsOf: optionArguments)

    let rawOutput = try await exec(arguments)
    guard let payload = rawOutput.data(using: .utf8) else {
        throw PdftractError("Failed to decode output", -1)
    }

    let decoder = JSONDecoder()
    return try decoder.decode(Metadata.self, from: payload)
}
/// Computes a content hash fingerprint of a PDF.
///
/// Runs `pdftract hash --json` followed by the source and option
/// arguments, then decodes the JSON written to stdout.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - options: Hash options.
/// - Returns: The document fingerprint.
/// - Throws: `PdftractError` if the process fails, or a decoding error if
///   the output is not a valid `Fingerprint`.
public func hash(
    _ source: Source,
    options: HashOptions = HashOptions()
) async throws -> Fingerprint {
    let sourceArguments = try source.toArgs()
    let optionArguments = options.toArgs()

    var arguments = ["hash", "--json"]
    arguments.append(contentsOf: sourceArguments)
    arguments.append(contentsOf: optionArguments)

    let rawOutput = try await exec(arguments)
    guard let payload = rawOutput.data(using: .utf8) else {
        throw PdftractError("Failed to decode output", -1)
    }

    let decoder = JSONDecoder()
    return try decoder.decode(Fingerprint.self, from: payload)
}
/// Classifies a PDF document.
///
/// Runs `pdftract classify --json` followed by the source arguments, then
/// decodes the JSON written to stdout.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
/// - Returns: The classification result.
/// - Throws: `PdftractError` if the process fails, or a decoding error if
///   the output is not a valid `Classification`.
public func classify(
    _ source: Source
) async throws -> Classification {
    let sourceArguments = try source.toArgs()

    var arguments = ["classify", "--json"]
    arguments.append(contentsOf: sourceArguments)

    let rawOutput = try await exec(arguments)
    guard let payload = rawOutput.data(using: .utf8) else {
        throw PdftractError("Failed to decode output", -1)
    }

    let decoder = JSONDecoder()
    return try decoder.decode(Classification.self, from: payload)
}
/// Verifies a receipt.
///
/// Runs `pdftract verify-receipt <path> <receipt data>` and compares the
/// whitespace-trimmed stdout with the literal `"true"`.
///
/// - Parameters:
///   - path: Path to the PDF file.
///   - receipt: The receipt data to verify.
/// - Returns: `true` if the trimmed output is exactly `"true"`, `false` otherwise.
/// - Throws: `PdftractError` if the process fails (any non-zero exit status).
public func verifyReceipt(_ path: String, receipt: Receipt) async throws -> Bool {
    let arguments: [String] = ["verify-receipt", path, receipt.data]
    let rawOutput = try await exec(arguments)
    let verdict = rawOutput.trimmingCharacters(in: .whitespacesAndNewlines)
    return verdict == "true"
}
}