The indent trigger was using .abs(), which fired on both increased indent (non-indented → indented) and decreased indent (indented → non-indented). This caused drop-cap style paragraphs (indented first line, flush-left continuation) to be incorrectly split into two blocks.

Per plan Phase 4.4 heuristic #2, an indent change should only trigger when the current line is MORE indented (further to the right, larger x0) than the block average, i.e., when a new paragraph starts after non-indented text. It should NOT trigger for decreased indent (first line indented, rest flush-left).

Fix: remove .abs() and only check whether line_x0 - block_avg_x0 > threshold.

Tests:
- test_indented_first_line_new_block: PASS (non-indented → indented splits)
- test_indented_first_line_of_paragraph_not_split: PASS (drop cap stays together)
- All 179 line module tests: PASS
557 lines
18 KiB
Swift
557 lines
18 KiB
Swift
//
|
|
// This file is auto-generated. Do not edit manually.
|
|
//
|
|
|
|
#if os(Linux)
|
|
import Foundation
|
|
#else
|
|
import Foundation
|
|
#endif
|
|
|
|
/// Main Pdftract client for extracting data from PDFs.
|
|
/// Uses the bundled pdftract binary via Process spawning.
|
|
public struct Pdftract {
|
|
private let binaryPath: String
|
|
|
|
/// Creates a Pdftract client bound to a pdftract executable.
///
/// - Parameter binaryPath: Location of the pdftract binary. When `nil`, the
///   directories on `PATH` are searched; if that search finds nothing, the
///   bare name `"pdftract"` is stored instead.
public init(binaryPath: String? = nil) {
    // Precedence: explicit path, then the PATH lookup, then the bare command name.
    // `??` evaluates lazily, so the PATH scan only runs when no path was supplied.
    self.binaryPath = binaryPath ?? Self.findBinary() ?? "pdftract"
}
|
|
|
|
/// Locates the pdftract binary by scanning the directories listed in `PATH`.
///
/// - Returns: The path built from the first `PATH` entry that contains a file
///   named `pdftract`, or `nil` when no entry does (including when `PATH` is
///   unset or empty).
private static func findBinary() -> String? {
    // PATH entries are ':'-separated on Linux and Apple platforms alike; only
    // Windows uses ';'. Splitting on ';' everywhere except Linux meant the
    // lookup could never succeed on macOS.
    #if os(Windows)
    let separator: Character = ";"
    #else
    let separator: Character = ":"
    #endif

    let envPath = ProcessInfo.processInfo.environment["PATH"] ?? ""

    // `split` drops empty entries; `lazy` stops probing the file system at the first hit.
    return envPath
        .split(separator: separator)
        .lazy
        .map { NSString.path(withComponents: [String($0), "pdftract"]) }
        .first { FileManager.default.fileExists(atPath: $0) }
}
|
|
|
|
/// Runs the pdftract binary with the given arguments and returns its stdout.
///
/// Both output pipes are drained on background queues while the child runs.
/// Waiting for exit before reading (as this method previously did) deadlocks
/// once the child writes more than the pipe buffer holds, because the child
/// blocks on write while the parent blocks on exit.
///
/// - Parameter args: Command-line arguments to pass.
/// - Returns: The stdout output decoded as UTF-8 (empty if it is not valid UTF-8).
/// - Throws: `PdftractError` with code `-1` if the process cannot be launched,
///   or the error produced by `mapError` if it exits with a non-zero status.
private func exec(_ args: [String]) async throws -> String {
    let process = Process()
    process.executableURL = URL(fileURLWithPath: binaryPath)
    process.arguments = args

    let outPipe = Pipe()
    let errPipe = Pipe()
    process.standardOutput = outPipe
    process.standardError = errPipe

    do {
        try process.run()
    } catch {
        throw PdftractError("Failed to execute pdftract: \(error.localizedDescription)", -1)
    }

    // Drain stdout and stderr concurrently so neither pipe can fill up and
    // stall the child. Reference-typed buffers avoid mutating captured vars.
    let outHandle = outPipe.fileHandleForReading
    let errHandle = errPipe.fileHandleForReading
    let outBuffer = NSMutableData()
    let errBuffer = NSMutableData()
    let drained = DispatchGroup()
    DispatchQueue.global().async(group: drained) {
        outBuffer.append(outHandle.readDataToEndOfFile())
    }
    DispatchQueue.global().async(group: drained) {
        errBuffer.append(errHandle.readDataToEndOfFile())
    }

    // Suspend (rather than block the caller's thread) until both pipes hit
    // EOF and the child has been reaped.
    await withCheckedContinuation { (resumption: CheckedContinuation<Void, Never>) in
        drained.notify(queue: .global()) {
            process.waitUntilExit()
            resumption.resume()
        }
    }

    let output = String(data: Data(referencing: outBuffer), encoding: .utf8) ?? ""
    let stderr = String(data: Data(referencing: errBuffer), encoding: .utf8) ?? ""

    guard process.terminationStatus == 0 else {
        throw mapError(stderr, Int(process.terminationStatus))
    }
    return output
}
|
|
|
|
/// Maps a CLI exit code to the matching Swift error type.
///
/// Exit codes 2, 3, 4, 5, 6 and 10 map to `CorruptPdfError`,
/// `EncryptionError`, `SourceUnreachableError`, `RemoteFetchInterruptedError`,
/// `TlsError` and `ReceiptVerifyError` respectively; every other code yields a
/// plain `PdftractError`.
///
/// - Parameters:
///   - stderr: The stderr output from the process, used as the error message.
///   - exitCode: The exit code reported by the process.
/// - Returns: A `PdftractError` or one of its subclasses.
private func mapError(_ stderr: String, _ exitCode: Int) -> PdftractError {
    // `exitCode` is a non-optional Int, so no unwrapping is needed. The former
    // `guard let exitCode = exitCode` was a conditional binding on a
    // non-optional value, which does not compile.
    switch exitCode {
    case 2:
        return CorruptPdfError(stderr, exitCode)
    case 3:
        return EncryptionError(stderr, exitCode)
    case 4:
        return SourceUnreachableError(stderr, exitCode)
    case 5:
        return RemoteFetchInterruptedError(stderr, exitCode)
    case 6:
        return TlsError(stderr, exitCode)
    case 10:
        return ReceiptVerifyError(stderr, exitCode)
    default:
        return PdftractError(stderr, exitCode)
    }
}
|
|
|
|
|
|
|
|
/// Extracts the full structured representation of a PDF.
///
/// Invokes `pdftract extract --json` followed by the source and option
/// arguments, then decodes stdout as a `Document`.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - options: Extraction options.
/// - Returns: The complete document structure.
/// - Throws: `PdftractError` if the binary fails or its output cannot be
///   converted to UTF-8 data; the `JSONDecoder` error if the JSON does not
///   match `Document`; any error thrown by `source.toArgs()`.
public func extract(
    _ source: Source,
    options: ExtractOptions = ExtractOptions()
) async throws -> Document {
    var arguments = ["extract", "--json"]
    arguments += try source.toArgs()
    arguments += options.toArgs()

    let stdout = try await exec(arguments)
    guard let payload = stdout.data(using: .utf8) else {
        throw PdftractError("Failed to decode output", -1)
    }

    let decoder = JSONDecoder()
    return try decoder.decode(Document.self, from: payload)
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Extracts plain text from a PDF.
///
/// Invokes `pdftract extract` with the source and option arguments followed
/// by `--text --json`, decodes stdout as a `Document`, and rebuilds the text
/// from it: blocks within a page are joined with a newline, pages with a
/// blank line.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - options: Extraction options.
/// - Returns: The concatenated block text of every page.
/// - Throws: `PdftractError` if the binary fails or its output cannot be
///   decoded as a `Document`; any error thrown by `source.toArgs()`.
public func extractText(
    _ source: Source,
    options: ExtractOptions = ExtractOptions()
) async throws -> String {
    var arguments = ["extract"]
    arguments += try source.toArgs()
    arguments += options.toArgs()
    // NOTE(review): both an output-format flag and `--json` are passed, and the
    // result is decoded as JSON — confirm the CLI gives `--json` precedence.
    arguments += ["--text", "--json"]

    let stdout = try await exec(arguments)

    // Any decoding failure is reported as one generic error; the underlying
    // decoder error is intentionally not surfaced.
    guard
        let payload = stdout.data(using: .utf8),
        let document = try? JSONDecoder().decode(Document.self, from: payload)
    else {
        throw PdftractError("Failed to decode JSON output", -1)
    }

    // One string per page, then pages separated by a blank line.
    var pageTexts: [String] = []
    for page in document.pages {
        let blockTexts = page.blocks.map { $0.text }
        pageTexts.append(blockTexts.joined(separator: "\n"))
    }
    return pageTexts.joined(separator: "\n\n")
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Extracts Markdown-formatted text from a PDF.
///
/// Invokes `pdftract extract` with the source and option arguments followed
/// by `--md --json`, decodes stdout as a `Document`, and joins the text of
/// its blocks: a newline between blocks, a blank line between pages.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - options: Extraction options.
/// - Returns: The concatenated block text of every page.
/// - Throws: `PdftractError` if the binary fails or its output cannot be
///   decoded as a `Document`; any error thrown by `source.toArgs()`.
public func extractMarkdown(
    _ source: Source,
    options: ExtractOptions = ExtractOptions()
) async throws -> String {
    var arguments = ["extract"]
    arguments += try source.toArgs()
    arguments += options.toArgs()
    // NOTE(review): the result is rebuilt from each block's `text`, so it is
    // only Markdown if `--md` makes the CLI put Markdown into that field — confirm.
    arguments += ["--md", "--json"]

    let stdout = try await exec(arguments)

    // Any decoding failure is reported as one generic error; the underlying
    // decoder error is intentionally not surfaced.
    guard
        let payload = stdout.data(using: .utf8),
        let document = try? JSONDecoder().decode(Document.self, from: payload)
    else {
        throw PdftractError("Failed to decode JSON output", -1)
    }

    // One string per page, then pages separated by a blank line.
    var pageTexts: [String] = []
    for page in document.pages {
        let blockTexts = page.blocks.map { $0.text }
        pageTexts.append(blockTexts.joined(separator: "\n"))
    }
    return pageTexts.joined(separator: "\n\n")
}
|
|
|
|
|
|
|
|
|
|
/// Extracts pages from a PDF as an async stream.
///
/// Runs `pdftract extract --ndjson` with the source and option arguments and
/// decodes each newline-delimited line of stdout as a `Page`, yielding pages
/// as they arrive. Lines that are empty, not valid UTF-8, or not decodable as
/// a `Page` are skipped.
///
/// Errors are delivered through the stream rather than thrown by this method:
/// argument or launch failures finish the stream with that error, and a
/// non-zero exit finishes it with the error produced by `mapError`.
/// Terminating the stream early (for example by cancelling the consumer)
/// terminates the child process.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - options: Extraction options.
/// - Returns: An `AsyncThrowingStream` that yields `Page` values.
public func extractStream(
    _ source: Source,
    options: ExtractOptions = ExtractOptions()
) -> AsyncThrowingStream<Page, Error> {
    return AsyncThrowingStream { continuation in
        Task {
            var args = ["extract", "--ndjson"]
            do {
                args.append(contentsOf: try source.toArgs())
                args.append(contentsOf: options.toArgs())
            } catch {
                continuation.finish(throwing: error)
                return
            }

            let process = Process()
            process.executableURL = URL(fileURLWithPath: binaryPath)

            let outPipe = Pipe()
            let errPipe = Pipe()
            process.standardOutput = outPipe
            process.standardError = errPipe
            process.arguments = args

            // This handler also fires on a normal `finish`, including after a
            // failed launch. `terminate()` requires a launched process, so
            // only signal a child that is actually running.
            continuation.onTermination = { @Sendable _ in
                guard process.isRunning else { return }
                process.terminate()
                process.waitUntilExit()
            }

            do {
                try process.run()
            } catch {
                continuation.finish(throwing: error)
                return
            }

            // Drain stderr in the background so a chatty child cannot fill
            // the stderr pipe and stall while stdout is being read.
            let errHandle = errPipe.fileHandleForReading
            let errBuffer = NSMutableData()
            let stderrDrained = DispatchGroup()
            DispatchQueue.global().async(group: stderrDrained) {
                errBuffer.append(errHandle.readDataToEndOfFile())
            }

            let decoder = JSONDecoder()

            // Decodes one NDJSON line and yields it; malformed lines are
            // skipped on purpose (a real failure shows up as the exit status).
            func emit(_ lineData: Data) {
                guard let line = String(data: lineData, encoding: .utf8), !line.isEmpty else {
                    return
                }
                if let page = try? decoder.decode(Page.self, from: lineData) {
                    continuation.yield(page)
                }
            }

            // Read until EOF rather than until the process stops running:
            // the child may exit while unread output is still in the pipe.
            // `availableData` returns as soon as any bytes arrive, so pages
            // are delivered incrementally.
            let outHandle = outPipe.fileHandleForReading
            var buffer = [UInt8]()
            while true {
                let chunk = outHandle.availableData
                if chunk.isEmpty {
                    break
                }
                buffer.append(contentsOf: chunk)

                // Emit every complete line currently buffered.
                while let newlineIndex = buffer.firstIndex(of: 0x0A) {
                    emit(Data(buffer[..<newlineIndex]))
                    buffer.removeSubrange(0...newlineIndex)
                }
            }

            // A final line without a trailing newline.
            if !buffer.isEmpty {
                emit(Data(buffer))
            }

            // Wait (without blocking this task's thread) for stderr to drain.
            await withCheckedContinuation { (resumption: CheckedContinuation<Void, Never>) in
                stderrDrained.notify(queue: .global()) {
                    resumption.resume()
                }
            }
            process.waitUntilExit()

            if process.terminationStatus != 0 {
                let stderr = String(data: Data(referencing: errBuffer), encoding: .utf8) ?? ""
                continuation.finish(throwing: mapError(stderr, Int(process.terminationStatus)))
            } else {
                continuation.finish()
            }
        }
    }
}
|
|
|
|
|
|
|
|
|
|
/// Searches for text in a PDF.
///
/// Runs `pdftract grep <pattern>` with the source and option arguments and
/// decodes each newline-delimited line of stdout as a `Match`, yielding
/// matches as they arrive. Lines that are empty, not valid UTF-8, or not
/// decodable as a `Match` are skipped.
///
/// Errors are delivered through the stream rather than thrown by this method:
/// argument or launch failures finish the stream with that error, and a
/// non-zero exit finishes it with the error produced by `mapError`.
/// Terminating the stream early (for example by cancelling the consumer)
/// terminates the child process.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - pattern: The text pattern to search for.
///   - options: Search options.
/// - Returns: An `AsyncThrowingStream` that yields `Match` values.
public func search(
    _ source: Source,
    _ pattern: String,
    options: SearchOptions = SearchOptions()
) -> AsyncThrowingStream<Match, Error> {
    return AsyncThrowingStream { continuation in
        Task {
            var args = ["grep", pattern]
            do {
                args.append(contentsOf: try source.toArgs())
                args.append(contentsOf: options.toArgs())
            } catch {
                continuation.finish(throwing: error)
                return
            }

            let process = Process()
            process.executableURL = URL(fileURLWithPath: binaryPath)

            let outPipe = Pipe()
            let errPipe = Pipe()
            process.standardOutput = outPipe
            process.standardError = errPipe
            process.arguments = args

            // This handler also fires on a normal `finish`, including after a
            // failed launch. `terminate()` requires a launched process, so
            // only signal a child that is actually running.
            continuation.onTermination = { @Sendable _ in
                guard process.isRunning else { return }
                process.terminate()
                process.waitUntilExit()
            }

            do {
                try process.run()
            } catch {
                continuation.finish(throwing: error)
                return
            }

            // Drain stderr in the background so a chatty child cannot fill
            // the stderr pipe and stall while stdout is being read.
            let errHandle = errPipe.fileHandleForReading
            let errBuffer = NSMutableData()
            let stderrDrained = DispatchGroup()
            DispatchQueue.global().async(group: stderrDrained) {
                errBuffer.append(errHandle.readDataToEndOfFile())
            }

            let decoder = JSONDecoder()

            // Decodes one NDJSON line and yields it; malformed lines are
            // skipped on purpose (a real failure shows up as the exit status).
            func emit(_ lineData: Data) {
                guard let line = String(data: lineData, encoding: .utf8), !line.isEmpty else {
                    return
                }
                if let match = try? decoder.decode(Match.self, from: lineData) {
                    continuation.yield(match)
                }
            }

            // Read until EOF rather than until the process stops running:
            // the child may exit while unread output is still in the pipe.
            // `availableData` returns as soon as any bytes arrive, so matches
            // are delivered incrementally.
            let outHandle = outPipe.fileHandleForReading
            var buffer = [UInt8]()
            while true {
                let chunk = outHandle.availableData
                if chunk.isEmpty {
                    break
                }
                buffer.append(contentsOf: chunk)

                // Emit every complete line currently buffered.
                while let newlineIndex = buffer.firstIndex(of: 0x0A) {
                    emit(Data(buffer[..<newlineIndex]))
                    buffer.removeSubrange(0...newlineIndex)
                }
            }

            // A final line without a trailing newline.
            if !buffer.isEmpty {
                emit(Data(buffer))
            }

            // Wait (without blocking this task's thread) for stderr to drain.
            await withCheckedContinuation { (resumption: CheckedContinuation<Void, Never>) in
                stderrDrained.notify(queue: .global()) {
                    resumption.resume()
                }
            }
            process.waitUntilExit()

            if process.terminationStatus != 0 {
                let stderr = String(data: Data(referencing: errBuffer), encoding: .utf8) ?? ""
                continuation.finish(throwing: mapError(stderr, Int(process.terminationStatus)))
            } else {
                continuation.finish()
            }
        }
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Gets metadata from a PDF.
///
/// Invokes `pdftract extract --metadata-only --json` followed by the source
/// and option arguments, then decodes stdout as `Metadata`.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - options: Base options.
/// - Returns: The document metadata.
/// - Throws: `PdftractError` if the binary fails or its output cannot be
///   converted to UTF-8 data; the `JSONDecoder` error if the JSON does not
///   match `Metadata`; any error thrown by `source.toArgs()`.
public func getMetadata(
    _ source: Source,
    options: BaseOptions = BaseOptions()
) async throws -> Metadata {
    var arguments = ["extract", "--metadata-only", "--json"]
    arguments += try source.toArgs()
    arguments += options.toArgs()

    let stdout = try await exec(arguments)
    guard let payload = stdout.data(using: .utf8) else {
        throw PdftractError("Failed to decode output", -1)
    }

    let decoder = JSONDecoder()
    return try decoder.decode(Metadata.self, from: payload)
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Computes a content hash fingerprint of a PDF.
///
/// Invokes `pdftract hash --json` followed by the source and option
/// arguments, then decodes stdout as a `Fingerprint`.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
///   - options: Hash options.
/// - Returns: The document fingerprint.
/// - Throws: `PdftractError` if the binary fails or its output cannot be
///   converted to UTF-8 data; the `JSONDecoder` error if the JSON does not
///   match `Fingerprint`; any error thrown by `source.toArgs()`.
public func hash(
    _ source: Source,
    options: HashOptions = HashOptions()
) async throws -> Fingerprint {
    var arguments = ["hash", "--json"]
    arguments += try source.toArgs()
    arguments += options.toArgs()

    let stdout = try await exec(arguments)
    guard let payload = stdout.data(using: .utf8) else {
        throw PdftractError("Failed to decode output", -1)
    }

    let decoder = JSONDecoder()
    return try decoder.decode(Fingerprint.self, from: payload)
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Classifies a PDF document.
///
/// Invokes `pdftract classify --json` followed by the source arguments, then
/// decodes stdout as a `Classification`.
///
/// - Parameters:
///   - source: The PDF source (path, URL, or bytes).
/// - Returns: The classification result.
/// - Throws: `PdftractError` if the binary fails or its output cannot be
///   converted to UTF-8 data; the `JSONDecoder` error if the JSON does not
///   match `Classification`; any error thrown by `source.toArgs()`.
public func classify(
    _ source: Source
) async throws -> Classification {
    var arguments = ["classify", "--json"]
    arguments += try source.toArgs()

    let stdout = try await exec(arguments)
    guard let payload = stdout.data(using: .utf8) else {
        throw PdftractError("Failed to decode output", -1)
    }

    let decoder = JSONDecoder()
    return try decoder.decode(Classification.self, from: payload)
}
|
|
|
|
|
|
|
|
|
|
/// Verifies a receipt against a PDF file.
///
/// Invokes `pdftract verify-receipt <path> <receipt data>` and compares the
/// whitespace-trimmed stdout with the literal `"true"`.
///
/// - Parameters:
///   - path: Path to the PDF file.
///   - receipt: The receipt whose `data` is passed to the binary.
/// - Returns: `true` if the binary printed `true`, `false` for any other output.
/// - Throws: `PdftractError` if the binary cannot be run or exits with a
///   non-zero status.
public func verifyReceipt(_ path: String, receipt: Receipt) async throws -> Bool {
    let arguments: [String] = ["verify-receipt", path, receipt.data]
    let stdout = try await exec(arguments)
    let verdict = stdout.trimmingCharacters(in: .whitespacesAndNewlines)
    return verdict == "true"
}
|
|
|
|
|
|
|
|
}
|