pdftract/pdftract-ruby/lib/pdftract/client.rb
jedarden 246befd8d1 feat(pdftract-2m3gl): implement PHP SDK with Packagist publishing
- Add jedarden/pdftract Composer package (sdk/php/)
- Implement Client.php with proc_open subprocess execution
- Add PSR-3 LoggerInterface integration (defaults to NullLogger)
- Add 9 contract methods: extract, extractText, extractMarkdown, extractStream, search, getMetadata, hash, classify, verifyReceipt
- Add readonly model classes: Document, Page, Metadata, Fingerprint, Classification, Match, Receipt
- Add exception classes: PdftractException base + 8 subclasses
- Add PHPUnit conformance test suite
- Add phpunit.xml configuration
- Add composer.json with jedarden/pdftract package name
- Add .ci/argo-workflows/pdftract-php-publish.yaml (Packagist auto-discovery from git tags)

Also includes Ruby SDK scaffold from parallel workflow.

Closes pdftract-2m3gl
2026-06-01 10:27:03 -04:00

321 lines
9 KiB
Ruby

# frozen_string_literal: true
require 'open3'
require 'json'
require_relative 'errors'
require_relative 'source'
require_relative 'models'
module Pdftract
#
# Client is the main interface for invoking the pdftract CLI.
# All methods execute the pdftract binary as a subprocess and parse the output.
#
class Client
attr_reader :binary_path, :version
def initialize(binary_path = 'pdftract')
  # Remember which executable to run and set the fixed version string that
  # the `version` reader exposes.
  @binary_path, @version = binary_path, '1.0.0'
end
#
# Extract structured data from a PDF.
#
# @param source [String, Source] PDF source (file path or Source object)
# @param options [Hash] Extraction options (optional)
# @return [Document] Extracted document with pages and metadata
# @raise [Pdftract::Error] On subprocess error
#
def extract(source, options = nil)
  pdf = normalize_source(source)
  # Sub-command and output flag first, then whatever arguments the source supplies.
  command = ['extract', '--json'].push(*pdf.to_args)
  command.push(*options_to_args(options)) if options
  # stdout is parsed as JSON and converted into a Document model.
  parsed = JSON.parse(exec(*command))
  ModelConverter.from_hash(parsed, Document)
ensure
  # Runs on success and on failure; only sources that define #cleanup are asked to clean up.
  pdf.cleanup if pdf.respond_to?(:cleanup)
end
#
# Extract plain text from a PDF.
#
# @param source [String, Source] PDF source
# @param options [Hash] Extraction options (optional)
# @return [String] Plain text content
# @raise [Pdftract::Error] On subprocess error
#
def extract_text(source, options = nil)
  input = normalize_source(source)
  cli_args = ['extract', '--text']
  cli_args.push(*input.to_args)
  cli_args += options_to_args(options) if options
  # The command's stdout is returned as-is.
  exec(*cli_args)
ensure
  # Always runs; sources without a #cleanup method are left alone.
  input.cleanup if input.respond_to?(:cleanup)
end
#
# Extract Markdown-formatted text from a PDF.
#
# @param source [String, Source] PDF source
# @param options [Hash] Extraction options (optional)
# @return [String] Markdown formatted content
# @raise [Pdftract::Error] On subprocess error
#
def extract_markdown(source, options = nil)
  document = normalize_source(source)
  argv = ['extract', '--md', *document.to_args]
  # Option flags, when given, are appended after the source arguments.
  argv = argv + options_to_args(options) if options
  exec(*argv)
ensure
  # Cleanup is attempted whether or not the command succeeded.
  document.cleanup if document.respond_to?(:cleanup)
end
#
# Extract pages from a PDF as a stream.
#
# @param source [String, Source] PDF source
# @param options [Hash] Extraction options (optional)
# @return [Enumerator<Page>] Lazy iterator yielding Page objects
# @raise [Pdftract::Error] On subprocess error
#
def extract_stream(source, options = nil)
  src = normalize_source(source)
  args = ['extract', '--ndjson', *src.to_args]
  args.concat(options_to_args(options)) if options

  # The subprocess is started when the enumerator is iterated, not when this
  # method returns. Returning an enumerator from inside a popen3 block (as this
  # method used to) closes the pipes before the first line can be read, so the
  # whole process lifetime has to live inside the enumerator body instead.
  # Each full iteration spawns the command again; source cleanup runs when an
  # iteration finishes (normally, by error, or by the consumer breaking out).
  stream = Enumerator.new do |yielder|
    begin
      Open3.popen3(@binary_path, *args) do |stdin, stdout, stderr, wait_thr|
        stdin.close # nothing is written to the child's stdin
        # Drain stderr concurrently so a chatty child cannot fill the pipe and
        # stall while we are still reading stdout.
        stderr_reader = Thread.new { stderr.read }
        begin
          stdout.each_line do |line|
            next if line.strip.empty?

            yielder << ModelConverter.from_hash(JSON.parse(line), Page)
          end
          # Only reached once stdout hit EOF: now the exit status is meaningful.
          status = wait_thr.value
          raise map_error(stderr_reader.value, status.exitstatus) unless status.success?
        ensure
          # Stop the drain thread before popen3 closes the pipe underneath it
          # (matters when iteration is abandoned early).
          stderr_reader.kill
          stderr_reader.join
        end
      end
    ensure
      src.cleanup if src.respond_to?(:cleanup)
    end
  end
  cleanup_deferred = true
  stream
ensure
  # If building the command failed, the enumerator was never handed out, so
  # clean up right away; otherwise cleanup belongs to the enumerator above.
  src.cleanup if !cleanup_deferred && src.respond_to?(:cleanup)
end
#
# Search for text in a PDF.
#
# @param source [String, Source] PDF source
# @param pattern [String] Search pattern
# @param options [Hash] Search options (optional)
# @return [Enumerator<Match>] Lazy iterator yielding Match objects
# @raise [Pdftract::Error] On subprocess error
#
def search(source, pattern, options = nil)
  src = normalize_source(source)
  args = ['grep', pattern, *src.to_args]
  args.concat(options_to_args(options, search: true)) if options

  # The subprocess is started when the enumerator is iterated, not when this
  # method returns. Returning an enumerator from inside a popen3 block (as this
  # method used to) closes the pipes before the first line can be read, so the
  # whole process lifetime has to live inside the enumerator body instead.
  # Each full iteration spawns the command again; source cleanup runs when an
  # iteration finishes (normally, by error, or by the consumer breaking out).
  matches = Enumerator.new do |yielder|
    begin
      Open3.popen3(@binary_path, *args) do |stdin, stdout, stderr, wait_thr|
        stdin.close # nothing is written to the child's stdin
        # Drain stderr concurrently so a chatty child cannot fill the pipe and
        # stall while we are still reading stdout.
        stderr_reader = Thread.new { stderr.read }
        begin
          stdout.each_line do |line|
            next if line.strip.empty?

            yielder << ModelConverter.from_hash(JSON.parse(line), Match)
          end
          # Only reached once stdout hit EOF: now the exit status is meaningful.
          status = wait_thr.value
          raise map_error(stderr_reader.value, status.exitstatus) unless status.success?
        ensure
          # Stop the drain thread before popen3 closes the pipe underneath it
          # (matters when iteration is abandoned early).
          stderr_reader.kill
          stderr_reader.join
        end
      end
    ensure
      src.cleanup if src.respond_to?(:cleanup)
    end
  end
  cleanup_deferred = true
  matches
ensure
  # If building the command failed, the enumerator was never handed out, so
  # clean up right away; otherwise cleanup belongs to the enumerator above.
  src.cleanup if !cleanup_deferred && src.respond_to?(:cleanup)
end
#
# Get metadata from a PDF.
#
# @param source [String, Source] PDF source
# @param options [Hash] Options (optional)
# @return [Metadata] Document metadata
# @raise [Pdftract::Error] On subprocess error
#
def get_metadata(source, options = nil)
  pdf = normalize_source(source)
  command = ['extract', '--metadata-only']
  command.push(*pdf.to_args)
  command.concat(options_to_args(options)) if options
  # stdout is parsed as JSON and converted into a Metadata model.
  raw_json = exec(*command)
  ModelConverter.from_hash(JSON.parse(raw_json), Metadata)
ensure
  # Runs on every exit path; sources without #cleanup are skipped.
  pdf.cleanup if pdf.respond_to?(:cleanup)
end
#
# Compute hash fingerprint of a PDF.
#
# @param source [String, Source] PDF source
# @param options [Hash] Options (optional)
# @return [Fingerprint] Document fingerprint
# @raise [Pdftract::Error] On subprocess error
#
def hash(source = (source_omitted = true; nil), options = nil)
  # This method shadows Object#hash, which Ruby calls with no arguments
  # whenever an object is used as a Hash key, stored in a Set, passed through
  # Array#uniq, and so on. With a required parameter those calls raised
  # ArgumentError for every Client instance. A zero-argument call therefore
  # falls back to the inherited object hash; the default-value expression sets
  # source_omitted only when the caller really passed nothing, so an explicit
  # nil source still goes down the fingerprint path as before.
  return super() if source_omitted

  src = normalize_source(source)
  args = ['hash', *src.to_args]
  args.concat(options_to_args(options)) if options
  # stdout is parsed as JSON and converted into a Fingerprint model.
  output = exec(*args)
  ModelConverter.from_hash(JSON.parse(output), Fingerprint)
ensure
  # src is nil on the zero-argument path, and nil does not respond to #cleanup.
  src.cleanup if src.respond_to?(:cleanup)
end
#
# Classify a PDF document.
#
# @param source [String, Source] PDF source
# @return [Classification] Document classification
# @raise [Pdftract::Error] On subprocess error
#
def classify(source)
  pdf = normalize_source(source)
  # No option flags here: just the sub-command followed by the source arguments.
  raw_json = exec('classify', *pdf.to_args)
  parsed = JSON.parse(raw_json)
  ModelConverter.from_hash(parsed, Classification)
ensure
  # Runs on every exit path; sources without #cleanup are skipped.
  pdf.cleanup if pdf.respond_to?(:cleanup)
end
#
# Verify a receipt.
#
# @param pdf_path [String] Path to the PDF file
# @param receipt [String] Path to receipt JSON file, or inline receipt JSON
# @return [Boolean] True if receipt is valid, false otherwise
# @raise [Pdftract::Error] On subprocess error (except verification failures)
#
def verify_receipt(pdf_path, receipt)
  # A receipt string that names an existing file is passed as a path; anything
  # else is treated as inline JSON and handed over via --inline.
  args = File.exist?(receipt) ? [pdf_path, receipt] : ['--inline', receipt, pdf_path]
  _stdout, stderr, status = Open3.capture3(@binary_path, 'verify-receipt', *args)
  # Exit code 0 means verification succeeded.
  return true if status.success?

  # Previously every non-zero exit was reported as `false`, so a corrupt PDF or
  # an unreachable source looked like "receipt invalid". Only a genuine
  # verification failure should yield false; everything else is raised.
  # NOTE(review): "verification failure" is taken to be whatever exit code
  # map_error turns into ReceiptVerifyError - confirm against the CLI contract.
  error = map_error(stderr, status.exitstatus)
  return false if error.is_a?(ReceiptVerifyError)

  raise error
end
private
#
# Execute the pdftract binary and return stdout.
#
def exec(*args)
  # Within this class the name shadows Kernel#exec: the binary is run as a
  # child process via Open3 and the current process is not replaced.
  out, err, result = Open3.capture3(@binary_path, *args)
  return out if result.success?

  # Non-zero exit: raise the error object map_error builds from stderr and the exit code.
  raise map_error(err, result.exitstatus)
end
#
# Map exit codes to specific error types.
#
def map_error(stderr, exit_code)
  # Whitespace-only stderr counts as "no message".
  trimmed = stderr.strip
  msg = trimmed unless trimmed.empty?

  # Exit codes with a dedicated error class.
  specific =
    case exit_code
    when 2 then CorruptPdfError
    when 3 then EncryptionError
    when 4 then SourceUnreachableError
    when 5 then RemoteFetchInterruptedError
    when 6 then TlsError
    when 10 then ReceiptVerifyError
    end
  return specific.new(msg, exit_code, stderr) if specific

  # Anything else becomes a generic Error, with a fallback text when stderr was blank.
  Error.new(msg || "Unknown error (exit #{exit_code})", exit_code, stderr)
end
#
# Normalize source argument to a Source object.
#
def normalize_source(source)
  case source
  when Source
    # Already a Source object: hand it back untouched.
    source
  when String
    # Strings with an http:// or https:// prefix are URLs; other strings are paths.
    source.start_with?('http://', 'https://') ? URLSource.new(source) : PathSource.new(source)
  else
    # Anything else is passed to PathSource as-is.
    PathSource.new(source)
  end
end
#
# Convert options hash to CLI arguments.
#
def options_to_args(options, search: false)
  return [] unless options

  options.each_with_object([]) do |(name, setting), argv|
    # Keys may be camelCase or snake_case; flags are emitted in kebab-case.
    flag = "--#{camel_to_snake(name).to_s.tr('_', '-')}"
    case setting
    when nil, false, Array, Hash
      # nil/false mean "not set"; Array and Hash values are skipped for now.
      next
    when true
      # Boolean switch: the bare flag, no value.
      argv << flag
    else
      argv << "#{flag}=#{setting}"
    end
  end
end
#
# Convert camelCase or PascalCase to snake_case.
#
def camel_to_snake(str)
  text = str.to_s
  # Pass 1: split a run of capitals from a following capitalised word
  # ("OCRLanguage" -> "OCR_Language").
  text = text.gsub(/([A-Z]+)([A-Z][a-z])/) do
    "#{Regexp.last_match(1)}_#{Regexp.last_match(2)}"
  end
  # Pass 2: split a lowercase letter or digit from the capital that follows it
  # ("maxPages" -> "max_Pages").
  text = text.gsub(/([a-z\d])([A-Z])/) do
    "#{Regexp.last_match(1)}_#{Regexp.last_match(2)}"
  end
  text.downcase
end
end
end