Four research documents covering PDF spec fundamentals, font types and encoding, glyph Unicode recovery, and tagged PDF structure/reading order. SDK invocation notes with subprocess and HTTP examples for Python, Node.js, Go, Ruby, Java, Rust, and Bash. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
29 KiB
pdftract SDK Invocation Guide
How to invoke the pdftract binary from various languages, both via subprocess and via the HTTP server mode.
Binary Modes Reference
pdftract extract <file.pdf> # JSON to stdout
pdftract extract <file.pdf> --text # plain text to stdout
pdftract extract <file.pdf> --output out.json # JSON to file
pdftract serve --port 8080 # HTTP server: POST /extract → JSON
JSON Output Schema
{
"pages": [
{
"page": 1,
"spans": [
{
"text": "Hello world",
"bbox": [x0, y0, x1, y1],
"font": "Helvetica",
"size": 12.0,
"confidence": 0.98
}
],
"blocks": [
{
"kind": "paragraph",
"text": "Hello world",
"bbox": [x0, y0, x1, y1]
}
]
}
],
"metadata": {
"title": "...",
"author": "...",
"page_count": 10
}
}
1. Python
When to prefer subprocess: one-off scripts, CLI pipelines, or when starting the server is not worth the overhead. When to prefer HTTP: long-running services, parallel extraction across many files, or when sharing a single pdftract instance across multiple workers.
Subprocess
import subprocess
import json
import sys
def extract_pdf_subprocess(pdf_path: str) -> dict:
    """Extract text from a PDF via subprocess and return the parsed JSON result.

    Runs ``pdftract extract <pdf_path>`` and parses its standard output as JSON.

    Args:
        pdf_path: Path to the PDF, passed to pdftract as a single argument
            (no shell is involved, so no quoting is needed).

    Returns:
        The decoded JSON document.

    Raises:
        RuntimeError: If pdftract exits with a non-zero status. The message
            contains the exit code and the stripped stderr text.
        FileNotFoundError: If the ``pdftract`` executable is not on PATH
            (raised by ``subprocess.run``).
        json.JSONDecodeError: If stdout is not valid JSON.
    """
    # Capture raw bytes instead of using text=True: text mode decodes with the
    # locale's preferred encoding, which corrupts or rejects UTF-8 JSON on
    # systems whose locale is not UTF-8.
    result = subprocess.run(
        ["pdftract", "extract", pdf_path],
        capture_output=True,
        check=False,
    )
    if result.returncode != 0:
        # stderr is diagnostic only, so decode leniently rather than risk a
        # UnicodeDecodeError hiding the real failure.
        stderr_text = result.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"pdftract failed (exit {result.returncode}): {stderr_text}"
        )
    # json.loads accepts bytes and detects the UTF-8/16/32 encoding itself.
    return json.loads(result.stdout)
def full_text(data: dict) -> str:
    """Join the text of every block on every page with newlines.

    Blocks are emitted in the order they appear: page by page, then block by
    block within each page.
    """
    return "\n".join(
        block["text"] for page in data["pages"] for block in page["blocks"]
    )
def page_text(data: dict, page_number: int) -> str:
    """Return the newline-joined block text of one page.

    Pages are matched on their ``"page"`` field (1-indexed), not on their
    position in the list. Raises ValueError when no page matches.
    """
    wanted = next(
        (page for page in data["pages"] if page["page"] == page_number),
        None,
    )
    if wanted is None:
        raise ValueError(f"Page {page_number} not found")
    lines = [block["text"] for block in wanted["blocks"]]
    return "\n".join(lines)
if __name__ == "__main__":
    # Command-line demo: print metadata, the full text, then page 1 only.
    # Exit with a usage message rather than an IndexError traceback when the
    # PDF path is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <file.pdf>")
    pdf = sys.argv[1]
    data = extract_pdf_subprocess(pdf)
    print(f"Title : {data['metadata'].get('title', '(none)')}")
    print(f"Pages : {data['metadata']['page_count']}")
    print()
    print("--- Full text ---")
    print(full_text(data))
    print()
    print("--- Page 1 text ---")
    print(page_text(data, 1))
HTTP (requests / httpx)
# pip install requests
# pip install httpx # async alternative shown below
import requests
import json
PDFTRACT_URL = "http://localhost:8080"
def extract_pdf_http(pdf_path: str) -> dict:
    """POST a PDF file to pdftract serve and return the parsed JSON result.

    The file is uploaded as the multipart field ``file`` to
    ``{PDFTRACT_URL}/extract`` with a 60-second timeout.

    Args:
        pdf_path: Path of the PDF to upload.

    Returns:
        The JSON body of the response, decoded.

    Raises:
        OSError: If the file cannot be opened.
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    import os

    # Send only the base name: the server has no use for the client's
    # directory layout, and the other language examples do the same.
    filename = os.path.basename(pdf_path)
    with open(pdf_path, "rb") as f:
        response = requests.post(
            f"{PDFTRACT_URL}/extract",
            files={"file": (filename, f, "application/pdf")},
            timeout=60,
        )
    response.raise_for_status()
    return response.json()
def full_text(data: dict) -> str:
    """Concatenate all block text across every page, one block per line."""
    lines = [
        block["text"]
        for page in data["pages"]
        for block in page["blocks"]
    ]
    return "\n".join(lines)
def page_text(data: dict, page_number: int) -> str:
    """Return concatenated block text for a single page (1-indexed).

    Raises ValueError if no page carries the requested number.
    """
    matches = (page for page in data["pages"] if page["page"] == page_number)
    found = next(matches, None)
    if found is None:
        raise ValueError(f"Page {page_number} not found")
    return "\n".join([block["text"] for block in found["blocks"]])
# --- Async variant with httpx ---
import asyncio
import httpx
async def extract_pdf_async(pdf_path: str) -> dict:
    """POST a PDF file to pdftract serve asynchronously and return the JSON.

    Uses a short-lived ``httpx.AsyncClient`` with a 60-second timeout and
    uploads the file as the multipart field ``file``.

    Args:
        pdf_path: Path of the PDF to upload.

    Returns:
        The JSON body of the response, decoded.

    Raises:
        OSError: If the file cannot be opened.
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    import os

    # Send only the base name, matching the synchronous variant and the
    # other language examples.
    filename = os.path.basename(pdf_path)
    async with httpx.AsyncClient(timeout=60) as client:
        with open(pdf_path, "rb") as f:
            response = await client.post(
                f"{PDFTRACT_URL}/extract",
                files={"file": (filename, f, "application/pdf")},
            )
    response.raise_for_status()
    return response.json()
if __name__ == "__main__":
    # Command-line demo: run the same extraction through the synchronous and
    # then the asynchronous client, printing the full text each time.
    import sys

    # Exit with a usage message rather than an IndexError traceback when the
    # PDF path is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <file.pdf>")
    pdf = sys.argv[1]
    # Synchronous
    data = extract_pdf_http(pdf)
    print(full_text(data))
    # Asynchronous
    data = asyncio.run(extract_pdf_async(pdf))
    print(full_text(data))
2. Node.js / JavaScript
When to prefer subprocess: build scripts, one-off tooling, or serverless functions where spinning up a child process is acceptable. When to prefer HTTP: Express/Fastify services, or when pdftract is deployed as a sidecar or shared microservice.
Subprocess (child_process)
// Node.js 18+ (ESM)
import { execFile } from "node:child_process";
import { promisify } from "node:util";
const execFileAsync = promisify(execFile);
/**
* Extract text from a PDF via subprocess.
* @param {string} pdfPath
* @returns {Promise<object>} Parsed pdftract JSON
*/
async function extractPdfSubprocess(pdfPath) {
  // execFile buffers the child's stdout in memory and rejects once it
  // exceeds maxBuffer (1 MiB by default). Extraction JSON for a large PDF
  // easily passes that, so raise the ceiling explicitly.
  const MAX_OUTPUT_BYTES = 256 * 1024 * 1024;

  let stdout;
  try {
    ({ stdout } = await execFileAsync("pdftract", ["extract", pdfPath], {
      maxBuffer: MAX_OUTPUT_BYTES,
    }));
  } catch (err) {
    // Keep the original error as `cause` so spawn failures (e.g. ENOENT)
    // remain distinguishable from a non-zero exit.
    throw new Error(`pdftract failed (exit ${err.code}): ${err.stderr}`, {
      cause: err,
    });
  }
  // Parsed outside the try block so a JSON syntax error is not reported as
  // a pdftract failure.
  return JSON.parse(stdout);
}
/** Concatenate all block text across every page. */
function fullText(data) {
  // Walk pages, then blocks, in document order and collect each block's text.
  const lines = [];
  for (const page of data.pages) {
    for (const block of page.blocks) {
      lines.push(block.text);
    }
  }
  return lines.join("\n");
}
/** Return concatenated block text for a single page (1-indexed). */
function pageText(data, pageNumber) {
  // Pages are matched on their `page` field, not their array position.
  for (const page of data.pages) {
    if (page.page === pageNumber) {
      return page.blocks.map(({ text }) => text).join("\n");
    }
  }
  throw new Error(`Page ${pageNumber} not found`);
}
// Usage
// Extract the PDF named on the command line, then print its metadata,
// the full text, and page 1 on its own.
const data = await extractPdfSubprocess(process.argv[2]);
const { title, page_count: pageCount } = data.metadata;
console.log("Title :", title ?? "(none)");
console.log("Pages :", pageCount);
console.log("\n--- Full text ---");
console.log(fullText(data));
console.log("\n--- Page 1 ---");
console.log(pageText(data, 1));
HTTP (native fetch)
// Node.js 18+ — fetch is available globally; no extra dependencies required.
import { readFile } from "node:fs/promises";
const PDFTRACT_URL = "http://localhost:8080";
/**
* POST a PDF to pdftract serve.
* @param {string} pdfPath
* @returns {Promise<object>} Parsed pdftract JSON
*/
async function extractPdfHttp(pdfPath) {
  // Loaded on demand so this function does not depend on an extra
  // top-level import.
  const { basename } = await import("node:path");

  const bytes = await readFile(pdfPath);
  const form = new FormData();
  // Send only the base name: the server has no use for the client's
  // directory layout.
  form.append(
    "file",
    new Blob([bytes], { type: "application/pdf" }),
    basename(pdfPath),
  );

  const res = await fetch(`${PDFTRACT_URL}/extract`, {
    method: "POST",
    body: form,
    // fetch has no default timeout; abort after 60 s so a stalled server
    // cannot hang the caller indefinitely.
    signal: AbortSignal.timeout(60_000),
  });
  if (!res.ok) {
    const body = await res.text();
    throw new Error(`pdftract HTTP ${res.status}: ${body}`);
  }
  return res.json();
}
/** Concatenate all block text across every page, one block per line. */
function fullText(data) {
  const texts = data.pages.reduce(
    (acc, page) => acc.concat(page.blocks.map((block) => block.text)),
    [],
  );
  return texts.join("\n");
}
/** Return concatenated block text for a single page (1-indexed). */
function pageText(data, pageNumber) {
  const index = data.pages.findIndex((p) => p.page === pageNumber);
  if (index === -1) {
    throw new Error(`Page ${pageNumber} not found`);
  }
  const { blocks } = data.pages[index];
  return blocks.map((block) => block.text).join("\n");
}
// Usage
// Upload the PDF named on the command line and print its full text.
const [, , pdfPath] = process.argv;
const data = await extractPdfHttp(pdfPath);
console.log(fullText(data));
3. Go
When to prefer subprocess: CLI utilities or single-binary deployments where you want zero network overhead. When to prefer HTTP: Go services handling concurrent requests — spin up pdftract serve once and hit it from multiple goroutines.
Subprocess (os/exec)
package main
import (
"encoding/json"
"fmt"
"log"
"os"
"os/exec"
"strings"
)
// Span mirrors one entry of a page's "spans" array in the pdftract JSON:
// a piece of text together with its font name, font size, bounding box and
// a confidence score.
type Span struct {
	Text string `json:"text"`
	// BBox holds the four numbers of the JSON "bbox" array, in the order
	// [x0, y0, x1, y1].
	BBox       [4]float64 `json:"bbox"`
	Font       string     `json:"font"`
	Size       float64    `json:"size"`
	Confidence float64    `json:"confidence"`
}

// Block mirrors one entry of a page's "blocks" array: a typed unit of text
// (the schema example shows kind "paragraph") with its bounding box.
type Block struct {
	Kind string `json:"kind"`
	Text string `json:"text"`
	// BBox uses the same [x0, y0, x1, y1] order as Span.BBox.
	BBox [4]float64 `json:"bbox"`
}

// Page mirrors one entry of the top-level "pages" array.
type Page struct {
	// Page is the page number carried in the JSON; PageText matches on this
	// value rather than on the slice index.
	Page   int     `json:"page"`
	Spans  []Span  `json:"spans"`
	Blocks []Block `json:"blocks"`
}

// Metadata mirrors the top-level "metadata" object.
type Metadata struct {
	// Title and Author decode to "" when the key is absent from the JSON.
	Title     string `json:"title"`
	Author    string `json:"author"`
	PageCount int    `json:"page_count"`
}

// PDFTractResult is the root of the decoded pdftract JSON document.
type PDFTractResult struct {
	Pages    []Page   `json:"pages"`
	Metadata Metadata `json:"metadata"`
}
// extractSubprocess runs `pdftract extract <path>` and returns the parsed result.
func extractSubprocess(pdfPath string) (*PDFTractResult, error) {
	cmd := exec.Command("pdftract", "extract", pdfPath)

	// Output captures stdout; on a non-zero exit it also stores the child's
	// stderr in the returned *exec.ExitError.
	stdout, runErr := cmd.Output()
	if runErr != nil {
		exitErr, isExit := runErr.(*exec.ExitError)
		if !isExit {
			// The process could not be started or waited on at all.
			return nil, fmt.Errorf("exec error: %w", runErr)
		}
		return nil, fmt.Errorf("pdftract failed: %s", string(exitErr.Stderr))
	}

	parsed := new(PDFTractResult)
	if decodeErr := json.Unmarshal(stdout, parsed); decodeErr != nil {
		return nil, fmt.Errorf("json parse error: %w", decodeErr)
	}
	return parsed, nil
}
// FullText concatenates all block text across every page.
func (r *PDFTractResult) FullText() string {
	// Blocks are separated by a single newline with no trailing newline, the
	// same output the Python, Node.js, Ruby, Java and Rust examples produce.
	// Writing "\n" after every block would leave a stray blank line once the
	// caller prints the result with Println.
	var texts []string
	for _, page := range r.Pages {
		for _, block := range page.Blocks {
			texts = append(texts, block.Text)
		}
	}
	return strings.Join(texts, "\n")
}
// PageText returns concatenated block text for a single page (1-indexed).
func (r *PDFTractResult) PageText(pageNumber int) (string, error) {
	// Pages are matched on their Page field, not on their slice index.
	for _, page := range r.Pages {
		if page.Page != pageNumber {
			continue
		}
		// Join with single newlines and no trailing newline, the same
		// output the other language examples produce.
		texts := make([]string, 0, len(page.Blocks))
		for _, block := range page.Blocks {
			texts = append(texts, block.Text)
		}
		return strings.Join(texts, "\n"), nil
	}
	return "", fmt.Errorf("page %d not found", pageNumber)
}
// main extracts the PDF named on the command line and prints its metadata,
// the full text, and page 1 on its own.
func main() {
	if len(os.Args) < 2 {
		log.Fatal("usage: program <file.pdf>")
	}
	pdfPath := os.Args[1]

	doc, err := extractSubprocess(pdfPath)
	if err != nil {
		log.Fatalf("extraction failed: %v", err)
	}

	meta := doc.Metadata
	fmt.Printf("Title : %s\n", meta.Title)
	fmt.Printf("Pages : %d\n", meta.PageCount)
	fmt.Println("\n--- Full text ---")
	fmt.Println(doc.FullText())

	// A missing page 1 is logged but is not fatal.
	firstPage, pageErr := doc.PageText(1)
	if pageErr != nil {
		log.Printf("page 1: %v", pageErr)
		return
	}
	fmt.Println("--- Page 1 ---")
	fmt.Println(firstPage)
}
HTTP (net/http)
package main
import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"mime/multipart"
	"net/http"
	"os"
	"path/filepath"
	"time"
)
const pdftractURL = "http://localhost:8080"
// extractHTTP POSTs a PDF file to pdftract serve.
// The file is uploaded as the multipart field "file" under its base name,
// and the JSON response body is decoded into a PDFTractResult. Any status
// other than 200 is returned as an error that includes the response body.
func extractHTTP(pdfPath string) (*PDFTractResult, error) {
	f, err := os.Open(pdfPath)
	if err != nil {
		return nil, fmt.Errorf("open file: %w", err)
	}
	defer f.Close()

	var buf bytes.Buffer
	mw := multipart.NewWriter(&buf)
	part, err := mw.CreateFormFile("file", filepath.Base(pdfPath))
	if err != nil {
		return nil, fmt.Errorf("create form file: %w", err)
	}
	if _, err := io.Copy(part, f); err != nil {
		return nil, fmt.Errorf("copy file: %w", err)
	}
	// Close writes the terminating boundary; if it fails the body is
	// malformed, so the error must not be dropped.
	if err := mw.Close(); err != nil {
		return nil, fmt.Errorf("close multipart writer: %w", err)
	}

	// http.Post goes through http.DefaultClient, which never times out.
	// Use a client with a 60-second limit, the same limit the Python, Ruby,
	// Java and Rust examples apply.
	client := &http.Client{Timeout: 60 * time.Second}
	resp, err := client.Post(
		pdftractURL+"/extract",
		mw.FormDataContentType(),
		&buf,
	)
	if err != nil {
		return nil, fmt.Errorf("http post: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is only used to enrich the error message.
		body, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("pdftract HTTP %d: %s", resp.StatusCode, body)
	}

	var result PDFTractResult
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, fmt.Errorf("json decode: %w", err)
	}
	return &result, nil
}
// main uploads the PDF named on the command line and prints its full text.
func main() {
	if len(os.Args) < 2 {
		log.Fatal("usage: program <file.pdf>")
	}
	pdfPath := os.Args[1]

	doc, err := extractHTTP(pdfPath)
	if err != nil {
		log.Fatalf("extraction failed: %v", err)
	}
	fmt.Println(doc.FullText())
}
4. Ruby
When to prefer subprocess: Rake tasks, standalone scripts, or Rails background jobs without a persistent pdftract process. When to prefer HTTP: Sidekiq workers or Rails requests — keep pdftract serve running as a separate process and hit it over loopback.
Subprocess (Open3)
require "open3"
require "json"
# Extract text from a PDF via subprocess.
# Returns a Hash parsed from pdftract's JSON output.
def extract_pdf_subprocess(pdf_path)
  # capture3 runs the command without a shell and returns stdout, stderr
  # and the process status.
  out, err, status = Open3.capture3("pdftract", "extract", pdf_path)
  return JSON.parse(out) if status.success?

  raise "pdftract failed (exit #{status.exitstatus}): #{err.strip}"
end
# Concatenate all block text across every page.
def full_text(data)
  # Collect block text page by page, in document order.
  lines = []
  data["pages"].each do |page|
    page["blocks"].each { |block| lines << block["text"] }
  end
  lines.join("\n")
end
# Return concatenated block text for a single page (1-indexed).
def page_text(data, page_number)
  # Pages are matched on their "page" value, not their array position.
  data["pages"].each do |page|
    next unless page["page"] == page_number

    return page["blocks"].map { |block| block["text"] }.join("\n")
  end
  raise "Page #{page_number} not found"
end
# Usage
# Extract the PDF named on the command line, then print its metadata,
# the full text, and page 1 on its own.
pdf_path = ARGV.fetch(0) { raise("Usage: ruby extract.rb <file.pdf>") }
data = extract_pdf_subprocess(pdf_path)

title = data.dig("metadata", "title") || "(none)"
page_count = data.dig("metadata", "page_count")

puts "Title : #{title}"
puts "Pages : #{page_count}"
puts
puts "--- Full text ---"
puts full_text(data)
puts
puts "--- Page 1 ---"
puts page_text(data, 1)
HTTP (net/http)
require "net/http"
require "json"
PDFTRACT_URL = URI("http://localhost:8080/extract")
# POST a PDF file to pdftract serve.
def extract_pdf_http(pdf_path)
  # A random hex suffix keeps the multipart boundary distinct per request.
  boundary = "----pdftract#{rand(0xFFFFFF).to_s(16)}"

  request = Net::HTTP::Post.new(PDFTRACT_URL.path)
  request["Content-Type"] = "multipart/form-data; boundary=#{boundary}"
  request.body = build_multipart(pdf_path, boundary)

  http = Net::HTTP.new(PDFTRACT_URL.host, PDFTRACT_URL.port)
  http.read_timeout = 60
  response = http.request(request)

  return JSON.parse(response.body) if response.is_a?(Net::HTTPSuccess)

  raise "pdftract HTTP #{response.code}: #{response.body}"
end
# Build a multipart/form-data body holding one part, named "file", with the
# raw bytes of the PDF at +pdf_path+.
#
# Returns a binary (ASCII-8BIT) String ready to use as the request body.
# Raises a SystemCallError (e.g. Errno::ENOENT) if the file cannot be read.
def build_multipart(pdf_path, boundary)
  crlf = "\r\n"
  # A raw double quote or line break would terminate the quoted filename
  # parameter early, so percent-encode those characters.
  filename = File.basename(pdf_path)
                 .gsub('"', "%22")
                 .gsub("\r", "%0D")
                 .gsub("\n", "%0A")

  # Assemble everything as binary. Joining UTF-8 header strings with the
  # binary PDF bytes raises Encoding::CompatibilityError as soon as the
  # filename contains a non-ASCII character.
  body = String.new(encoding: Encoding::BINARY)
  body << "--#{boundary}#{crlf}".b
  body << "Content-Disposition: form-data; name=\"file\"; filename=\"#{filename}\"#{crlf}".b
  body << "Content-Type: application/pdf#{crlf}".b
  body << crlf.b
  body << File.binread(pdf_path)
  body << "#{crlf}--#{boundary}--#{crlf}".b
  body
end
# Concatenate all block text across every page, one block per line.
def full_text(data)
  texts = data["pages"].each_with_object([]) do |page, acc|
    page["blocks"].each { |block| acc << block["text"] }
  end
  texts.join("\n")
end
# Return concatenated block text for a single page (1-indexed).
def page_text(data, page_number)
  index = data["pages"].index { |candidate| candidate["page"] == page_number }
  raise "Page #{page_number} not found" if index.nil?

  blocks = data["pages"][index]["blocks"]
  blocks.map { |block| block["text"] }.join("\n")
end
# Usage
# Upload the PDF named on the command line and print its full text.
pdf_path = ARGV.fetch(0) { raise("Usage: ruby extract_http.rb <file.pdf>") }
data = extract_pdf_http(pdf_path)
puts full_text(data)
5. Java
When to prefer subprocess: batch jobs or standalone utilities. ProcessBuilder is simple and avoids a network stack. When to prefer HTTP: Spring Boot services or multi-threaded apps — pdftract serve handles concurrent requests, while subprocess creates a new process per call.
Requires Java 11+. The only external dependency is Jackson (jackson-databind), used for JSON parsing; process handling and HTTP use only the standard library.
Subprocess (ProcessBuilder)
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* Invokes pdftract via subprocess and parses the JSON result.
*
* Dependency (Maven):
* <dependency>
* <groupId>com.fasterxml.jackson.core</groupId>
* <artifactId>jackson-databind</artifactId>
* <version>2.17.0</version>
* </dependency>
*
* If you prefer no dependencies, replace ObjectMapper with org.json or
* a manual string parse — the structure is straightforward.
*/
public class PdftractSubprocess {
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Runs {@code pdftract extract <pdfPath>} and parses its stdout as JSON.
     *
     * @param pdfPath path of the PDF to extract
     * @return the root node of the JSON document printed by pdftract
     * @throws IOException if the process cannot be started, its output cannot
     *         be read or parsed, or it exits with a non-zero status (the
     *         message then carries the exit code and the stripped stderr text)
     * @throws InterruptedException if the thread is interrupted while waiting
     *         for the process to exit
     */
    public static JsonNode extract(String pdfPath) throws IOException, InterruptedException {
        ProcessBuilder pb = new ProcessBuilder("pdftract", "extract", pdfPath);
        pb.redirectErrorStream(false); // keep stderr separate
        Process process = pb.start();

        // Drain stderr on another thread while this thread drains stdout.
        // Reading the two pipes one after the other can deadlock: if the
        // child fills the stderr pipe buffer it blocks on write, never closes
        // stdout, and the stdout read below never reaches end of stream.
        java.util.concurrent.CompletableFuture<byte[]> stderrFuture =
            java.util.concurrent.CompletableFuture.supplyAsync(() -> {
                try {
                    return process.getErrorStream().readAllBytes();
                } catch (IOException e) {
                    throw new java.io.UncheckedIOException(e);
                }
            });

        byte[] stdout = process.getInputStream().readAllBytes();
        int exit = process.waitFor();

        byte[] stderr;
        try {
            stderr = stderrFuture.join();
        } catch (java.util.concurrent.CompletionException e) {
            throw new IOException("failed to read pdftract stderr", e.getCause());
        }

        if (exit != 0) {
            throw new IOException(
                "pdftract failed (exit " + exit + "): " + new String(stderr).strip()
            );
        }
        return MAPPER.readTree(stdout);
    }

    /** Concatenate all block text across every page, one block per line. */
    public static String fullText(JsonNode data) {
        List<String> parts = new ArrayList<>();
        for (JsonNode page : data.get("pages")) {
            for (JsonNode block : page.get("blocks")) {
                parts.add(block.get("text").asText());
            }
        }
        return String.join("\n", parts);
    }

    /**
     * Return concatenated block text for a single page (1-indexed).
     *
     * @throws IllegalArgumentException if no page carries {@code pageNumber}
     */
    public static String pageText(JsonNode data, int pageNumber) {
        for (JsonNode page : data.get("pages")) {
            if (page.get("page").asInt() == pageNumber) {
                List<String> parts = new ArrayList<>();
                for (JsonNode block : page.get("blocks")) {
                    parts.add(block.get("text").asText());
                }
                return String.join("\n", parts);
            }
        }
        throw new IllegalArgumentException("Page " + pageNumber + " not found");
    }

    /** Prints metadata, the full text, and page 1 of the PDF given as args[0]. */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: PdftractSubprocess <file.pdf>");
            System.exit(1);
        }
        JsonNode data = extract(args[0]);
        JsonNode meta = data.get("metadata");
        // path() yields a missing node rather than null, so asText falls back
        // to "(none)" when the title key is absent.
        System.out.println("Title : " + meta.path("title").asText("(none)"));
        System.out.println("Pages : " + meta.get("page_count").asInt());
        System.out.println("\n--- Full text ---");
        System.out.println(fullText(data));
        System.out.println("\n--- Page 1 ---");
        System.out.println(pageText(data, 1));
    }
}
HTTP (java.net.http.HttpClient, Java 11+)
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
/**
 * Uploads a PDF to a running {@code pdftract serve} instance and parses the
 * JSON response. The multipart body is assembled by hand, so Jackson is the
 * only dependency outside the standard library.
 */
public class PdftractHttp {
    private static final String PDFTRACT_URL = "http://localhost:8080";
    private static final ObjectMapper MAPPER = new ObjectMapper();
    private static final HttpClient CLIENT = HttpClient.newBuilder()
        .connectTimeout(Duration.ofSeconds(10))
        .build();

    /**
     * POSTs the PDF at {@code pdfPath} to {@code /extract} as the multipart
     * field {@code file}.
     *
     * @param pdfPath path of the PDF to upload
     * @return the root node of the JSON response
     * @throws IOException if the file cannot be read, the request fails, or
     *         the server answers with a status other than 200
     * @throws InterruptedException if the thread is interrupted while sending
     */
    public static JsonNode extract(String pdfPath) throws IOException, InterruptedException {
        Path path = Path.of(pdfPath);
        byte[] pdfBytes = Files.readAllBytes(path);
        // A raw double quote would terminate the quoted filename parameter
        // early, so percent-encode it.
        String filename = path.getFileName().toString().replace("\"", "%22");
        String boundary = UUID.randomUUID().toString().replace("-", "");

        // Build multipart/form-data body manually (no external library needed)
        String crlf = "\r\n";
        // Encode the headers as UTF-8 explicitly: getBytes() with no argument
        // uses the platform default charset, which before Java 18 may not be
        // UTF-8 and would garble a non-ASCII filename.
        java.nio.charset.Charset utf8 = java.nio.charset.StandardCharsets.UTF_8;
        byte[] partHeader = (
            "--" + boundary + crlf
            + "Content-Disposition: form-data; name=\"file\"; filename=\"" + filename + "\"" + crlf
            + "Content-Type: application/pdf" + crlf
            + crlf
        ).getBytes(utf8);
        byte[] partFooter = (crlf + "--" + boundary + "--" + crlf).getBytes(utf8);

        byte[] body = new byte[partHeader.length + pdfBytes.length + partFooter.length];
        System.arraycopy(partHeader, 0, body, 0, partHeader.length);
        System.arraycopy(pdfBytes, 0, body, partHeader.length, pdfBytes.length);
        System.arraycopy(partFooter, 0, body, partHeader.length + pdfBytes.length, partFooter.length);

        HttpRequest request = HttpRequest.newBuilder()
            .uri(URI.create(PDFTRACT_URL + "/extract"))
            .timeout(Duration.ofSeconds(60))
            .header("Content-Type", "multipart/form-data; boundary=" + boundary)
            .POST(HttpRequest.BodyPublishers.ofByteArray(body))
            .build();

        HttpResponse<String> response = CLIENT.send(
            request, HttpResponse.BodyHandlers.ofString()
        );
        if (response.statusCode() != 200) {
            throw new IOException(
                "pdftract HTTP " + response.statusCode() + ": " + response.body()
            );
        }
        return MAPPER.readTree(response.body());
    }

    /** Concatenate all block text across every page, one block per line. */
    public static String fullText(JsonNode data) {
        List<String> parts = new ArrayList<>();
        for (JsonNode page : data.get("pages")) {
            for (JsonNode block : page.get("blocks")) {
                parts.add(block.get("text").asText());
            }
        }
        return String.join("\n", parts);
    }

    /**
     * Return concatenated block text for a single page (1-indexed).
     *
     * @throws IllegalArgumentException if no page carries {@code pageNumber}
     */
    public static String pageText(JsonNode data, int pageNumber) {
        for (JsonNode page : data.get("pages")) {
            if (page.get("page").asInt() == pageNumber) {
                List<String> parts = new ArrayList<>();
                for (JsonNode block : page.get("blocks")) {
                    parts.add(block.get("text").asText());
                }
                return String.join("\n", parts);
            }
        }
        throw new IllegalArgumentException("Page " + pageNumber + " not found");
    }

    /** Uploads the PDF given as args[0] and prints its full text. */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: PdftractHttp <file.pdf>");
            System.exit(1);
        }
        JsonNode data = extract(args[0]);
        System.out.println(fullText(data));
    }
}
6. Rust
When to prefer subprocess: CLI tools or single-threaded batch processors — zero extra dependencies beyond serde and serde_json. When to prefer HTTP: async Tokio services — reqwest is non-blocking and naturally fits async Rust workloads.
Subprocess (std::process::Command)
Add to Cargo.toml:
[dependencies]
serde = { version = "1", features = ["derive"] }
serde_json = "1"
use serde::Deserialize;
use std::process::Command;
/// One entry of a page's `spans` array in the pdftract JSON: a piece of text
/// with its font name, font size, bounding box and confidence score.
/// Every field is required; deserialization fails if one is missing.
#[derive(Debug, Deserialize)]
struct Span {
    pub text: String,
    /// The four numbers of the JSON `bbox` array, in `[x0, y0, x1, y1]` order.
    pub bbox: [f64; 4],
    pub font: String,
    pub size: f64,
    pub confidence: f64,
}

/// One entry of a page's `blocks` array: a typed unit of text (the schema
/// example shows kind `"paragraph"`) with its bounding box.
#[derive(Debug, Deserialize)]
struct Block {
    pub kind: String,
    pub text: String,
    /// Same `[x0, y0, x1, y1]` order as `Span::bbox`.
    pub bbox: [f64; 4],
}

/// One entry of the top-level `pages` array.
#[derive(Debug, Deserialize)]
struct Page {
    /// Page number carried in the JSON; `page_text` matches on this value
    /// rather than on the vector index.
    pub page: u32,
    pub spans: Vec<Span>,
    pub blocks: Vec<Block>,
}

/// The top-level `metadata` object.
#[derive(Debug, Deserialize)]
struct Metadata {
    /// `None` when the key is absent or null in the JSON.
    pub title: Option<String>,
    /// `None` when the key is absent or null in the JSON.
    pub author: Option<String>,
    pub page_count: u32,
}

/// Root of the decoded pdftract JSON document.
#[derive(Debug, Deserialize)]
struct PdftractResult {
    pub pages: Vec<Page>,
    pub metadata: Metadata,
}
impl PdftractResult {
    /// Join the text of every block on every page with newlines, in
    /// document order.
    pub fn full_text(&self) -> String {
        let mut lines: Vec<&str> = Vec::new();
        for page in &self.pages {
            for block in &page.blocks {
                lines.push(block.text.as_str());
            }
        }
        lines.join("\n")
    }

    /// Newline-joined block text of the page whose `page` field equals
    /// `page_number` (1-indexed), or `None` when no page matches.
    pub fn page_text(&self, page_number: u32) -> Option<String> {
        let page = self.pages.iter().find(|p| p.page == page_number)?;
        let lines: Vec<&str> = page.blocks.iter().map(|b| b.text.as_str()).collect();
        Some(lines.join("\n"))
    }
}
/// Run `pdftract extract <pdf_path>` and deserialize its stdout.
///
/// Returns an error if the process cannot be spawned, exits unsuccessfully
/// (the message carries the exit code and trimmed stderr), or prints JSON
/// that does not match `PdftractResult`.
fn extract_subprocess(pdf_path: &str) -> Result<PdftractResult, Box<dyn std::error::Error>> {
    let mut cmd = Command::new("pdftract");
    cmd.arg("extract").arg(pdf_path);
    let output = cmd.output()?;

    if output.status.success() {
        return Ok(serde_json::from_slice(&output.stdout)?);
    }

    // stderr may contain arbitrary bytes; decode lossily for the message.
    let stderr = String::from_utf8_lossy(&output.stderr);
    let message = format!(
        "pdftract failed (exit {:?}): {}",
        output.status.code(),
        stderr.trim()
    );
    Err(message.into())
}
/// Extract the PDF named on the command line and print its metadata, the
/// full text, and page 1 when present.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let pdf_path = match std::env::args().nth(1) {
        Some(path) => path,
        None => return Err("usage: program <file.pdf>".into()),
    };

    let doc = extract_subprocess(&pdf_path)?;
    let title = doc.metadata.title.as_deref().unwrap_or("(none)");

    println!("Title : {}", title);
    println!("Pages : {}", doc.metadata.page_count);
    println!("\n--- Full text ---");
    println!("{}", doc.full_text());

    // A missing page 1 is skipped silently.
    match doc.page_text(1) {
        Some(text) => {
            println!("\n--- Page 1 ---");
            println!("{text}");
        }
        None => {}
    }
    Ok(())
}
HTTP (reqwest)
Add to Cargo.toml:
[dependencies]
serde = { version = "1", features = ["derive"] }
serde_json = "1"
reqwest = { version = "0.12", features = ["multipart"] }
tokio = { version = "1", features = ["full"] }
use reqwest::multipart;
use serde::Deserialize;
use std::path::Path;
// Re-use the same structs from the subprocess example above.
// (PdftractResult, Page, Block, Span, Metadata — copy them in)
const PDFTRACT_URL: &str = "http://localhost:8080";
/// POST the PDF at `pdf_path` to `{PDFTRACT_URL}/extract` as the multipart
/// field `file` and deserialize the JSON response.
///
/// The request times out after 60 seconds. A non-success status is returned
/// as an error containing the status and the response body.
async fn extract_http(pdf_path: &str) -> Result<PdftractResult, Box<dyn std::error::Error>> {
    let pdf_bytes = tokio::fs::read(pdf_path).await?;

    // Fall back to a generic name when the path has no UTF-8 file name.
    let upload_name = match Path::new(pdf_path).file_name().and_then(|n| n.to_str()) {
        Some(name) => name.to_owned(),
        None => "document.pdf".to_owned(),
    };

    let file_part = multipart::Part::bytes(pdf_bytes)
        .file_name(upload_name)
        .mime_str("application/pdf")?;
    let form = multipart::Form::new().part("file", file_part);

    let url = format!("{PDFTRACT_URL}/extract");
    let request = reqwest::Client::new()
        .post(url)
        .multipart(form)
        .timeout(std::time::Duration::from_secs(60));
    let response = request.send().await?;

    let status = response.status();
    if status.is_success() {
        let parsed: PdftractResult = response.json().await?;
        return Ok(parsed);
    }

    // The body only enriches the error message; use "" if it cannot be read.
    let body = response.text().await.unwrap_or_default();
    Err(format!("pdftract HTTP {status}: {body}").into())
}
/// Upload the PDF named on the command line and print its full text,
/// followed by page 1 when present.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let pdf_path = match std::env::args().nth(1) {
        Some(path) => path,
        None => return Err("usage: program <file.pdf>".into()),
    };

    let doc = extract_http(&pdf_path).await?;
    println!("{}", doc.full_text());

    match doc.page_text(1) {
        Some(text) => {
            println!("\n--- Page 1 ---");
            println!("{text}");
        }
        None => {}
    }
    Ok(())
}
7. Shell / Bash
When to prefer direct invocation: shell scripts, cron jobs, CI pipelines, or any context where you have direct access to the binary. When to prefer curl: when pdftract is running as a shared service on another host, inside a container, or when you want to avoid installing the binary locally.
Direct Invocation
#!/usr/bin/env bash
# Demonstrates the three `pdftract extract` output modes on one PDF:
# JSON on stdout (queried with jq), plain text via --text, and JSON written
# to a file via --output.
set -euo pipefail

PDF="${1:?Usage: $0 <file.pdf>}"

# --- JSON output ---
json=$(pdftract extract "$PDF")

# Run one jq filter (raw-string output) over the captured JSON document.
query() {
  jq -r "$1" <<<"$json"
}

# Full text via jq: collect all block text across all pages
full_text=$(query '[.pages[].blocks[].text] | join("\n")')

# Per-page text (page 1)
page1_text=$(query '.pages[] | select(.page == 1) | [.blocks[].text] | join("\n")')

# Metadata
title=$(query '.metadata.title // "(none)"')
pages=$(query '.metadata.page_count')

echo "Title : $title"
echo "Pages : $pages"
echo
echo "--- Full text ---"
echo "$full_text"
echo
echo "--- Page 1 ---"
echo "$page1_text"

# --- Plain text output (no jq needed) ---
plain=$(pdftract extract "$PDF" --text)
echo
echo "--- Plain text (--text flag) ---"
echo "$plain"

# --- Write JSON to file ---
out_json="/tmp/$(basename "$PDF" .pdf).json"
pdftract extract "$PDF" --output "$out_json"
echo "JSON written to $out_json"
curl (HTTP)
#!/usr/bin/env bash
# Uploads one PDF to a running `pdftract serve` instance with curl, prints
# metadata and text extracted with jq, and saves the raw JSON under /tmp.
set -euo pipefail

PDF="${1:?Usage: $0 <file.pdf>}"
PDFTRACT_URL="${PDFTRACT_URL:-http://localhost:8080}"

# POST the PDF and capture the response; fail fast on HTTP errors.
json=$(curl --silent --show-error --fail \
  --max-time 60 \
  -F "file=@${PDF};type=application/pdf" \
  "${PDFTRACT_URL}/extract")

# Run one jq filter (raw-string output) over the captured JSON document.
query() {
  jq -r "$1" <<<"$json"
}

# Full text via jq
full_text=$(query '[.pages[].blocks[].text] | join("\n")')

# Per-page text (page 1)
page1_text=$(query '.pages[] | select(.page == 1) | [.blocks[].text] | join("\n")')

# Metadata
title=$(query '.metadata.title // "(none)"')
pages=$(query '.metadata.page_count')

echo "Title : $title"
echo "Pages : $pages"
echo
echo "--- Full text ---"
echo "$full_text"
echo
echo "--- Page 1 ---"
echo "$page1_text"

# --- Save raw JSON ---
output_file="/tmp/$(basename "$PDF" .pdf).json"
echo "$json" > "$output_file"
echo "JSON saved to $output_file"

# --- Health check before submitting ---
# curl -sf "${PDFTRACT_URL}/health" > /dev/null \
#   || { echo "pdftract serve is not running at ${PDFTRACT_URL}"; exit 1; }
Batch processing with xargs / parallel
#!/usr/bin/env bash
# Process every PDF in a directory tree, writing one JSON file per PDF.
# Runs up to four extractions at a time via `xargs -P 4`.
# Prints "OK <file>" or "ERR <file>" per PDF and exits non-zero if any
# extraction failed (xargs reports 123 in that case).
# NOTE: output files are named after the PDF's base name, so two PDFs with
# the same name in different subdirectories write to the same JSON file.
set -euo pipefail

PDF_DIR="${1:?Usage: $0 <dir>}"
OUT_DIR="${2:-/tmp/pdftract-out}"
mkdir -p "$OUT_DIR"

# Extract a single PDF into $OUT_DIR; returns non-zero on failure so that
# xargs can propagate the failure instead of hiding it behind an "ERR" line.
extract_one() {
  local pdf="$1"
  local out
  out="$OUT_DIR/$(basename "$pdf" .pdf).json"
  if pdftract extract "$pdf" --output "$out"; then
    echo "OK $pdf"
  else
    echo "ERR $pdf"
    return 1
  fi
}
# The function and OUT_DIR must be exported to be visible in the child
# bash processes that xargs spawns.
export -f extract_one
export OUT_DIR

find "$PDF_DIR" -name "*.pdf" -print0 \
  | xargs -0 -P 4 -I{} bash -c 'extract_one "$@"' _ {}