From c2870e66404096c3ca741d1e48b767171fdb4ad2 Mon Sep 17 00:00:00 2001 From: jedarden Date: Sat, 16 May 2026 14:33:34 -0400 Subject: [PATCH] Add research docs and SDK invocation notes Four research documents covering PDF spec fundamentals, font types and encoding, glyph Unicode recovery, and tagged PDF structure/reading order. SDK invocation notes with subprocess and HTTP examples for Python, Node.js, Go, Ruby, Java, Rust, and Bash. Co-Authored-By: Claude Sonnet 4.6 --- docs/notes/sdk-invocation.md | 1100 +++++++++++++++++ .../glyph-recognition-and-unicode-recovery.md | 112 ++ docs/research/pdf-fonts-and-encoding.md | 300 +++++ docs/research/pdf-specification.md | 271 ++++ .../tagged-pdf-structure-and-reading-order.md | 253 ++++ 5 files changed, 2036 insertions(+) create mode 100644 docs/notes/sdk-invocation.md create mode 100644 docs/research/glyph-recognition-and-unicode-recovery.md create mode 100644 docs/research/pdf-fonts-and-encoding.md create mode 100644 docs/research/pdf-specification.md create mode 100644 docs/research/tagged-pdf-structure-and-reading-order.md diff --git a/docs/notes/sdk-invocation.md b/docs/notes/sdk-invocation.md new file mode 100644 index 0000000..c58cf7e --- /dev/null +++ b/docs/notes/sdk-invocation.md @@ -0,0 +1,1100 @@ +# pdftract SDK Invocation Guide + +How to invoke the `pdftract` binary from various languages, both via subprocess and via the HTTP server mode. 
+ +## Binary Modes Reference + +``` +pdftract extract # JSON to stdout +pdftract extract --text # plain text to stdout +pdftract extract --output out.json # JSON to file +pdftract serve --port 8080 # HTTP server: POST /extract → JSON +``` + +## JSON Output Schema + +```json +{ + "pages": [ + { + "page": 1, + "spans": [ + { + "text": "Hello world", + "bbox": [x0, y0, x1, y1], + "font": "Helvetica", + "size": 12.0, + "confidence": 0.98 + } + ], + "blocks": [ + { + "kind": "paragraph", + "text": "Hello world", + "bbox": [x0, y0, x1, y1] + } + ] + } + ], + "metadata": { + "title": "...", + "author": "...", + "page_count": 10 + } +} +``` + +--- + +## 1. Python + +> **When to prefer subprocess:** one-off scripts, CLI pipelines, or when starting the server is not worth the overhead. +> **When to prefer HTTP:** long-running services, parallel extraction across many files, or when sharing a single pdftract instance across multiple workers. + +### Subprocess + +```python +import subprocess +import json +import sys + + +def extract_pdf_subprocess(pdf_path: str) -> dict: + """Extract text from a PDF via subprocess and return the parsed JSON result.""" + result = subprocess.run( + ["pdftract", "extract", pdf_path], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError( + f"pdftract failed (exit {result.returncode}): {result.stderr.strip()}" + ) + return json.loads(result.stdout) + + +def full_text(data: dict) -> str: + """Concatenate all block text across every page.""" + parts = [] + for page in data["pages"]: + for block in page["blocks"]: + parts.append(block["text"]) + return "\n".join(parts) + + +def page_text(data: dict, page_number: int) -> str: + """Return concatenated block text for a single page (1-indexed).""" + for page in data["pages"]: + if page["page"] == page_number: + return "\n".join(block["text"] for block in page["blocks"]) + raise ValueError(f"Page {page_number} not found") + + +if __name__ == "__main__": + pdf = 
sys.argv[1] + data = extract_pdf_subprocess(pdf) + + print(f"Title : {data['metadata'].get('title', '(none)')}") + print(f"Pages : {data['metadata']['page_count']}") + print() + print("--- Full text ---") + print(full_text(data)) + print() + print("--- Page 1 text ---") + print(page_text(data, 1)) +``` + +### HTTP (requests / httpx) + +```python +# pip install requests +# pip install httpx # async alternative shown below + +import requests +import json + + +PDFTRACT_URL = "http://localhost:8080" + + +def extract_pdf_http(pdf_path: str) -> dict: + """POST a PDF file to pdftract serve and return the parsed JSON result.""" + with open(pdf_path, "rb") as f: + response = requests.post( + f"{PDFTRACT_URL}/extract", + files={"file": (pdf_path, f, "application/pdf")}, + timeout=60, + ) + response.raise_for_status() + return response.json() + + +def full_text(data: dict) -> str: + parts = [] + for page in data["pages"]: + for block in page["blocks"]: + parts.append(block["text"]) + return "\n".join(parts) + + +def page_text(data: dict, page_number: int) -> str: + for page in data["pages"]: + if page["page"] == page_number: + return "\n".join(block["text"] for block in page["blocks"]) + raise ValueError(f"Page {page_number} not found") + + +# --- Async variant with httpx --- +import asyncio +import httpx + + +async def extract_pdf_async(pdf_path: str) -> dict: + async with httpx.AsyncClient(timeout=60) as client: + with open(pdf_path, "rb") as f: + response = await client.post( + f"{PDFTRACT_URL}/extract", + files={"file": (pdf_path, f, "application/pdf")}, + ) + response.raise_for_status() + return response.json() + + +if __name__ == "__main__": + import sys + + pdf = sys.argv[1] + + # Synchronous + data = extract_pdf_http(pdf) + print(full_text(data)) + + # Asynchronous + data = asyncio.run(extract_pdf_async(pdf)) + print(full_text(data)) +``` + +--- + +## 2. 
Node.js / JavaScript + +> **When to prefer subprocess:** build scripts, one-off tooling, or serverless functions where spinning up a child process is acceptable. +> **When to prefer HTTP:** Express/Fastify services, or when pdftract is deployed as a sidecar or shared microservice. + +### Subprocess (child_process) + +```js +// Node.js 18+ (ESM) +import { execFile } from "node:child_process"; +import { promisify } from "node:util"; + +const execFileAsync = promisify(execFile); + +/** + * Extract text from a PDF via subprocess. + * @param {string} pdfPath + * @returns {Promise} Parsed pdftract JSON + */ +async function extractPdfSubprocess(pdfPath) { + const { stdout, stderr } = await execFileAsync("pdftract", [ + "extract", + pdfPath, + ]).catch((err) => { + throw new Error(`pdftract failed (exit ${err.code}): ${err.stderr}`); + }); + + return JSON.parse(stdout); +} + +/** Concatenate all block text across every page. */ +function fullText(data) { + return data.pages + .flatMap((page) => page.blocks.map((b) => b.text)) + .join("\n"); +} + +/** Return concatenated block text for a single page (1-indexed). */ +function pageText(data, pageNumber) { + const page = data.pages.find((p) => p.page === pageNumber); + if (!page) throw new Error(`Page ${pageNumber} not found`); + return page.blocks.map((b) => b.text).join("\n"); +} + +// Usage +const data = await extractPdfSubprocess(process.argv[2]); +console.log("Title :", data.metadata.title ?? "(none)"); +console.log("Pages :", data.metadata.page_count); +console.log("\n--- Full text ---"); +console.log(fullText(data)); +console.log("\n--- Page 1 ---"); +console.log(pageText(data, 1)); +``` + +### HTTP (native fetch) + +```js +// Node.js 18+ — fetch is available globally; no extra dependencies required. +import { readFile } from "node:fs/promises"; + +const PDFTRACT_URL = "http://localhost:8080"; + +/** + * POST a PDF to pdftract serve. 
+ * @param {string} pdfPath + * @returns {Promise} Parsed pdftract JSON + */ +async function extractPdfHttp(pdfPath) { + const bytes = await readFile(pdfPath); + const blob = new Blob([bytes], { type: "application/pdf" }); + + const form = new FormData(); + form.append("file", blob, pdfPath); + + const res = await fetch(`${PDFTRACT_URL}/extract`, { + method: "POST", + body: form, + }); + + if (!res.ok) { + const body = await res.text(); + throw new Error(`pdftract HTTP ${res.status}: ${body}`); + } + + return res.json(); +} + +function fullText(data) { + return data.pages + .flatMap((page) => page.blocks.map((b) => b.text)) + .join("\n"); +} + +function pageText(data, pageNumber) { + const page = data.pages.find((p) => p.page === pageNumber); + if (!page) throw new Error(`Page ${pageNumber} not found`); + return page.blocks.map((b) => b.text).join("\n"); +} + +// Usage +const data = await extractPdfHttp(process.argv[2]); +console.log(fullText(data)); +``` + +--- + +## 3. Go + +> **When to prefer subprocess:** CLI utilities or single-binary deployments where you want zero network overhead. +> **When to prefer HTTP:** Go services handling concurrent requests — spin up pdftract serve once and hit it from multiple goroutines. 
+ +### Subprocess (os/exec) + +```go +package main + +import ( + "encoding/json" + "fmt" + "log" + "os" + "os/exec" + "strings" +) + +type Span struct { + Text string `json:"text"` + BBox [4]float64 `json:"bbox"` + Font string `json:"font"` + Size float64 `json:"size"` + Confidence float64 `json:"confidence"` +} + +type Block struct { + Kind string `json:"kind"` + Text string `json:"text"` + BBox [4]float64 `json:"bbox"` +} + +type Page struct { + Page int `json:"page"` + Spans []Span `json:"spans"` + Blocks []Block `json:"blocks"` +} + +type Metadata struct { + Title string `json:"title"` + Author string `json:"author"` + PageCount int `json:"page_count"` +} + +type PDFTractResult struct { + Pages []Page `json:"pages"` + Metadata Metadata `json:"metadata"` +} + +// extractSubprocess runs `pdftract extract ` and returns the parsed result. +func extractSubprocess(pdfPath string) (*PDFTractResult, error) { + out, err := exec.Command("pdftract", "extract", pdfPath).Output() + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + return nil, fmt.Errorf("pdftract failed: %s", string(exitErr.Stderr)) + } + return nil, fmt.Errorf("exec error: %w", err) + } + + var result PDFTractResult + if err := json.Unmarshal(out, &result); err != nil { + return nil, fmt.Errorf("json parse error: %w", err) + } + return &result, nil +} + +// FullText concatenates all block text across every page. +func (r *PDFTractResult) FullText() string { + var sb strings.Builder + for _, page := range r.Pages { + for _, block := range page.Blocks { + sb.WriteString(block.Text) + sb.WriteByte('\n') + } + } + return sb.String() +} + +// PageText returns concatenated block text for a single page (1-indexed). 
+func (r *PDFTractResult) PageText(pageNumber int) (string, error) { + for _, page := range r.Pages { + if page.Page == pageNumber { + var sb strings.Builder + for _, block := range page.Blocks { + sb.WriteString(block.Text) + sb.WriteByte('\n') + } + return sb.String(), nil + } + } + return "", fmt.Errorf("page %d not found", pageNumber) +} + +func main() { + if len(os.Args) < 2 { + log.Fatal("usage: program ") + } + + result, err := extractSubprocess(os.Args[1]) + if err != nil { + log.Fatalf("extraction failed: %v", err) + } + + fmt.Printf("Title : %s\n", result.Metadata.Title) + fmt.Printf("Pages : %d\n", result.Metadata.PageCount) + fmt.Println("\n--- Full text ---") + fmt.Println(result.FullText()) + + p1, err := result.PageText(1) + if err != nil { + log.Printf("page 1: %v", err) + } else { + fmt.Println("--- Page 1 ---") + fmt.Println(p1) + } +} +``` + +### HTTP (net/http) + +```go +package main + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "log" + "mime/multipart" + "net/http" + "os" + "path/filepath" +) + +const pdftractURL = "http://localhost:8080" + +// extractHTTP POSTs a PDF file to pdftract serve. 
+func extractHTTP(pdfPath string) (*PDFTractResult, error) { + f, err := os.Open(pdfPath) + if err != nil { + return nil, fmt.Errorf("open file: %w", err) + } + defer f.Close() + + var buf bytes.Buffer + mw := multipart.NewWriter(&buf) + + part, err := mw.CreateFormFile("file", filepath.Base(pdfPath)) + if err != nil { + return nil, fmt.Errorf("create form file: %w", err) + } + if _, err := io.Copy(part, f); err != nil { + return nil, fmt.Errorf("copy file: %w", err) + } + mw.Close() + + resp, err := http.Post( + pdftractURL+"/extract", + mw.FormDataContentType(), + &buf, + ) + if err != nil { + return nil, fmt.Errorf("http post: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("pdftract HTTP %d: %s", resp.StatusCode, body) + } + + var result PDFTractResult + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return nil, fmt.Errorf("json decode: %w", err) + } + return &result, nil +} + +func main() { + if len(os.Args) < 2 { + log.Fatal("usage: program ") + } + + result, err := extractHTTP(os.Args[1]) + if err != nil { + log.Fatalf("extraction failed: %v", err) + } + + fmt.Println(result.FullText()) +} +``` + +--- + +## 4. Ruby + +> **When to prefer subprocess:** Rake tasks, standalone scripts, or Rails background jobs without a persistent pdftract process. +> **When to prefer HTTP:** Sidekiq workers or Rails requests — keep pdftract serve running as a separate process and hit it over loopback. + +### Subprocess (Open3) + +```ruby +require "open3" +require "json" + +# Extract text from a PDF via subprocess. +# Returns a Hash parsed from pdftract's JSON output. +def extract_pdf_subprocess(pdf_path) + stdout, stderr, status = Open3.capture3("pdftract", "extract", pdf_path) + + unless status.success? + raise "pdftract failed (exit #{status.exitstatus}): #{stderr.strip}" + end + + JSON.parse(stdout) +end + +# Concatenate all block text across every page. 
+def full_text(data) + data["pages"] + .flat_map { |page| page["blocks"].map { |b| b["text"] } } + .join("\n") +end + +# Return concatenated block text for a single page (1-indexed). +def page_text(data, page_number) + page = data["pages"].find { |p| p["page"] == page_number } + raise "Page #{page_number} not found" unless page + + page["blocks"].map { |b| b["text"] }.join("\n") +end + +# Usage +pdf_path = ARGV[0] || raise("Usage: ruby extract.rb ") +data = extract_pdf_subprocess(pdf_path) + +puts "Title : #{data.dig("metadata", "title") || "(none)"}" +puts "Pages : #{data.dig("metadata", "page_count")}" +puts +puts "--- Full text ---" +puts full_text(data) +puts +puts "--- Page 1 ---" +puts page_text(data, 1) +``` + +### HTTP (net/http) + +```ruby +require "net/http" +require "json" + +PDFTRACT_URL = URI("http://localhost:8080/extract") + +# POST a PDF file to pdftract serve. +def extract_pdf_http(pdf_path) + boundary = "----pdftract#{rand(0xFFFFFF).to_s(16)}" + body = build_multipart(pdf_path, boundary) + + http = Net::HTTP.new(PDFTRACT_URL.host, PDFTRACT_URL.port) + http.read_timeout = 60 + + request = Net::HTTP::Post.new(PDFTRACT_URL.path) + request["Content-Type"] = "multipart/form-data; boundary=#{boundary}" + request.body = body + + response = http.request(request) + raise "pdftract HTTP #{response.code}: #{response.body}" unless response.is_a?(Net::HTTPSuccess) + + JSON.parse(response.body) +end + +def build_multipart(pdf_path, boundary) + crlf = "\r\n" + pdf_data = File.binread(pdf_path) + filename = File.basename(pdf_path) + + [ + "--#{boundary}#{crlf}", + "Content-Disposition: form-data; name=\"file\"; filename=\"#{filename}\"#{crlf}", + "Content-Type: application/pdf#{crlf}", + crlf, + pdf_data, + "#{crlf}--#{boundary}--#{crlf}", + ].join +end + +def full_text(data) + data["pages"] + .flat_map { |page| page["blocks"].map { |b| b["text"] } } + .join("\n") +end + +def page_text(data, page_number) + page = data["pages"].find { |p| p["page"] == page_number 
} + raise "Page #{page_number} not found" unless page + + page["blocks"].map { |b| b["text"] }.join("\n") +end + +# Usage +pdf_path = ARGV[0] || raise("Usage: ruby extract_http.rb <file.pdf>") +data = extract_pdf_http(pdf_path) + +puts full_text(data) +``` + +--- + +## 5. Java + +> **When to prefer subprocess:** batch jobs or standalone utilities. ProcessBuilder is simple and avoids a network stack. +> **When to prefer HTTP:** Spring Boot services or multi-threaded apps — pdftract serve handles concurrent requests, while subprocess creates a new process per call. + +Requires Java 11+. The only external dependency is Jackson (`jackson-databind`) for JSON parsing; process handling and HTTP use only the standard library. + +### Subprocess (ProcessBuilder) + +```java +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Invokes pdftract via subprocess and parses the JSON result. + * + * Dependency (Maven): + * <dependency> + * <groupId>com.fasterxml.jackson.core</groupId> + * <artifactId>jackson-databind</artifactId> + * <version>2.17.0</version> + * </dependency> + * + * If you prefer no dependencies, replace ObjectMapper with org.json or + * a manual string parse — the structure is straightforward. + */ +public class PdftractSubprocess { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + public static JsonNode extract(String pdfPath) throws IOException, InterruptedException { + ProcessBuilder pb = new ProcessBuilder("pdftract", "extract", pdfPath); + pb.redirectErrorStream(false); // keep stderr separate + Process process = pb.start(); + + byte[] stdout = process.getInputStream().readAllBytes(); + byte[] stderr = process.getErrorStream().readAllBytes(); + + int exit = process.waitFor(); + if (exit != 0) { + throw new IOException( + "pdftract failed (exit " + exit + "): " + new String(stderr).strip() + ); + } + + return MAPPER.readTree(stdout); + } + + /** Concatenate all block text across every page.
*/ + public static String fullText(JsonNode data) { + List parts = new ArrayList<>(); + for (JsonNode page : data.get("pages")) { + for (JsonNode block : page.get("blocks")) { + parts.add(block.get("text").asText()); + } + } + return String.join("\n", parts); + } + + /** Return concatenated block text for a single page (1-indexed). */ + public static String pageText(JsonNode data, int pageNumber) { + for (JsonNode page : data.get("pages")) { + if (page.get("page").asInt() == pageNumber) { + List parts = new ArrayList<>(); + for (JsonNode block : page.get("blocks")) { + parts.add(block.get("text").asText()); + } + return String.join("\n", parts); + } + } + throw new IllegalArgumentException("Page " + pageNumber + " not found"); + } + + public static void main(String[] args) throws Exception { + if (args.length < 1) { + System.err.println("Usage: PdftractSubprocess "); + System.exit(1); + } + + JsonNode data = extract(args[0]); + + JsonNode meta = data.get("metadata"); + System.out.println("Title : " + meta.path("title").asText("(none)")); + System.out.println("Pages : " + meta.get("page_count").asInt()); + System.out.println("\n--- Full text ---"); + System.out.println(fullText(data)); + System.out.println("\n--- Page 1 ---"); + System.out.println(pageText(data, 1)); + } +} +``` + +### HTTP (java.net.http.HttpClient, Java 11+) + +```java +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; + +import java.io.IOException; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.UUID; + +public class PdftractHttp { + + private static final String PDFTRACT_URL = "http://localhost:8080"; + private static final ObjectMapper MAPPER = new ObjectMapper(); + private static final HttpClient CLIENT = 
HttpClient.newBuilder() + .connectTimeout(Duration.ofSeconds(10)) + .build(); + + public static JsonNode extract(String pdfPath) throws IOException, InterruptedException { + Path path = Path.of(pdfPath); + byte[] pdfBytes = Files.readAllBytes(path); + String filename = path.getFileName().toString(); + String boundary = UUID.randomUUID().toString().replace("-", ""); + + // Build multipart/form-data body manually (no external library needed) + String crlf = "\r\n"; + byte[] partHeader = ( + "--" + boundary + crlf + + "Content-Disposition: form-data; name=\"file\"; filename=\"" + filename + "\"" + crlf + + "Content-Type: application/pdf" + crlf + + crlf + ).getBytes(); + byte[] partFooter = (crlf + "--" + boundary + "--" + crlf).getBytes(); + + byte[] body = new byte[partHeader.length + pdfBytes.length + partFooter.length]; + System.arraycopy(partHeader, 0, body, 0, partHeader.length); + System.arraycopy(pdfBytes, 0, body, partHeader.length, pdfBytes.length); + System.arraycopy(partFooter, 0, body, partHeader.length + pdfBytes.length, partFooter.length); + + HttpRequest request = HttpRequest.newBuilder() + .uri(URI.create(PDFTRACT_URL + "/extract")) + .timeout(Duration.ofSeconds(60)) + .header("Content-Type", "multipart/form-data; boundary=" + boundary) + .POST(HttpRequest.BodyPublishers.ofByteArray(body)) + .build(); + + HttpResponse<String> response = CLIENT.send( + request, HttpResponse.BodyHandlers.ofString() + ); + + if (response.statusCode() != 200) { + throw new IOException( + "pdftract HTTP " + response.statusCode() + ": " + response.body() + ); + } + + return MAPPER.readTree(response.body()); + } + + public static String fullText(JsonNode data) { + List<String> parts = new ArrayList<>(); + for (JsonNode page : data.get("pages")) { + for (JsonNode block : page.get("blocks")) { + parts.add(block.get("text").asText()); + } + } + return String.join("\n", parts); + } + + public static String pageText(JsonNode data, int pageNumber) { + for (JsonNode page : data.get("pages")) { +
if (page.get("page").asInt() == pageNumber) { + List<String> parts = new ArrayList<>(); + for (JsonNode block : page.get("blocks")) { + parts.add(block.get("text").asText()); + } + return String.join("\n", parts); + } + } + throw new IllegalArgumentException("Page " + pageNumber + " not found"); + } + + public static void main(String[] args) throws Exception { + if (args.length < 1) { + System.err.println("Usage: PdftractHttp <file.pdf>"); + System.exit(1); + } + + JsonNode data = extract(args[0]); + System.out.println(fullText(data)); + } +} +``` + +--- + +## 6. Rust + +> **When to prefer subprocess:** CLI tools or single-threaded batch processors — zero extra dependencies beyond `serde` and `serde_json`. +> **When to prefer HTTP:** Async Tokio services — `reqwest` is non-blocking and naturally fits async Rust workloads. + +### Subprocess (std::process::Command) + +Add to `Cargo.toml`: +```toml +[dependencies] +serde = { version = "1", features = ["derive"] } +serde_json = "1" +``` + +```rust +use serde::Deserialize; +use std::process::Command; + +#[derive(Debug, Deserialize)] +struct Span { + pub text: String, + pub bbox: [f64; 4], + pub font: String, + pub size: f64, + pub confidence: f64, +} + +#[derive(Debug, Deserialize)] +struct Block { + pub kind: String, + pub text: String, + pub bbox: [f64; 4], +} + +#[derive(Debug, Deserialize)] +struct Page { + pub page: u32, + pub spans: Vec<Span>, + pub blocks: Vec<Block>, +} + +#[derive(Debug, Deserialize)] +struct Metadata { + pub title: Option<String>, + pub author: Option<String>, + pub page_count: u32, +} + +#[derive(Debug, Deserialize)] +struct PdftractResult { + pub pages: Vec<Page>, + pub metadata: Metadata, +} + +impl PdftractResult { + /// Concatenate all block text across every page. + pub fn full_text(&self) -> String { + self.pages + .iter() + .flat_map(|p| p.blocks.iter().map(|b| b.text.as_str())) + .collect::<Vec<_>>() + .join("\n") + } + + /// Return concatenated block text for a single page (1-indexed).
+ pub fn page_text(&self, page_number: u32) -> Option<String> { + self.pages + .iter() + .find(|p| p.page == page_number) + .map(|p| { + p.blocks + .iter() + .map(|b| b.text.as_str()) + .collect::<Vec<_>>() + .join("\n") + }) + } +} + +fn extract_subprocess(pdf_path: &str) -> Result<PdftractResult, Box<dyn std::error::Error>> { + let output = Command::new("pdftract") + .args(["extract", pdf_path]) + .output()?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(format!( + "pdftract failed (exit {:?}): {}", + output.status.code(), + stderr.trim() + ) + .into()); + } + + let result: PdftractResult = serde_json::from_slice(&output.stdout)?; + Ok(result) +} + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let pdf_path = std::env::args() + .nth(1) + .ok_or("usage: program <file.pdf>")?; + + let result = extract_subprocess(&pdf_path)?; + + println!("Title : {}", result.metadata.title.as_deref().unwrap_or("(none)")); + println!("Pages : {}", result.metadata.page_count); + println!("\n--- Full text ---"); + println!("{}", result.full_text()); + + if let Some(text) = result.page_text(1) { + println!("\n--- Page 1 ---"); + println!("{text}"); + } + + Ok(()) +} +``` + +### HTTP (reqwest) + +Add to `Cargo.toml`: +```toml +[dependencies] +serde = { version = "1", features = ["derive"] } +serde_json = "1" +reqwest = { version = "0.12", features = ["multipart", "json"] } +tokio = { version = "1", features = ["full"] } +``` + +```rust +use reqwest::multipart; +use serde::Deserialize; +use std::path::Path; + +// Re-use the same structs from the subprocess example above.
+// (PdftractResult, Page, Block, Span, Metadata — copy them in) + +const PDFTRACT_URL: &str = "http://localhost:8080"; + +async fn extract_http(pdf_path: &str) -> Result<PdftractResult, Box<dyn std::error::Error>> { + let bytes = tokio::fs::read(pdf_path).await?; + let filename = Path::new(pdf_path) + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("document.pdf") + .to_owned(); + + let part = multipart::Part::bytes(bytes) + .file_name(filename) + .mime_str("application/pdf")?; + + let form = multipart::Form::new().part("file", part); + + let client = reqwest::Client::new(); + let response = client + .post(format!("{PDFTRACT_URL}/extract")) + .multipart(form) + .timeout(std::time::Duration::from_secs(60)) + .send() + .await?; + + if !response.status().is_success() { + let status = response.status(); + let body = response.text().await.unwrap_or_default(); + return Err(format!("pdftract HTTP {status}: {body}").into()); + } + + let result: PdftractResult = response.json().await?; + Ok(result) +} + +#[tokio::main] +async fn main() -> Result<(), Box<dyn std::error::Error>> { + let pdf_path = std::env::args() + .nth(1) + .ok_or("usage: program <file.pdf>")?; + + let result = extract_http(&pdf_path).await?; + + println!("{}", result.full_text()); + + if let Some(text) = result.page_text(1) { + println!("\n--- Page 1 ---"); + println!("{text}"); + } + + Ok(()) +} +``` + +--- + +## 7. Shell / Bash + +> **When to prefer direct invocation:** shell scripts, cron jobs, CI pipelines, or any context where you have direct access to the binary. +> **When to prefer curl:** when pdftract is running as a shared service on another host, inside a container, or when you want to avoid installing the binary locally.
+ +### Direct Invocation + +```bash +#!/usr/bin/env bash +set -euo pipefail + +PDF="${1:?Usage: $0 }" + +# --- JSON output --- +json=$(pdftract extract "$PDF") + +# Full text via jq: collect all block text across all pages +full_text=$(echo "$json" | jq -r '[.pages[].blocks[].text] | join("\n")') + +# Per-page text (page 1) +page1_text=$(echo "$json" | jq -r '.pages[] | select(.page == 1) | [.blocks[].text] | join("\n")') + +# Metadata +title=$(echo "$json" | jq -r '.metadata.title // "(none)"') +pages=$(echo "$json" | jq -r '.metadata.page_count') + +echo "Title : $title" +echo "Pages : $pages" +echo +echo "--- Full text ---" +echo "$full_text" +echo +echo "--- Page 1 ---" +echo "$page1_text" + +# --- Plain text output (no jq needed) --- +plain=$(pdftract extract "$PDF" --text) +echo +echo "--- Plain text (--text flag) ---" +echo "$plain" + +# --- Write JSON to file --- +pdftract extract "$PDF" --output "/tmp/$(basename "$PDF" .pdf).json" +echo "JSON written to /tmp/$(basename "$PDF" .pdf).json" +``` + +### curl (HTTP) + +```bash +#!/usr/bin/env bash +set -euo pipefail + +PDF="${1:?Usage: $0 }" +PDFTRACT_URL="${PDFTRACT_URL:-http://localhost:8080}" + +# POST the PDF and capture the response; fail fast on HTTP errors. 
+json=$(curl --silent --show-error --fail \ + --max-time 60 \ + -F "file=@${PDF};type=application/pdf" \ + "${PDFTRACT_URL}/extract") + +# Full text via jq +full_text=$(echo "$json" | jq -r '[.pages[].blocks[].text] | join("\n")') + +# Per-page text (page 1) +page1_text=$(echo "$json" | jq -r '.pages[] | select(.page == 1) | [.blocks[].text] | join("\n")') + +# Metadata +title=$(echo "$json" | jq -r '.metadata.title // "(none)"') +pages=$(echo "$json" | jq -r '.metadata.page_count') + +echo "Title : $title" +echo "Pages : $pages" +echo +echo "--- Full text ---" +echo "$full_text" +echo +echo "--- Page 1 ---" +echo "$page1_text" + +# --- Save raw JSON --- +output_file="/tmp/$(basename "$PDF" .pdf).json" +echo "$json" > "$output_file" +echo "JSON saved to $output_file" + +# --- Health check before submitting --- +# curl -sf "${PDFTRACT_URL}/health" > /dev/null \ +# || { echo "pdftract serve is not running at ${PDFTRACT_URL}"; exit 1; } +``` + +### Batch processing with xargs / parallel + +```bash +#!/usr/bin/env bash +# Process every PDF in a directory, writing one JSON file per PDF. +# Uses GNU parallel if available, otherwise xargs -P. 
+ +PDF_DIR="${1:?Usage: $0 }" +OUT_DIR="${2:-/tmp/pdftract-out}" +mkdir -p "$OUT_DIR" + +extract_one() { + local pdf="$1" + local out="$OUT_DIR/$(basename "$pdf" .pdf).json" + pdftract extract "$pdf" --output "$out" && echo "OK $pdf" || echo "ERR $pdf" +} +export -f extract_one +export OUT_DIR + +find "$PDF_DIR" -name "*.pdf" -print0 \ + | xargs -0 -P 4 -I{} bash -c 'extract_one "$@"' _ {} +``` diff --git a/docs/research/glyph-recognition-and-unicode-recovery.md b/docs/research/glyph-recognition-and-unicode-recovery.md new file mode 100644 index 0000000..2f5619c --- /dev/null +++ b/docs/research/glyph-recognition-and-unicode-recovery.md @@ -0,0 +1,112 @@ +# Glyph Recognition and Unicode Recovery in PDF Text Extraction + +## Overview + +PDF text extraction depends on the font's encoding machinery to map raw glyph identifiers — character codes in a content stream — to Unicode codepoints. When that machinery is absent, broken, or intentionally obscured, a robust extractor must fall back through a layered series of heuristics. This document surveys the failure modes and recovery strategies a Rust engineer needs to understand when building `pdftract`. + +--- + +## 1. Why CMaps Fail + +A ToUnicode CMap is an optional but critical PDF object that maps each glyph's character code to one or more Unicode codepoints. Its absence or incorrectness is a frequent source of garbled extraction output. + +**Custom encoding without ToUnicode.** Type 1 and TrueType fonts embedded in PDF can use a custom Encoding dictionary that remaps character codes arbitrarily. If no ToUnicode CMap is present, the only remaining signal is the glyph name — and only if the author did not rename glyphs. Many print-production workflows strip ToUnicode entries during PDF/X conversion to reduce file size. + +**Type 3 fonts with arbitrary glyph procedures.** A Type 3 font defines each glyph as a sequence of PDF content stream operators. 
There is no standardized shape; the glyph procedure could draw anything, including decorative symbols, logos, or redacted characters. The font's Encoding maps codes to glyph names, but those names are arbitrary strings chosen by the document author. + +**Scanned PDFs with fake text layers.** OCR pipelines sometimes embed a hidden Type 3 or Type 1 font whose glyphs are designed to be invisible at normal rendering, purely to carry searchable text. The ToUnicode CMap may be correct but carry OCR errors, or may be present only for a subset of characters. In pathological cases the text layer and visual content are deliberately misaligned (common in forms with print-and-sign workflows). + +**Symbol fonts repurposed for body text.** ZapfDingbats, Symbol, and similar fonts have standard glyph shapes that encode mathematical or decorative characters. Documents that route body-text characters through these fonts — especially via PDF/A compliance workarounds or legacy WordPerfect exports — will produce garbled output when a consumer naively interprets character codes as Latin. + +**Intentionally obfuscated PDFs (DRM).** Some DRM schemes replace ToUnicode CMaps with shuffled or encrypted equivalents. The content stream references glyph codes whose ToUnicode entries map to decoy codepoints, while the real text requires a key or rendering to recover. Detecting this is an open problem; the best practical heuristic is low-confidence scoring on known-word frequency after extraction. + +**Authoring tool bugs.** Adobe InDesign, Microsoft Word, and LibreOffice all have historically shipped versions that generated incorrect ToUnicode CMaps — most commonly for ligatures (fi, fl, ff), for characters outside Basic Latin, and for fonts using expert-set or OldStyle figure variants. 
The ToUnicode entry may be structurally valid (parseable) but semantically wrong, mapping the fi ligature to U+0066 U+0069 in one range definition and to U+FB01 in another, with the wrong range selected at runtime. + +--- + +## 2. Glyph Name Heuristics + +When ToUnicode is absent, the font's Encoding dictionary may still provide glyph names — strings like `A`, `comma`, `fi`, `uni0041`, `u1D400`. The Adobe Glyph List (AGL) 2.0 and its companion specification define an algorithm to extract Unicode codepoints from these names. + +**The AGL algorithm (abbreviated):** + +1. If the name is in the AGL table (a ~4000-entry mapping from name to codepoint), return the mapped codepoint. +2. If the name is of the form `uniXXXX` (exactly four uppercase hex digits), return U+XXXX. Multiple consecutive `uniXXXX` segments encode a sequence (ligatures or decomposed characters). +3. If the name is of the form `uXXXXXX` (four to six uppercase hex digits), return U+XXXXXX, provided the codepoint is in a valid Unicode range (not a surrogate, not above U+10FFFF). +4. If the name contains a period (`.`), strip the suffix and reapply the algorithm to the base name. The suffix is a variant tag and carries no Unicode meaning. +5. Otherwise, the name is unrecognized; return REPLACEMENT CHARACTER or signal failure. + +The full AGL table is published by Adobe at `https://github.com/adobe-type-tools/agl-aglfn`. The `aglfn` variant (Adobe Glyph List for New Fonts) is the normative source for production use — it includes only names that unambiguously map to a single codepoint. The broader AGL includes legacy names with complex decompositions. + +**ZapfDingbats and Symbol.** These fonts are explicitly carved out of the AGL algorithm. The PDF specification (ISO 32000-2, section 9.10.2) mandates a separate glyph-name-to-Unicode mapping for each. 
Symbol uses an encoding close to ISO Latin-1 for printable ASCII, then maps higher bytes to Greek letters and mathematical operators via a font-specific table. ZapfDingbats maps character codes 33–254 to a defined set of Unicode dingbat and geometric shape codepoints. Both tables are small (< 300 entries) and should be hardcoded; attempting to apply AGL to them produces wrong results. + +--- + +## 3. Font Fingerprinting Approaches + +When glyph names are absent or unhelpful, characteristics of the font itself may identify it. + +**FontDescriptor metrics.** Every embedded font should include a FontDescriptor dictionary with numeric metrics: `Ascent`, `Descent`, `CapHeight`, `XHeight`, `StemV`, `StemH`, `ItalicAngle`, and a `FontBBox` rectangle. These values are not unique enough alone, but they prune the candidate space significantly. A font with CapHeight 716 and XHeight 523 in a 1000-unit em square is almost certainly Times New Roman Regular or a metric-equivalent clone. Combining four or five metrics gives a coarse but useful fingerprint. + +**Checksum and hash matching.** Embedded TrueType and OpenType fonts contain a `checkSumAdjustment` field in the `head` table. More reliably, the raw bytes of the `cmap`, `glyf`, or `CFF ` table can be hashed (SHA-256) and looked up in a pre-built database of known fonts. This is the most precise fingerprinting strategy; the challenge is building and maintaining the database. Google Fonts, Adobe Fonts, and the web safe fonts cover the majority of PDFs encountered in practice. + +**PostScript name matching.** The `FontName` in the FontDescriptor and `BaseFont` in the font dictionary are PostScript names (e.g., `TimesNewRomanPSMT`, `ArialMT`, `HelveticaNeue-Bold`). These frequently identify the font family and style without metric lookup. Normalize by stripping common suffixes (`-Regular`, `-MT`, `PS`, `LT`), folding to lowercase, and removing whitespace before matching against a known-font table. 
False positives are common (many fonts claim to be "Helvetica"), so use name matching only to select a candidate, then confirm with metrics. + +--- + +## 4. Glyph Outline Analysis + +If a font is embedded with full outline data, glyph shapes can serve as fingerprints against Unicode character databases, without full raster OCR. + +**Type 1 charstrings.** A Type 1 charstring encodes a glyph's Bezier outline as a compact stack-based bytecode. Parsing charstrings yields a sequence of moveto/lineto/curveto operations. Normalize the resulting path: translate to origin, scale to unit square, and compute a fixed-size feature vector (e.g., a grid of orientation histograms, or moment invariants). Compare against pre-computed vectors for every Unicode character in candidate fonts. + +**TrueType glyph programs.** TrueType stores outlines in the `glyf` table as contour sequences with on-curve and off-curve control points. The same normalization-and-comparison approach applies. One practical simplification: rasterize the normalized outline to a small bitmap (e.g., 32×32 grayscale) and compute a perceptual hash (pHash or dHash). This loses some precision but is fast and storage-efficient for the reference database. + +**Approximate shape matching tradeoffs.** Vector-based outline matching is accurate for clean outlines but degrades with variation in design weight, optical size, or deliberate distortion. It cannot handle Type 3 fonts where the glyph procedure uses fill rules or clip paths that the Bezier extraction misses. Full raster OCR (e.g., Tesseract on a rasterized glyph image) is more robust but orders of magnitude slower and introduces an external binary dependency. The recommended middle ground is outline matching as a fast first pass, falling back to OCR only for glyphs where outline matching confidence is below a threshold. + +--- + +## 5. Context-Based Recovery + +When a document is mostly well-decoded, poorly decoded characters can be inferred from context. 
+ +**Statistical character prediction.** Character n-gram models trained on text corpora assign probabilities to candidate codepoints given surrounding decoded characters. For a position where extraction fails, score each candidate against the n-gram model. This is most useful for single-glyph substitutions in otherwise Latin text (e.g., a missing `e` in English). + +**Dictionary-based gap filling.** If a word contains one or two unknown characters and the surrounding characters form a near-match to a dictionary entry, the dictionary entry is a candidate. Restrict to the same script as the surrounding characters. Edit distance (Levenshtein with wildcards for unknown positions) is the standard metric. This works well for ligatures: an unknown glyph between `o` and `e` in an English word is almost certainly `ff` or `fi`. + +**Language model scoring.** A word-level or subword language model can rescore candidates from the above methods. For `pdftract`, integrating a full LM is heavy; a practical approximation is a ranked word-list with bigram statistics. The Norvig frequency list or Zipf-weighted lists from Wikipedia work well for English; CLDR/BabelNet equivalents exist for other scripts. + +--- + +## 6. Practical Recovery Pipeline + +The recommended priority order for `pdftract` is: + +### Step 1: ToUnicode CMap +Parse the ToUnicode stream, validate that it is a well-formed CMap (check `begincmap`/`endcmap`, `beginbfchar`/`endbfchar`, `beginbfrange`/`endbfrange` blocks). Apply the mapping. Flag any character codes that fall outside the mapped ranges as unresolved. If the CMap maps a code to U+FFFD or U+0000, treat those mappings as missing rather than authoritative. + +### Step 2: Glyph Name via AGL +For each unresolved code, retrieve its glyph name from the font's Encoding dictionary. Apply the AGL algorithm in order: direct AGL table lookup, `uniXXXX` expansion, `uXXXXXX` expansion, period-stripped base name retry. 
Apply the ZapfDingbats or Symbol override table if the font is identified by name as one of those two. Assign the resulting codepoint with high confidence. + +### Step 3: Font Name Fingerprinting +For glyphs still unresolved, normalize the `BaseFont` / `FontName` strings and look up in a known-font database. If matched, use the font's standard encoding for the matched font (e.g., look up the character code in the font's standard cmap). Validate against FontDescriptor metrics if present. If the font is a known metric-equivalent, retrieve its standard glyph-to-Unicode mapping. Assign the result with medium confidence and tag for downstream review. + +### Step 4: Outline Shape Matching +For glyphs where steps 1–3 failed or produced low-confidence results, extract the glyph outline from the font program (Type 1 charstring parser or TrueType `glyf` reader). Normalize and compute the shape fingerprint. Query a pre-built reference database of Unicode character outlines. Return the top-k candidates with similarity scores. Select the highest-scoring candidate above a threshold (empirically ~0.85 cosine similarity on moment-invariant vectors). Below the threshold, mark as unresolved and defer to step 5. + +### Step 5: OCR Fallback +As a last resort, rasterize the unresolved glyph at a sufficient resolution (>= 150 DPI equivalent on the normalized em square, typically 32–64px) and pass it to a character-level OCR recognizer. Tesseract's single-character mode or a custom CNN trained on Unicode character images are both viable. OCR introduces latency and an external dependency, so it should be gated on a configuration flag and applied only when no other step has produced a confident result. + +**Cross-step confidence aggregation.** Assign each step a base confidence tier (Step 1: 0.95, Step 2: 0.90, Step 3: 0.70, Step 4: 0.60–0.90, Step 5: 0.50–0.85). 
After the pipeline, apply context-based rescoring (Section 5) to candidates below 0.80 confidence, using the surrounding high-confidence characters as context. Expose the final confidence score and the recovery step taken as metadata on each extracted character, so callers can choose to suppress or highlight uncertain output. + +--- + +## References + +- ISO 32000-2:2020 (PDF 2.0), Section 9 (Text) and Annex D (Character Sets and Encodings) +- Adobe Glyph List Specification, version 1.7 — `adobe-type-tools/agl-specification` +- Adobe Glyph List for New Fonts (aglfn) — `adobe-type-tools/agl-aglfn` +- Adobe Type 1 Font Format specification (Black Book), Chapter 6 (Charstrings) +- Apple TrueType Reference Manual — `glyf` table specification +- OpenType Specification 1.9, Microsoft Typography (CFF / CFF2 charstring formats) +- Unicode Standard Annex #29 (Unicode Text Segmentation) — relevant for ligature decomposition diff --git a/docs/research/pdf-fonts-and-encoding.md b/docs/research/pdf-fonts-and-encoding.md new file mode 100644 index 0000000..5f6d397 --- /dev/null +++ b/docs/research/pdf-fonts-and-encoding.md @@ -0,0 +1,300 @@ +# PDF Fonts and Encoding: Technical Reference for Text Extraction + +This document describes every font type found in PDF files, how character codes are decoded to Unicode, and the data structures a Rust extraction engine must interpret. References are to the PDF 1.7 specification (ISO 32000-1:2008) and Adobe technical notes where applicable. + +--- + +## 1. Font Types + +### 1.1 Type 1 (Simple Font) + +Type 1 fonts originate from the Adobe Type 1 format, stored as PFB (binary) or PFA (ASCII) font programs. In a PDF the font dictionary has `/Subtype /Type1`. + +**Glyph storage.** The font program is a PostScript charstring program. When embedded, it appears under `/FontDescriptor` as the stream value of `/FontFile` (Type 1 binary). The charstrings are keyed by glyph name, not by a numeric glyph ID. 
+ +**Character code interpretation.** A one-byte character code from the content stream is mapped through the font's `/Encoding` to a glyph name, then the glyph name is looked up in the charstring dictionary. See §2 for encoding details. + +**Widths.** The `/Widths` array (required) contains `LastChar - FirstChar + 1` entries, each giving the horizontal advance width in text-space units (1/1000 em). `/FirstChar` and `/LastChar` define the range. Codes outside this range use `/MissingWidth` from the font descriptor. + +**Standard 14 fonts.** PDF readers must implement the 14 standard Type 1 fonts (Helvetica, Times-Roman, Courier, Symbol, ZapfDingbats, and their variants) without an embedded font program. These fonts are usually not embedded, in which case the reader supplies its own outlines and metrics; if a font program is embedded anyway, it takes precedence. + +### 1.2 Type 3 (Simple Font) + +`/Subtype /Type3`. Glyphs are defined as PDF content streams directly in the font dictionary under `/CharProcs`, a dictionary from glyph name to stream. There is no external font program. + +**Character code interpretation.** One-byte code → glyph name via `/Encoding` → content stream in `/CharProcs`. Because glyph names are arbitrary (user-defined), there is often no reliable path to Unicode without a `/ToUnicode` CMap. If `/ToUnicode` is absent, extraction must fall back to glyph name heuristics or report the text as unresolvable. + +**Widths.** `/Widths`, `/FirstChar`, `/LastChar` as in Type 1. Additionally, `/FontMatrix` transforms glyph-space coordinates; the default for Type 1 is `[0.001 0 0 0.001 0 0]`, but Type 3 fonts frequently use `[1 0 0 1 0 0]` with glyph streams drawn at full size. + +### 1.3 TrueType (Simple Font) + +`/Subtype /TrueType`. The embedded program is a TrueType font binary under `/FontFile2` in the font descriptor. + +**Glyph storage.** Glyphs are stored by integer glyph ID (GID) inside the `glyf` table. The `cmap` table maps Unicode codepoints (or platform-specific codes) to GIDs. 
+ +**Character code interpretation.** One-byte code → glyph name via `/Encoding` → GID via the font's `cmap`. When the encoding is a standard PDF encoding (WinAnsiEncoding, MacRomanEncoding, etc.), the implementation maps code → Unicode codepoint → GID using `cmap` platform/encoding subtable (platform 3, encoding 1: Windows Unicode BMP). If the font's `cmap` contains only platform 1 (Macintosh), platform-specific code mappings apply. This is a common source of extraction errors. + +**Widths.** Same `/Widths` array mechanism as Type 1. The `hmtx` TrueType table provides the authoritative advance widths; the PDF `/Widths` array should match but may differ in broken documents. + +### 1.4 Type 0 (Composite Font) + +`/Subtype /Type0`. This is the container for multi-byte (CJK and other large character set) text. The font dictionary has: + +- `/Encoding` — a CMap name (e.g., `Identity-H`) or a stream containing a CMap program. +- `/DescendantFonts` — a one-element array holding a CIDFont dictionary. + +**Character code interpretation.** The multi-byte content stream codes are fed through the CMap named in `/Encoding`, which maps character codes to CIDs. The CIDFont then maps CIDs to GIDs. See §4. + +**Widths.** Widths are specified in the CIDFont descendant, not in the Type 0 dictionary itself. + +### 1.5 CIDFont Type 0 (CFF-Based) + +`/Subtype /CIDFontType0` inside a `/DescendantFonts` array. The font program is a CFF (Compact Font Format, also called Type 2 charstrings) font embedded under `/FontFile3` with `/Subtype /CIDFontType0C` or `/Subtype /OpenType`. + +**Glyph storage.** CFF stores charstrings keyed by GID (integer index). GIDs map directly to charstrings; glyph names may or may not be present depending on the CFF variant. + +**Widths.** The CIDFont dictionary uses `/DW` (default width, default 1000) and `/W` (array of per-CID widths). 
The `/W` syntax is: an array whose elements alternate between `c [w1 w2 ...]` (individual CIDs) and `c1 c2 w` (range with uniform width). + +### 1.6 CIDFont Type 2 (TrueType-Based) + +`/Subtype /CIDFontType2`. The embedded program is a TrueType or OpenType/TT font under `/FontFile2` (TrueType) or `/FontFile3` with `/Subtype /OpenType`. + +**CID-to-GID mapping.** The `/CIDToGIDMap` entry in the CIDFont dictionary is critical: +- If the value is the name `/Identity`, CID equals GID directly (CID = GID). +- Otherwise it is a stream of 2×65536 bytes: the GID for CID `n` is the 16-bit big-endian value at byte offset `2n`. + +**Widths.** Same `/DW` and `/W` mechanism as CIDFont Type 0. + +### 1.7 OpenType in PDF + +OpenType fonts are embedded as `/FontFile3` streams with `/Subtype /OpenType`. An OpenType font may contain either CFF outlines (`CFF` table present → CIDFont Type 0) or TrueType outlines (`glyf` table present → CIDFont Type 2). The handling follows the respective CIDFont rules. The PDF spec does not treat OpenType as a separate subtype; it is identified by the stream subtype. + +--- + +## 2. Encoding Mechanisms + +### 2.1 Predefined Encodings + +The PDF spec defines four named encodings for simple fonts (§D.1–D.4, PDF 1.7): + +| Name | Character set | Typical use | +|------|--------------|-------------| +| `StandardEncoding` | 229 glyphs from the Adobe standard | Default for Type 1 fonts that omit `/Encoding` | +| `MacRomanEncoding` | Mac OS Roman 256 code points | Older Mac-generated PDFs | +| `WinAnsiEncoding` | Windows-1252 (cp1252) | Windows-generated PDFs; most common | +| `MacExpertEncoding` | Expert font character set (fractions, small caps) | Rare; expert-set fonts | + +`PDFDocEncoding` is a PDF-internal encoding used for text strings in the document catalog (info dictionary, annotations) but **not** for font encoding; it must not be confused with font encodings. It extends Latin-1 by filling 0x18–0x1F and 0x80–0x9F with additional characters. 
+ +`Symbol` and `ZapfDingbats` fonts use built-in symbol encodings defined in the respective AFM files. They do **not** use the standard named encodings; their code-to-glyph mapping is private and must be looked up against the font-specific tables provided in PDF Annex D. + +### 2.2 The `/Encoding` Dictionary and `/Differences` Array + +When a font's `/Encoding` value is a dictionary rather than a name, the dictionary may contain: + +- `/Type /Encoding` (optional) +- `/BaseEncoding` — a name (`StandardEncoding`, `MacRomanEncoding`, `WinAnsiEncoding`) designating the starting table. If absent, the base depends on font type (Type 1 defaults to built-in; others to StandardEncoding). +- `/Differences` — an array of the form `[code name code name ...]` or `[code name name name ...]`. Starting from the numeric code, each following name overrides successive slots. Example: `[32 /space /exclam /quotedbl]` overrides slots 32, 33, 34. + +Encoding resolution algorithm for simple fonts: +1. Start from the BaseEncoding table. +2. Apply each `/Differences` entry, replacing the glyph name at the given code position. +3. Resolve each resulting glyph name to Unicode via the Adobe Glyph List (§5). + +### 2.3 Symbol and ZapfDingbats + +These two standard fonts carry the `Symbolic` flag (bit 3 of `/Flags` in the font descriptor). Their encoding is defined entirely by the glyph names in the font program; the predefined named encodings do not apply. Extraction must use the AGL or the font's own encoding vector. ZapfDingbats glyph names are documented in the PDF spec Annex D.6. + +--- + +## 3. ToUnicode CMaps + +### 3.1 CMap Stream Format + +A ToUnicode CMap is a PostScript-inspired stream embedded directly in the PDF. 
The structure (PDF §9.10.3): + +``` +/CIDInit /ProcSet findresource begin +12 dict begin +begincmap +/CIDSystemInfo 3 dict dup begin + /Registry (Adobe) def + /Ordering (UCS) def + /Supplement 0 def +end def +/CMapName /Adobe-Identity-UCS def +/CMapType 2 def +4 beginbfchar +<0041> <0041> % code 0x41 → U+0041 (A) +<00A0> <00A0> +<F001> <FB01> % code 0xF001 → U+FB01 (fi ligature) +<F002> <FB02> % code 0xF002 → U+FB02 (fl ligature) +endbfchar +1 beginbfrange +<0061> <007A> <0061> % codes 0x61–0x7A → U+0061–U+007A (a–z) +endbfrange +endcmap +CMapName currentdict /CMap defineresource pop +end +end +``` + +**`beginbfchar` / `endbfchar`:** Each entry is a pair `<srcCode> <dstString>`. The destination is UTF-16BE hex bytes; a surrogate pair encodes a codepoint above U+FFFF. + +**`beginbfrange` / `endbfrange`:** Range `<srcLo> <srcHi> <dstLo>` maps a contiguous code range to a contiguous Unicode range. Alternatively, `<srcLo> <srcHi> [<dst1> <dst2> ...]` maps each code in the range to the corresponding Unicode string in the array. + +**`begincidrange` / `endcidrange`:** Used in Type 0 CMaps (not ToUnicode) to map codes to CID ranges; see §4. + +### 3.2 Embedding in PDF + +The ToUnicode CMap appears as the value of the `/ToUnicode` key in the font dictionary (both simple and composite fonts). It is a stream object, usually with `/Filter /FlateDecode`. + +### 3.3 When ToUnicode is Absent or Wrong + +**Absent:** Extraction must fall back to encoding → glyph name → AGL lookup (simple fonts) or CID-to-Unicode tables derived from the predefined CMap ordering (composite fonts). Many PDFs produced by older tools (TeX-based pipelines, some CAD exporters) omit `/ToUnicode`; the AGL fallback is the only reliable option. + +**Wrong or incomplete:** Some generators emit a `/ToUnicode` CMap with missing entries or incorrect mappings. A bfchar entry with destination `<0000>` or `<FFFD>` signals an intentionally unmapped glyph. An implementation should not blindly trust all mappings; NUL and replacement-character destinations should be treated as absent. 
+ +**Implications for extraction:** Without a `/ToUnicode` map, ligature glyphs (`fi`, `fl`, `ffi`, etc.) will be decoded as their AGL expansions (multi-character strings), which is usually correct. Private Use Area (PUA) codepoints require a `/ToUnicode` map to resolve; without one the extracted text should preserve the PUA codepoint but flag it as unresolved. + +--- + +## 4. CID-to-GID Mapping (Composite Fonts) + +### 4.1 Decoding Path + +For a Type 0 composite font, the decoding pipeline is: + +``` +content-stream bytes + → CMap (named in /Encoding) + → CID + → GID (via CIDToGIDMap or CFF index) + → glyph outline +``` + +The `/Encoding` CMap converts multi-byte character codes (1–4 bytes) to CIDs. The CMap may be: +- A name referring to a predefined CMap (see §4.2). +- A stream object containing a CMap program. + +### 4.2 Predefined CMaps + +Adobe distributes predefined CMaps for CJK encodings (PDF Annex M). Key examples: + +| Name | Script | Code space | Notes | +|------|--------|-----------|-------| +| `Identity-H` | any (horizontal) | 2-byte | CID = code (identity) | +| `Identity-V` | any (vertical) | 2-byte | CID = code, vertical writing | +| `90ms-RKSJ-H` | Japanese | Shift-JIS | Maps SJIS codes → Adobe-Japan1 CIDs | +| `GBK-EUC-H` | Simplified Chinese | GBK/EUC | Maps GBK → Adobe-GB1 CIDs | +| `UniGB-UTF16-H` | Simplified Chinese | UTF-16BE | Unicode input → Adobe-GB1 CIDs | +| `UniJIS-UTF16-H` | Japanese | UTF-16BE | Unicode input → Adobe-Japan1 CIDs | + +For `Identity-H`/`Identity-V`, the CID equals the raw 2-byte code value, and if `/CIDToGIDMap /Identity`, the GID equals the CID. These are the simplest cases for TrueType-based CIDFonts. + +### 4.3 CIDSystemInfo + +Every CIDFont and its associated CMap must declare `/CIDSystemInfo`, a dictionary with `/Registry` (string), `/Ordering` (string), and `/Supplement` (integer). This identifies the CID character collection, e.g., Adobe-Japan1-6. 
The CIDFont and its CMap must share the same Registry and Ordering. Implementations should use this to select fallback Unicode tables when `/ToUnicode` is absent (Adobe publishes CID→Unicode mappings for its standard collections). + +--- + +## 5. Glyph Name to Unicode (Adobe Glyph List) + +### 5.1 The AGL + +The Adobe Glyph List (AGL, `aglfn.txt`, version 1.7) maps glyph names to Unicode scalar values. An implementation should embed the AGL as a static hash table (approximately 4,000 entries). + +**Algorithmic fallback** (AGL specification §2): If a glyph name is not in the AGL table: +1. Strip any trailing `.<suffix>`, i.e. everything from the first period onward (e.g., `A.sc` → `A`). +2. If the name starts with `uni`, parse the following hex digits as UTF-16BE codepoint(s): `uni0041` → U+0041. +3. If the name starts with `u`, parse the following hex as a Unicode scalar: `u1F600` → U+1F600. +4. If none of the above, the glyph is unmapped. + +**Ligatures.** `fi` → U+FB01, `fl` → U+FB02, `ffi` → U+FB03, `ffl` → U+FB04. These are single AGL entries mapping to single Unicode codepoints. Many extraction engines prefer to expand ligatures to their component characters (fi → "fi") for searchability; this is a policy choice, not a spec requirement. + +**`.notdef`.** The glyph named `.notdef` is the fallback glyph for unmapped codes. It has no Unicode mapping. Extractors should silently skip or emit U+FFFD for `.notdef`. + +**`afii` names.** Legacy glyph names starting with `afii` (e.g., `afii57506`) appear in older Arabic and Hebrew fonts. The AGL maps these to their correct Unicode codepoints; no special handling beyond AGL lookup is needed. + +--- + +## 6. Font Descriptors + +The `/FontDescriptor` dictionary (§9.8, PDF 1.7) is referenced by the font dictionary via `/FontDescriptor`. It provides metrics and the embedded font binary. 
+ +### 6.1 Key Entries + +| Key | Type | Description | +|-----|------|-------------| +| `/FontName` | name | PostScript name of the font | +| `/FontBBox` | rectangle | Glyph bounding box in glyph-space units | +| `/Flags` | integer | Bitfield describing font characteristics | +| `/ItalicAngle` | number | Dominant italic angle in degrees | +| `/Ascent` | number | Maximum ascent above baseline | +| `/Descent` | number | Maximum descent below baseline (negative) | +| `/CapHeight` | number | Height of capital letters | +| `/XHeight` | number | Height of lowercase letters | +| `/StemV` | number | Dominant vertical stem width | +| `/FontFile` | stream | Type 1 PFB data | +| `/FontFile2` | stream | TrueType binary | +| `/FontFile3` | stream | CFF, OpenType, or CIDFontType0C binary (identified by stream `/Subtype`) | + +### 6.2 Flags Bitfield + +The `/Flags` integer is a 32-bit field; bits are numbered from 1 (LSB). Key bits: + +| Bit | Mask | Meaning | +|-----|------|---------| +| 1 | 0x0001 | FixedPitch | +| 2 | 0x0002 | Serif | +| 3 | 0x0004 | Symbolic — font uses a private encoding; standard encodings do not apply | +| 4 | 0x0008 | Script (cursive) | +| 6 | 0x0020 | Nonsymbolic — font uses a standard Latin encoding | +| 7 | 0x0040 | Italic | +| 17 | 0x10000 | AllCap | +| 18 | 0x20000 | SmallCap | +| 19 | 0x40000 | ForceBold | + +The `Symbolic` (bit 3) and `Nonsymbolic` (bit 6) flags are mutually exclusive and affect encoding resolution: a symbolic font's encoding is its own built-in table; a nonsymbolic font follows the standard named encoding fallback rules. + +### 6.3 Inferring Unicode When CMap Data Is Absent + +When both `/ToUnicode` and a useful `/Encoding` are missing, the following heuristics apply, in order: +1. If the embedded font is TrueType (`/FontFile2`) and the `/Flags` `Nonsymbolic` bit is set, use the font's `cmap` table with the `WinAnsiEncoding` assumption (platform 3, encoding 1). +2. 
If the font is CFF (`/FontFile3` with `/Subtype /CIDFontType0C`), the CFF `charset` table may supply glyph names; apply AGL. +3. If `/FontName` identifies a known standard font (e.g., `Symbol`, `ZapfDingbats`), apply the font-specific encoding table from PDF Annex D. +4. Otherwise, emit PUA codepoints or U+FFFD and flag the text as requiring post-processing. + +The font descriptor `/FontBBox` and `/Flags` provide no path to Unicode; they are useful only for layout heuristics (detecting whitespace, line boundaries) when Unicode resolution fails. + +--- + +## Appendix: Key Dictionary Locations + +``` +/Font dictionary + /Subtype → Type1 | Type3 | TrueType | Type0 | CIDFontType0 | CIDFontType2 + /Encoding → name or dictionary (simple); CMap name or stream (Type0) + /ToUnicode → stream (CMap program) + /FontDescriptor → dictionary + /Flags → integer (bitfield) + /FontFile → stream (Type 1) + /FontFile2 → stream (TrueType) + /FontFile3 → stream (CFF/OpenType; /Subtype in stream dict) + /Widths → array (simple fonts) + /FirstChar → integer + /LastChar → integer + /DescendantFonts → array [ CIDFont dict ] (Type0 only) + +CIDFont dictionary (inside /DescendantFonts) + /Subtype → CIDFontType0 | CIDFontType2 + /CIDSystemInfo → dict (/Registry /Ordering /Supplement) + /DW → integer (default advance width) + /W → array (per-CID widths) + /CIDToGIDMap → /Identity or stream (CIDFontType2 only) + /FontDescriptor → dictionary (as above) +``` + +--- + +*Spec references: ISO 32000-1:2008 §9 (Fonts), §D (Character Sets), §M (Predefined CMaps); Adobe Glyph List Specification v1.7; Adobe Type 1 Font Format (Black Book); Adobe CMap and CIDFont Files Specification v1.0.* diff --git a/docs/research/pdf-specification.md b/docs/research/pdf-specification.md new file mode 100644 index 0000000..2678616 --- /dev/null +++ b/docs/research/pdf-specification.md @@ -0,0 +1,271 @@ +# PDF Specification — Text Extraction Reference + +**Scope:** ISO 32000-1:2008 (PDF 1.7) and ISO 32000-2:2020 (PDF 
2.0), with version deltas noted. +This document is implementation-focused reference material for building a PDF text extraction parser in Rust. + +--- + +## 1. File Structure + +### 1.1 Body, Cross-Reference Table, and Trailer + +A PDF file is divided into four logical sections: header, body, cross-reference table (xref), and trailer. The header contains `%PDF-x.y` on line 1; if the file contains binary data, a comment with four bytes ≥ 128 (e.g., `%âãÏÓ`) follows on line 2 to signal binary content to transport agents. + +The **body** is a sequence of indirect objects. Each object is identified by an object number and generation number: `obj_num gen_num obj ... endobj`. The generation number is 0 for all non-reused objects; it increments to 1 when the slot is freed and reallocated. + +The **traditional xref table** (PDF 1.0+) begins with the keyword `xref`, followed by one or more subsections. Each subsection starts with `start_obj_num count`, then `count` 20-byte entries: `nnnnnnnnnn ggggg n\r\n` (offset, generation, `n`=in-use / `f`=free). The final entry in a free list is always generation 65535. Offsets are byte offsets from the start of the file. + +The **trailer dictionary** immediately follows the xref table (`trailer` keyword, then the dict). Key entries (only `/Size` and `/Root` are always required; the rest are conditional or optional): + +- `/Size` — total number of indirect object slots (one more than the highest object number) +- `/Root` — indirect reference to the document catalog +- `/Prev` — byte offset of the previous xref table (for incremental updates) +- `/Encrypt` — encryption dictionary (if encrypted) +- `/Info` — metadata dictionary (deprecated in PDF 2.0 in favor of XMP) +- `/ID` — two-element array of 16-byte MD5 strings; required if `/Encrypt` is present + +The file ends with `%%EOF`, preceded by `startxref` and the byte offset of the most recent xref table (or xref stream). 
+ +### 1.2 Xref Streams (PDF 1.5+) + +In PDF 1.5+, the xref table may be replaced by an **xref stream**, which is an ordinary indirect stream object whose dictionary combines the xref subsection metadata with the trailer dictionary. Key fields: + +- `/Type /XRef` +- `/W [w1 w2 w3]` — widths in bytes of the three fields per entry (type, field2, field3) +- `/Index [start count ...]` — subsection descriptors (default: `[0 /Size]`) +- `/Size`, `/Root`, `/Prev`, `/Encrypt`, `/ID` — same semantics as traditional trailer + +Entry types (first field of each row): +- `0` — free object (field2 = next free obj num, field3 = generation) +- `1` — in-use uncompressed object (field2 = byte offset, field3 = generation) +- `2` — compressed object in object stream (field2 = object stream obj num, field3 = index within the stream) + +All numeric fields are big-endian unsigned integers. A `/W` entry of 0 means that field is omitted from each row and takes its default value: an omitted type field defaults to 1 (so the type byte can be dropped when all entries are type 1), and an omitted generation field of a type-1 entry defaults to 0. + +### 1.3 Object Streams (PDF 1.5+) + +An **object stream** (`/Type /ObjStm`) packs multiple indirect objects into a single compressed stream. The stream dictionary includes: + +- `/N` — number of compressed objects +- `/First` — byte offset within the decoded stream where the first object body begins +- `/Extends` — reference to an earlier object stream this one augments + +The first `/First` bytes of the decoded stream are pairs `obj_num offset`, giving the position of each object body relative to the `/First` offset. Object streams may not contain stream objects or objects with generation numbers other than 0. They are referenced via type-2 xref entries. + +### 1.4 Linearized PDFs + +Linearized ("fast web view") PDFs reorganize the byte layout so that the first page can be rendered before the full file is downloaded. 
The linearization dictionary appears as the first object: `/Linearized 1.0`, `/L` (file length), `/H` (hint stream offsets), `/O` (first-page object number), `/E` (end of first-page section offset), `/N` (page count), `/T` (offset of main xref table). Parsers must handle the two-xref-table layout (one at the beginning for the first page, one at the end for the rest of the document). + +### 1.5 Incremental Updates + +PDF supports appending updates without rewriting the original body. Each update appends: new/modified object definitions, a new xref table or xref stream (with `/Prev` pointing to the previous one), and a new `startxref`/`%%EOF`. Parsers must start from the last `startxref`, build the authoritative object table by walking `/Prev` chains, and let later definitions override earlier ones for the same object number. + +--- + +## 2. Page Content Streams + +### 2.1 Content Stream Mechanics + +Each page dictionary (`/Type /Page`) contains a `/Contents` key that references either a single stream or an array of streams. Multiple streams are concatenated with a single space between them before parsing; they share a single graphics state. The stream body is a sequence of operands followed by an operator keyword — postfix notation. + +Streams are filtered via `/Filter` (e.g., `/FlateDecode`, `/LZWDecode`, `/ASCII85Decode`). Multiple filters are applied in array order. The unfiltered stream body must be parsed as PDF syntax. + +### 2.2 Graphics State + +The graphics state is a stack-based structure. `q` pushes a copy; `Q` pops. 
Relevant entries for text extraction: + +- **CTM** — Current Transformation Matrix (see Section 3) +- **Clipping path** — not needed for text extraction but must be tracked for completeness +- Text state (see Section 2.3) is part of the graphics state and is reset to defaults at `q`/`Q` boundaries + +### 2.3 Text State Parameters + +| Parameter | Set by | Default | Description | +|-----------|--------|---------|-------------| +| `/Font` + size | `Tf` | (none, required) | Current font and size | +| `Tc` | `Tc` | 0 | Character spacing (unscaled text units) | +| `Tw` | `Tw` | 0 | Word spacing (unscaled text units) | +| `Th` (Tz) | `Tz` | 100 | Horizontal scaling (%) | +| `Tl` | `TL` | 0 | Text leading | +| `Tmode` | `Tr` | 0 | Text rendering mode | +| `Trise` | `Ts` | 0 | Text rise | +| `Tm` | `Tm`, `Td`, `TD`, `T*` | identity/reset per BT | Text matrix | +| `Tlm` | `Td`, `TD`, `T*` | identity | Text line matrix | + +Text state parameters persist across `q`/`Q` only for the _non-matrix_ parameters in PDF < 1.7; in ISO 32000-1 §9.3.1, text state is part of the graphics state and is saved/restored with `q`/`Q`. In practice, parsers should save the entire text state on `q`. + +### 2.4 Text Object Operators + +Text objects are delimited by `BT` (Begin Text) and `ET` (End Text). `BT` resets `Tm` and `Tlm` to the identity matrix; it does not reset other text state parameters. `ET` terminates the text object. + +**Text positioning operators:** + +- `tx ty Td` — move to start of next line: `Tlm = Tlm × [[1,0,0],[0,1,0],[tx,ty,1]]`; also sets `Tm = Tlm` +- `tx ty TD` — equivalent to `-ty TL; tx ty Td` (sets leading and moves) +- `T*` — equivalent to `0 -Tl Td` +- `a b c d e f Tm` — set `Tm` and `Tlm` directly to the matrix `[[a,b,0],[c,d,0],[e,f,1]]`; does not concatenate with CTM + +**Text showing operators:** + +- `(string) Tj` — show string; advance `Tm` by glyph widths +- `[(string|num)...] 
TJ` — show array; each numeric element `n` shifts the current position by `-n/1000 × Tfs × Th/100` text units, i.e. it is subtracted from the advance (positive values move the next glyph left and kern tighter; negative values add space)
In multi-byte encodings and Type0/CIDFont fonts, word spacing applies only if the character code is exactly the single byte `0x20`; it does not apply to multi-byte space characters. Units are unscaled text space units. +- **`Tz` (horizontal scaling, operator `Tz`):** Scales the horizontal component of all glyph displacements and character/word spacing. Value is a percentage (100 = no scaling). Formally: `tx_scaled = tx × Tz/100`. This is applied before converting to user space. + +The combined advance formula per glyph (horizontal writing): + +``` +advance = (w0/1000 × Tfs + Tc + Tw_if_space) × Tz/100 +``` + +where `w0` is the glyph's horizontal width from the font (possibly overridden by `/Widths` array in the font dict) and `Tfs` is the font size from `Tf`. + +--- + +## 5. Marked Content + +### 5.1 Operators + +Marked content allows semantic annotation of content stream regions: + +- `tag BMC` — Begin Marked Content with tag name only +- `tag props BDC` — Begin Marked Content with property dictionary (inline dict or name referencing `/Properties` in the page/resource dict) +- `EMC` — End Marked Content (matching pop) +- `tag MP` — Marked Content Point (no extent) +- `tag props DP` — Marked Content Point with properties + +Operators nest: each `BMC`/`BDC` must be matched by an `EMC`. They can span across content streams in a page's `/Contents` array only if the array forms a single logical stream (per ISO 32000-1 §14.6, marked content sequences must not span stream boundaries). + +### 5.2 Tagged PDF (PDF 1.3+) + +The document catalog's `/MarkInfo` dictionary signals tagged PDF: `/Marked true`. Tagged PDFs have a **structure tree** rooted at `/StructTreeRoot` in the catalog. 
Structure elements (SEs) are dictionaries with: + +- `/S` — structure type (e.g., `/P`, `/H1`, `/Span`, `/Table`, `/TR`, `/TD`) +- `/K` — children: structure elements or **marked content references** (MCRs) +- `/Pg` — page reference +- `/P` — parent SE + +An MCR references content stream marked content via `/Type /MCR`, `/MCID` (integer), and optionally `/Pg`. The `/MCID` matches the integer in `BDC` property dicts (`/MCID n`). The `/ParentTree` in `/StructTreeRoot` is a number tree mapping MCIDs to their parent structure elements, enabling reverse lookup from content stream to structure tree. + +For text extraction preserving logical reading order, parse the structure tree top-down and use MCID mappings to retrieve text spans from the content stream — rather than extracting text in paint order from the stream directly. This is essential for multi-column layouts and reflow-capable documents. + +### 5.3 ActualText and Alt + +Structure elements and marked content property dicts may carry `/ActualText` (a UTF-16BE or UTF-8 string providing the Unicode text that the content renders, overriding glyph-level decoding) and `/Alt` (alternate description, for accessibility). For extraction, `/ActualText` in a BDC property dict or on a structure element takes precedence over decoded glyph text within the marked region. + +--- + +## 6. PDF Version Deltas Relevant to Text Extraction + +### PDF 1.2 (Acrobat 3) +Introduced CMaps for CIDFont encoding and ToUnicode CMaps. The `/ToUnicode` stream in a font dictionary maps character codes to Unicode codepoints (using `beginbfchar`/`endbfchar`/`beginbfrange`/`endbfrange` operators in the CMap syntax). Without `/ToUnicode`, extraction must fall back to font encoding and glyph name heuristics. + +### PDF 1.3 (Acrobat 4) +Introduced **Tagged PDF**: `/MarkInfo`, `/StructTreeRoot`, marked content operators (`BMC`/`BDC`/`EMC`), and the `/ActualText` attribute. Also added `/TT` (TrueType) and Type0 composite fonts as first-class. 
Digital signatures added. + +### PDF 1.4 (Acrobat 5) +Transparency model (`/Group` with `/S /Transparency`), soft masks. Does not directly affect text extraction mechanics, but transparency groups introduce nested content streams (the group's `/Contents` stream) that must be recursed. + +### PDF 1.5 (Acrobat 6) +Object streams (`/ObjStm`) and cross-reference streams (`/XRef`). **Critical for parsing**: the traditional xref table may be absent entirely. Also introduced JBIG2 and JPEG2000 image filters. Optional content groups (OCGs, `/OCProperties`) can mark content as conditionally visible — text inside an invisible OCG should typically be excluded from extraction or flagged. + +### PDF 1.6 (Acrobat 7) +AES encryption (RC4 still supported). No structural changes to text representation. + +### PDF 1.7 (Acrobat 8) / ISO 32000-1:2008 +Codified as an ISO standard. Added `/Extensions` dictionary in the catalog for third-party extensions. Formalized the full specification of all features up to this version. `/ToUnicode` CMaps are now the authoritative mechanism for character-to-Unicode mapping. + +### PDF 2.0 / ISO 32000-2:2020 +- `/Info` metadata dictionary deprecated (XMP stream in `/Metadata` is now the sole authoritative metadata source) +- `/EncryptMetadata` behavior clarified +- New encryption algorithms (AES-256 with revised key derivation) +- Removed several deprecated features (Type1C embedded font variants as standalone, LZWDecode with early-change=0 for new files) +- `/ActualText` semantics clarified: if present, the full Unicode string it provides replaces all glyph-level text decoding for that span +- Structure namespace concept added (structure types are now namespace-qualified) +- Unambiguous specification that `/ToUnicode` CMap entries for ligatures (e.g., `fi` ligature → `fi`) use `beginbfchar` with a multi-character destination string + +--- + +## 7. Font Encoding and ToUnicode + +Font dicts (`/Type /Font`) encode characters via: + +1. 
**`/Encoding`** — simple fonts only; either a name (`/WinAnsiEncoding`, `/MacRomanEncoding`, `/StandardEncoding`, `/PDFDocEncoding`) or a diff array (`/Differences`) +2. **`/ToUnicode`** — CMap stream present in all well-formed tagged PDFs; maps `` character codes to `` Unicode codepoints in UTF-16BE +3. **Type0 (composite) fonts** — use `/Encoding /Identity-H` or a CMap name; character codes are 1 or 2 bytes; glyph IDs via the `/DescendantFonts` array → CIDFont → `/CIDToGIDMap` +4. **Fallback** — glyph name → Adobe Glyph List (AGL) lookup; last resort for fonts lacking `/ToUnicode` + +The CMap stream grammar relevant to `/ToUnicode`: +- `begincodespacerange` / `endcodespacerange` — defines valid code ranges +- `beginbfchar n` / `endbfchar` — n mappings of ` ` +- `beginbfrange n` / `endbfrange` — range mappings: ` ` (sequential) or ` [str0 str1 ...]` (array form) + +Destination strings are UTF-16BE byte sequences. A single src code mapping to a multi-char destination (e.g., ligature) is valid and must be handled. + +--- + +## 8. 
Key Dictionary Keys Summary + +| Dict | Key | Type | Notes | +|------|-----|------|-------| +| Page | `/Contents` | stream or array | Content streams | +| Page | `/Resources` | dict | Fonts, XObjects, Properties, ColorSpaces | +| Resources | `/Font` | dict | Font name → font dict | +| Resources | `/Properties` | dict | Tag name → property dict (for BDC) | +| Font | `/Type /Font` | name | Always `/Font` | +| Font | `/Subtype` | name | `/Type1`, `/TrueType`, `/Type0`, `/Type3`, `/CIDFontType0`, `/CIDFontType2` | +| Font | `/BaseFont` | name | PostScript font name | +| Font | `/Encoding` | name or dict | Character encoding | +| Font | `/ToUnicode` | stream | CMap stream for Unicode mapping | +| Font | `/Widths` | array | Glyph widths for simple fonts | +| Font | `/FirstChar`, `/LastChar` | integer | Range for `/Widths` | +| Type0 Font | `/DescendantFonts` | array | One-element array of CIDFont dict | +| CIDFont | `/DW` | integer | Default width (default: 1000) | +| CIDFont | `/W` | array | Individual/range width overrides | +| Catalog | `/MarkInfo` | dict | `/Marked true` for tagged PDF | +| Catalog | `/StructTreeRoot` | dict | Structure tree root | +| StructTreeRoot | `/ParentTree` | number tree | MCID → structure element | +| ObjStm | `/N` | integer | Number of compressed objects | +| ObjStm | `/First` | integer | Offset of first object body | diff --git a/docs/research/tagged-pdf-structure-and-reading-order.md b/docs/research/tagged-pdf-structure-and-reading-order.md new file mode 100644 index 0000000..6104c63 --- /dev/null +++ b/docs/research/tagged-pdf-structure-and-reading-order.md @@ -0,0 +1,253 @@ +# Tagged PDF Structure and Reading Order + +## 1. Tagged PDF Overview + +A "tagged" PDF is one that carries a logical structure tree alongside the visual content stream. The structure tree expresses the document's semantic organization — headings, paragraphs, lists, tables — independently of the glyph positions on the page. 
Tagging is declared in the document catalog via the **MarkInfo** dictionary (ISO 32000-2 §14.7.2): + +``` +MarkInfo << /Marked true >> +``` + +`/Marked true` asserts that every piece of real content is covered by a marked-content sequence and that a StructTreeRoot exists. Two companion keys are `/UserProperties` (Boolean, whether structure attributes carry user-defined properties) and `/Suspects` (Boolean; `true` warns that the tagging may be unreliable). + +### Standards that mandate tagging + +- **PDF/UA-1 (ISO 14289-1:2014)** — full tagging mandatory; every real content item must be a tagged marked-content sequence or an artifact; reading order must be encoded in the structure tree; ActualText, Alt, and Lang must be present where applicable. +- **PDF/UA-2 (ISO 14289-2, based on PDF 2.0)** — same accessibility intent, tightened to the PDF 2.0 object model. +- **PDF/A-1a (ISO 19005-1)** — Level A conformance requires tagging; Level B does not. +- **PDF/A-2a / PDF/A-3a** — Level A conformance of their respective parts likewise requires tagging. + +### Authoring tool quality + +| Tool | Tagging quality | +|---|---| +| Adobe InDesign (Articles panel configured) | High — structure order matches Articles panel order; artifact marking reliable | +| Microsoft Word (Save As PDF, modern versions) | Moderate — headings and paragraphs tagged; tables usually correct; images sometimes lack Alt; complex layouts may produce wrong reading order | +| LibreOffice Writer (Export as PDF/UA) | Moderate-to-good since 7.x; improving but table tagging sometimes produces extra empty cells | +| Adobe Acrobat Accessibility Checker / Make Accessible | Variable — uses heuristics on existing layout; common source of misordered structure trees | +| LaTeX (tagpdf package + pdflatex/lualatex) | Improving; math tagging still maturing | +| Programmatic (iText, Apache PDFBox, reportlab) | Depends on developer; often minimal tagging | + +--- + +## 2. 
Structure Tree + +The structure tree is rooted at the **StructTreeRoot** object, referenced from the document catalog as `/StructTreeRoot`. Key entries on StructTreeRoot (ISO 32000-2 §14.7.4): + +| Key | Type | Meaning | +|---|---|---| +| `K` | array or dict | Immediate children (StructElem objects) | +| `IDTree` | name tree | Maps element IDs to StructElem objects | +| `ParentTree` | number tree | Maps MCID integers to the StructElem that contains them (see §3) | +| `ParentTreeNextKey` | integer | Next available key for ParentTree | +| `RoleMap` | dictionary | Maps non-standard structure types to standard ones | +| `ClassMap` | dictionary | Named attribute class definitions | + +### StructElem dictionary + +Each logical element is a StructElem dictionary (§14.7.5): + +| Key | Type | Meaning | +|---|---|---| +| `S` | name | Structure type (required) | +| `P` | indirect ref | Parent StructElem or StructTreeRoot (required) | +| `K` | various | Kids: a StructElem, MCID integer, marked-content ref dict, object ref dict, or array of these | +| `ID` | byte string | Optional unique identifier | +| `Pg` | indirect ref | Page object where the element's content lives (can be inherited) | +| `A` | dict or array | Attributes | +| `C` | name or array | Attribute class names | +| `R` | integer | Revision number | +| `T` | text string | Title | +| `Lang` | text string | BCP 47 language tag (overrides document-level Lang) | +| `Alt` | text string | Alternative text (figures, formulas) | +| `ActualText` | text string | Overrides extracted glyphs for this element | +| `E` | text string | Expansion of an abbreviation | + +### Standard structure types (ISO 32000-2 §14.8.4) + +**Grouping elements** (contain other elements, no direct content): +`Document`, `DocumentFragment`, `Part`, `Sect`, `Div`, `Aside`, `NonStruct`, `Private`, `TOC`, `TOCI` + +**Block-level elements**: +`P` (paragraph), `H` (generic heading), `H1`–`H6` (leveled headings), `Title`, `FENote`, `Sub` + +**List 
elements**: +`L` (list), `LI` (list item), `Lbl` (label/bullet), `LBody` (body of item) + +**Table elements**: +`Table`, `TR` (row), `TH` (header cell), `TD` (data cell), `THead`, `TBody`, `TFoot` + +**Inline elements**: +`Span`, `Em`, `Strong`, `Link`, `Annot`, `Form`, `Ruby`, `RB`, `RT`, `RP`, `Warichu`, `WT`, `WP` + +**Illustration/media**: +`Figure`, `Formula`, `Caption` + +The `RoleMap` on StructTreeRoot maps non-standard type names to the nearest standard type, allowing extraction code to normalize custom types from authoring tools without special-casing each tool. + +--- + +## 3. Marked Content and MCIDs + +In a content stream, a **marked-content sequence** wraps real content between `BMC`/`BDC` and `EMC` operators. Tagged content uses `BDC` with a property list that includes `/MCID`: + +``` +/P <> BDC + BT ... (text operators) ... ET +EMC +``` + +The MCID is a non-negative integer unique within the page's content stream (and XObject content streams referenced from that page). MCIDs are the bridge from visual content back to the structure tree. + +### Parent tree + +The **ParentTree** (a number tree on StructTreeRoot) maps: +- For standard tagged content: `MCID → StructElem` that directly contains it +- For XObject content streams: the key is the XObject's structural parent key + +To walk from a rendered text run to its logical element: + +1. Identify the page object. +2. Find the MCID from the `BDC` property list in the content stream. +3. Look up the page's **StructParents** integer in the page dictionary. +4. Index into ParentTree at `StructParents` to find the array of parent StructElems for that page. The array index is the MCID itself, giving the direct parent StructElem. +5. Walk up via `P` links to the root to reconstruct ancestry. 
+ +A StructElem's `K` array can contain: +- An integer (MCID on the same page as the element's `Pg`) +- A **marked-content reference dictionary** `< /MCID n>>` — used when content is on a different page than the element's own `Pg` (multi-page elements such as a paragraph split across pages) +- An **object reference dictionary** `<>>` — links to an annotation or XObject + +Multi-page StructElems (e.g., a `Table` spanning two pages) use MCR dicts with explicit `/Pg` entries for each page, so extraction must collect MCIDs page-by-page and union them under the single logical element. + +--- + +## 4. Reading Order from Structure + +The order of children in a StructElem's `K` array encodes **logical reading order** independent of glyph x/y coordinates. A conforming extractor should: + +1. Walk the structure tree depth-first in `K` array order. +2. At each leaf MCID, retrieve the text from the corresponding marked-content sequence. +3. Concatenate text in tree traversal order, inserting whitespace at element boundaries per the element type (block elements → newline; inline elements → preserve or single space as context demands). + +### Common misordering problems + +Auto-taggers (Acrobat's "Make Accessible", Google Docs export) frequently produce structure trees whose element order mirrors the **content stream order** rather than true reading order. In multi-column PDFs, this can interleave column 1 and column 2 text at the paragraph level. + +Detection heuristics: +- Compare bounding boxes of consecutive sibling `P` elements: if the x-origin of element n+1 is dramatically less than that of element n while y-coordinates have not advanced, the two elements are probably in separate columns and the tree order is suspect. +- Check `/Suspects true` in MarkInfo — this is the authoring tool's own admission of uncertainty. 
+- Count `P` elements whose bounding boxes overlap horizontally but are separated vertically by more than one line-height; a high count signals column-interleaved tagging. + +When misordering is detected, fall back to spatial reading-order reconstruction (§5) while still using the structure tree for semantic type labeling (headings, lists, tables). + +--- + +## 5. Reading Order Without Structure + +When `/Marked` is absent or `false`, or when `/Suspects true` is set and validation fails, reading order must be inferred from glyph geometry. + +### Spatial preprocessing + +Collect all text objects from the content stream with their bounding boxes (derived from the text matrix, font metrics, and glyph widths). Group glyphs into **text runs** sharing the same font, size, and baseline, then cluster runs into **lines** by overlapping y-ranges within a vertical tolerance (~0.5× line height). + +### Column detection: x-gap analysis + +Project all text bounding-box x-extents onto the x-axis. Find gaps (ranges of x with no text) that span at least ~90% of the page height. Each gap boundary is a candidate **column gutter**. Sort columns left-to-right; sort lines within each column top-to-bottom. + +This works for simple two- or three-column layouts but fails for mixed layouts (one-column intro, then two columns below). + +### Recursive XY-cut (Ha et al. 1995, adapted for PDF) + +1. Given a rectangular region containing a set of text bounding boxes, project onto x and y axes. +2. Find the widest gap on the **dominant axis** (try y first — a full-width horizontal gap separating header from body scores higher than a narrow x gap). +3. Split the region at the gap into two sub-regions; recurse. +4. Base case: no gap exceeds a minimum threshold (e.g., one em-width on x, one line-height on y). +5. The recursion tree defines a binary partition; perform an in-order traversal to recover reading order. + +XY-cut handles mixed-column pages well. 
Key parameters: minimum gap width (x-cut: typically 1–2 em; y-cut: typically 0.5–1 line-height), and a maximum depth to prevent over-segmentation of slightly misaligned text. + +### Sidebars, footnotes, headers/footers + +Heuristic classification before running XY-cut: + +- **Headers/footers**: text regions whose y-centroid is within 10% of page height from the top or bottom edge, containing a small number of runs (< 3 lines). Suppress from main flow; emit separately. +- **Footnotes**: text at the bottom of the page body (above footer zone), with smaller font size than body text and often preceded by a superscript numeral. Detected by size delta > 20% relative to modal body font size. +- **Sidebars / pull quotes**: isolated text regions (large whitespace moat on all sides) with x-range contained within the column rather than spanning it. XY-cut naturally isolates these as leaf nodes; they can be reclassified by checking if any fragment overlaps the main text column's x-range. + +### Overlapping text spans + +Overlapping bounding boxes occur with drop caps, watermarks, or decorative text placed over body text. Resolution: + +- If one span has a fill color with near-zero opacity or is rendered in `Tr 3` (invisible), discard it. +- If z-order (content stream sequence) places one span significantly before another and they share x/y overlap, keep the later one (compositing intent is that it replaces the earlier). +- Otherwise keep both and emit the later content-stream span first. + +--- + +## 6. Artifacts + +PDF distinguishes **real content** from **artifacts**: pagination decorations that should not be extracted as document text. In tagged PDFs, artifacts are marked with `/Artifact` instead of a structure type (ISO 32000-2 §14.8.2.2): + +``` +/Artifact <> BMC + BT ... 
ET +EMC +``` + +Artifact types: `/Pagination` (headers, footers, page numbers, folios), `/Layout` (column rules, background decorations, gutters), `/Page` (cut marks, color bars in press PDFs), `/Background`. + +Pagination subtypes: `/Header`, `/Footer`, `/Watermark`. + +An extractor must skip all marked-content sequences whose outermost `BDC` tag is `/Artifact`. In untagged PDFs, artifact detection is heuristic: + +- **Page numbers**: a lone integer or "Page N of M" string in the header/footer zone with no semantic relationship to surrounding text. +- **Running headers/footers**: identical or near-identical text appearing at the same y-position across multiple pages. Compare across ≥3 consecutive pages; if the text edit-distance is < 20% (allowing page-number substitution), classify as artifact. +- **Decorative rules and backgrounds**: non-text content (paths, images) in header/footer zones — always suppress. +- **Watermarks**: large text at low opacity or with a `Tr` value ≥ 4, centered on the page. + +--- + +## 7. PDF/UA Attributes Relevant to Extraction + +PDF/UA-1 (ISO 14289-1) mandates several StructElem attributes that directly alter what text an extractor should produce. + +### ActualText (§14.9.4 of ISO 32000-2) + +Present on any StructElem or marked-content property list. When set, it **completely replaces** the visual glyph sequence for extraction purposes. Use cases: ligatures rendered as single glyphs but representing multiple characters (e.g., the `fi` glyph should yield "fi"); decorative fonts where glyph names are unreliable; redacted text replaced with a placeholder; mathematical operators. + +Extraction rule: if ActualText is present on a StructElem, output ActualText for the entire element subtree and do not recurse into child MCIDs. If ActualText is on an MCR/BDC property list, it overrides only that marked-content sequence. + +### Alt (§14.9.3) + +The alternative text attribute on `Figure`, `Formula`, and other non-text elements. 
An extractor producing plain text should emit Alt as a bracketed description or inline alt-text marker. PDF/UA-1 clause 7.3 requires Alt on every Figure that conveys information. An absent Alt on a Figure is a conformance violation; the extractor should emit a warning and produce a placeholder (e.g., `[Figure]`). + +### E (expansion, §14.9.5) + +Present on elements with structure type `Span` (or any inline element) to expand abbreviations. When E is present, the extractor should substitute the expansion for the visible abbreviation in the output text stream. Example: a `Span` with visible text "PDF/UA" and `E = "Portable Document Format / Universal Accessibility"`. + +### Lang (§14.9.2) + +BCP 47 language tag, inheritable from parent elements and ultimately from the document catalog's `/Lang` entry. Lang does not alter the extracted text but is essential metadata for downstream NLP (tokenization, stemming, OCR post-correction). Extraction should propagate Lang from the nearest ancestor that declares it and expose it per-element or per-run in structured output formats. + +### Attribute inheritance and the attribute object + +Attributes may be stored inline on the StructElem (`/A` key) or via named classes (`/C` key referencing ClassMap). When multiple attributes apply (inline + class), inline values take precedence over class values; class values are applied in array order. ActualText and Alt are not inheritable (they apply to exactly the element on which they appear, not descendants), while Lang is inheritable. + +--- + +## Summary: Extraction Decision Tree + +``` +Has StructTreeRoot? +├─ Yes → /Marked true and /Suspects false? +│ ├─ Yes → Walk structure tree in K-array order; apply ActualText, Alt, E, Lang; +│ │ skip /Artifact sequences. +│ └─ No → Validate structure order (spatial consistency check); +│ if order is correct → use structure tree; +│ if disordered → use spatial algorithm, annotate with structure types. 
+└─ No → Full spatial pipeline: heuristic artifact suppression, XY-cut column + detection, line clustering, reading order by column-then-top-to-bottom. +``` + +The structure tree, when trustworthy, yields semantically richer and more reliably ordered output than any spatial algorithm can. Spatial methods are the fallback for legacy, scanned, or poorly-tagged documents, but they remain essential because a significant fraction of PDFs in the wild are untagged or carry unreliable tags.