<!--
Repository viewer residue captured with this page (commit message and file statistics):

- Fixed missing fields in BlockJson, SpanJson, ExtractionOptions initializations
- Added feature gates to ocr_integration tests for conditional compilation
- Fixed McpServerState::new calls to include audit writer argument
- Fixed CCITTFaxDecoder::decode calls to use instance method
- Fixed type casts for ObjRef::new calls
- Fixed serde_json::Value method calls (is_some -> !is_null)
- Fixed ProfileType test feature gates
- Worked around lifetime issues in schema roundtrip tests

These changes fix numerous compilation errors that were blocking the codebase from building.
The main library and tests now compile successfully.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

1540 lines, 93 KiB, HTML
-->
<!DOCTYPE HTML>
<html lang="en" class="light sidebar-visible" dir="ltr">
<head>
<!-- Book generated using mdBook -->
<meta charset="UTF-8">
<title>pdftract User Documentation</title>
<meta name="robots" content="noindex">

<!-- Custom HTML head -->

<meta name="description" content="">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#ffffff">

<link rel="icon" href="favicon-de23e50b.svg">
<link rel="shortcut icon" href="favicon-8114d1fc.png">
<link rel="stylesheet" href="css/variables-8adf115d.css">
<link rel="stylesheet" href="css/general-2459343d.css">
<link rel="stylesheet" href="css/chrome-ae938929.css">
<link rel="stylesheet" href="css/print-9e4910d8.css" media="print">

<!-- Fonts -->
<link rel="stylesheet" href="fonts/fonts-9644e21d.css">

<!-- Highlight.js Stylesheets -->
<link rel="stylesheet" id="mdbook-highlight-css" href="highlight-493f70e1.css">
<link rel="stylesheet" id="mdbook-tomorrow-night-css" href="tomorrow-night-4c0ae647.css">
<link rel="stylesheet" id="mdbook-ayu-highlight-css" href="ayu-highlight-3fdfc3ac.css">

<!-- Custom theme stylesheets -->

<!-- Provide site root and default themes to javascript -->
<script>
// Page-level settings exposed as globals for the scripts loaded after this one.
// NOTE(review): the consumers of path_to_root and path_to_searchindex_js are
// not visible in this file; presumably the external mdBook scripts read them.

// Root of the book relative to this page; empty for this page.
const path_to_root = "";
// Theme names used by the theme-selection script further down when the
// reader has no stored preference.
const default_light_theme = "light";
const default_dark_theme = "navy";
// File name of the search index script.
window.path_to_searchindex_js = "searchindex-fc6d8bf8.js";
</script>
<!-- Start loading toc.js asap -->
<script src="toc-d0f907c9.js"></script>
</head>
<body>
<div id="mdbook-help-container">
<div id="mdbook-help-popup">
<h2 class="mdbook-help-title">Keyboard shortcuts</h2>
<div>
<p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
<p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
<p>Press <kbd>?</kbd> to show this help</p>
<p>Press <kbd>Esc</kbd> to hide this help</p>
</div>
</div>
</div>
<div id="mdbook-body-container">
<!-- Work around some values being stored in localStorage wrapped in quotes -->
<script>
// Strip one pair of wrapping double quotes from the stored theme and sidebar
// preferences, rewriting the value in place.
//
// Each key is handled on its own: a missing (null) theme must not stop the
// sidebar value from being repaired. A value shorter than two characters
// cannot be a quoted string and is left alone.
try {
    for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
        const stored = localStorage.getItem(key);
        const isQuoted = stored !== null
            && stored.length >= 2
            && stored.startsWith('"')
            && stored.endsWith('"');
        if (isQuoted) {
            localStorage.setItem(key, stored.slice(1, stored.length - 1));
        }
    }
} catch (e) {
    // Best-effort clean-up: storage access may throw, and the page must still load.
}
</script>

<!-- Set the theme before any content is loaded, prevents flash -->
<script>
// Choose the theme class for <html>: the stored preference when there is one,
// otherwise the light or dark default depending on the prefers-color-scheme
// media query.
const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches ? default_dark_theme : default_light_theme;
let theme;
try { theme = localStorage.getItem('mdbook-theme'); } catch (e) { }
// An empty stored value is treated like a missing one: classList.add('')
// throws, which would abort this script before the "js" class is added.
if (!theme) { theme = default_theme; }
const html = document.documentElement;
// The markup ships with class "light"; swap it for the selected theme.
html.classList.remove('light');
html.classList.add(theme);
// Flags that scripting is available (presumably a hook for the stylesheets).
html.classList.add("js");
</script>

<input type="checkbox" id="mdbook-sidebar-toggle-anchor" class="hidden">

<!-- Hide / unhide sidebar before it is displayed -->
<script>
// Decide the initial sidebar state and mirror it onto the toggle checkbox and
// the <html> class list. `sidebar` stays global: the ARIA script further down
// reads it.
let sidebar = null;
const sidebar_toggle = document.getElementById("mdbook-sidebar-toggle-anchor");
if (document.body.clientWidth >= 1080) {
    // Wide viewport: honour the stored preference, defaulting to visible.
    try { sidebar = localStorage.getItem('mdbook-sidebar'); } catch (e) { }
    sidebar = sidebar || 'visible';
} else {
    // Narrow viewport: always start hidden, whatever was stored.
    sidebar = 'hidden';
    sidebar_toggle.checked = false;
}
if (sidebar === 'visible') {
    sidebar_toggle.checked = true;
} else {
    // The markup ships with "sidebar-visible" on <html>; drop it when hidden.
    html.classList.remove('sidebar-visible');
}
</script>

<nav id="mdbook-sidebar" class="sidebar" aria-label="Table of contents">
<!-- populated by js -->
<mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
<noscript>
<iframe class="sidebar-iframe-outer" src="toc.html" title="Table of contents"></iframe>
</noscript>
<div id="mdbook-sidebar-resize-handle" class="sidebar-resize-handle">
<div class="sidebar-resize-indicator"></div>
</div>
</nav>

<div id="mdbook-page-wrapper" class="page-wrapper">

<div class="page">
<div id="mdbook-menu-bar-hover-placeholder"></div>
<div id="mdbook-menu-bar" class="menu-bar sticky">
<div class="left-buttons">
<label id="mdbook-sidebar-toggle" class="icon-button" for="mdbook-sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="mdbook-sidebar">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M0 96C0 78.3 14.3 64 32 64H416c17.7 0 32 14.3 32 32s-14.3 32-32 32H32C14.3 128 0 113.7 0 96zM0 256c0-17.7 14.3-32 32-32H416c17.7 0 32 14.3 32 32s-14.3 32-32 32H32c-17.7 0-32-14.3-32-32zM448 416c0 17.7-14.3 32-32 32H32c-17.7 0-32-14.3-32-32s14.3-32 32-32H416c17.7 0 32 14.3 32 32z"/></svg></span>
</label>
<button id="mdbook-theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="mdbook-theme-list">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 576 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M371.3 367.1c27.3-3.9 51.9-19.4 67.2-42.9L600.2 74.1c12.6-19.5 9.4-45.3-7.6-61.2S549.7-4.4 531.1 9.6L294.4 187.2c-24 18-38.2 46.1-38.4 76.1L371.3 367.1zm-19.6 25.4l-116-104.4C175.9 290.3 128 339.6 128 400c0 3.9 .2 7.8 .6 11.6c1.8 17.5-10.2 36.4-27.8 36.4H96c-17.7 0-32 14.3-32 32s14.3 32 32 32H240c61.9 0 112-50.1 112-112c0-2.5-.1-5-.2-7.5z"/></svg></span>
</button>
<ul id="mdbook-theme-list" class="theme-popup" aria-label="Themes" role="menu">
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-default_theme">Auto</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-light">Light</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-rust">Rust</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-coal">Coal</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-navy">Navy</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-ayu">Ayu</button></li>
</ul>
<button id="mdbook-search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="mdbook-searchbar">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M416 208c0 45.9-14.9 88.3-40 122.7L502.6 457.4c12.5 12.5 12.5 32.8 0 45.3s-32.8 12.5-45.3 0L330.7 376c-34.4 25.2-76.8 40-122.7 40C93.1 416 0 322.9 0 208S93.1 0 208 0S416 93.1 416 208zM208 352c79.5 0 144-64.5 144-144s-64.5-144-144-144S64 128.5 64 208s64.5 144 144 144z"/></svg></span>
</button>
</div>

<h1 class="menu-title">pdftract User Documentation</h1>

<div class="right-buttons">
<a href="print.html" title="Print this book" aria-label="Print this book">
<span class=fa-svg id="print-button"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M128 0C92.7 0 64 28.7 64 64v96h64V64H354.7L384 93.3V160h64V93.3c0-17-6.7-33.3-18.7-45.3L400 18.7C388 6.7 371.7 0 354.7 0H128zM384 352v32 64H128V384 368 352H384zm64 32h32c17.7 0 32-14.3 32-32V256c0-35.3-28.7-64-64-64H64c-35.3 0-64 28.7-64 64v96c0 17.7 14.3 32 32 32H64v64c0 35.3 28.7 64 64 64H384c35.3 0 64-28.7 64-64V384zm-16-88c-13.3 0-24-10.7-24-24s10.7-24 24-24s24 10.7 24 24s-10.7 24-24 24z"/></svg></span>
</a>
<a href="https://github.com/jedarden/pdftract" title="Git repository" aria-label="Git repository">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 496 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6zm-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3zm44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9zM244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8zM97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1zm-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7zm32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1zm-11.4-14.7c-1.6 1-1.6 3.6 0 5.9 1.6 2.3 4.3 3.3 5.6 2.3 1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2z"/></svg></span>
</a>

</div>
</div>

<div id="mdbook-search-wrapper" class="hidden">
<form id="mdbook-searchbar-outer" class="searchbar-outer">
<div class="search-wrapper">
<input type="search" id="mdbook-searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="mdbook-searchresults-outer" aria-describedby="mdbook-searchresults-header">
<div class="spinner-wrapper">
<span class=fa-svg id="fa-spin"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M304 48c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zm0 416c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zM48 304c26.5 0 48-21.5 48-48s-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48zm464-48c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zM142.9 437c18.7-18.7 18.7-49.1 0-67.9s-49.1-18.7-67.9 0s-18.7 49.1 0 67.9s49.1 18.7 67.9 0zm0-294.2c18.7-18.7 18.7-49.1 0-67.9S93.7 56.2 75 75s-18.7 49.1 0 67.9s49.1 18.7 67.9 0zM369.1 437c18.7 18.7 49.1 18.7 67.9 0s18.7-49.1 0-67.9s-49.1-18.7-67.9 0s-18.7 49.1 0 67.9z"/></svg></span>
</div>
</div>
</form>
<div id="mdbook-searchresults-outer" class="searchresults-outer hidden">
<div id="mdbook-searchresults-header" class="searchresults-header"></div>
<ul id="mdbook-searchresults">
</ul>
</div>
</div>

<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
<script>
// Mirror the initial sidebar state (decided by the earlier sidebar script)
// onto the ARIA attributes of the toggle and the sidebar, and take sidebar
// links out of the tab order while the sidebar is hidden.
// The braces keep the helper constant out of the global scope.
{
    const sidebarVisible = sidebar === 'visible';
    document.getElementById('mdbook-sidebar-toggle').setAttribute('aria-expanded', sidebarVisible);
    document.getElementById('mdbook-sidebar').setAttribute('aria-hidden', !sidebarVisible);
    for (const link of document.querySelectorAll('#mdbook-sidebar a')) {
        link.setAttribute('tabIndex', sidebarVisible ? 0 : -1);
    }
}
</script>

<div id="mdbook-content" class="content">
<main>
<h1 id="introduction"><a class="header" href="#introduction">Introduction</a></h1>
<h2 id="what-pdftract-does"><a class="header" href="#what-pdftract-does">What pdftract Does</a></h2>
<p>pdftract is a PDF text extraction library that gets the hard parts right. Unlike naive PDF parsers that dump text in the order it appears in the PDF file (which is rarely the correct reading order), pdftract understands document layout and recovers the logical structure that humans perceive when reading a page.</p>
<h3 id="core-features"><a class="header" href="#core-features">Core Features</a></h3>
<p><strong>Correct reading order</strong> — Layout regions are segmented and sequenced before text is emitted, handling multi-column pages, sidebars, footnotes, and mixed-layout documents without relying on PDF operator order. pdftract groups text into semantic blocks (headings, paragraphs, lists, tables) and outputs them in the order a human would read.</p>
<p><strong>Font encoding recovery</strong> — When <code>ToUnicode</code> CMaps are absent, wrong, or incomplete (a common problem in PDFs generated by legacy tools), pdftract works through a layered recovery pipeline: glyph name lookup via the Adobe Glyph List, font fingerprinting against known metrics and embedded checksums, and glyph outline shape matching. This means you get readable Unicode text even from broken PDFs.</p>
<p><strong>Structure tree extraction</strong> — PDF/UA and PDF/A documents encode their logical structure (headings, paragraphs, lists, tables, reading order) in a <code>StructTree</code>. pdftract reads this directly when present, producing accurate semantic output at no extra cost. Tagged PDFs yield near-perfect extraction.</p>
<p><strong>Per-page hybrid routing</strong> — Each page is independently classified and routed to the appropriate pipeline: vector text extraction (for pages with embedded fonts), full OCR (for scanned pages), or assisted OCR where vector hints improve raster accuracy. This hybrid approach optimizes for both accuracy and speed.</p>
<p><strong>Structured output with provenance</strong> — The primary output is JSON carrying per-span bounding boxes, font name, size, and confidence score alongside the extracted text, not a flat string dump. You get rich metadata that enables downstream processing: layout analysis, font-aware styling, highlight extraction, and confidence-based filtering.</p>
<h3 id="what-you-can-extract"><a class="header" href="#what-you-can-extract">What You Can Extract</a></h3>
<ul>
<li><strong>Text</strong> — Plain text or structured JSON with per-span provenance</li>
<li><strong>Layout</strong> — Bounding boxes for blocks, lines, and spans</li>
<li><strong>Metadata</strong> — Title, author, creation date, page count, PDF version</li>
<li><strong>Structure</strong> — Headings, paragraphs, lists, tables (when present in the PDF)</li>
<li><strong>Annotations</strong> — Comments, highlights, form fields (Phase 7)</li>
</ul>
<h3 id="what-pdftract-does-not-do"><a class="header" href="#what-pdftract-does-not-do">What pdftract Does Not Do</a></h3>
<p>pdftract is deliberately scoped. The following features are <strong>not</strong> in scope for v1.0.0:</p>
<div class="table-wrapper">
<table>
<thead>
<tr><th>Non-goal</th><th>Alternative</th></tr>
</thead>
<tbody>
<tr><td>PDF authoring or writing</td><td><code>lopdf</code>, <code>pdfium-render</code>, <code>printpdf</code></td></tr>
<tr><td>Full PDF rendering / printing</td><td>PDFium, MuPDF, Poppler</td></tr>
<tr><td>Cryptographic signature validation</td><td><code>openssl smime</code>, dedicated PKI libraries</td></tr>
<tr><td>Translation of extracted text</td><td>LibreTranslate, DeepL, Argos</td></tr>
<tr><td>Summarization</td><td>LLM tools via the MCP server integration</td></tr>
<tr><td>OCR engine training</td><td>Tesseract’s <code>tesstrain</code> tooling</td></tr>
<tr><td>Filling out PDF forms</td><td>Form-filling tools with authoring support</td></tr>
<tr><td>Watermark removal</td><td>Detected and excluded from output, not removed from PDF</td></tr>
<tr><td>Password cracking</td><td><code>pdfcrack</code>, <code>john</code></td></tr>
</tbody>
</table>
</div>
<p>For the full rationale and scope-lock doctrine, see the <a href="../../plan/plan.html#non-goals">Non-Goals section</a> in the project plan.</p>
<h2 id="supported-pdf-features"><a class="header" href="#supported-pdf-features">Supported PDF Features</a></h2>
<p>pdftract supports PDF 1.4 through PDF 2.0, with varying levels of feature coverage:</p>
<ul>
<li><strong>Text extraction</strong> — Full support for Type 1, TrueType, OpenType, and CID-keyed fonts</li>
<li><strong>Compression</strong> — All standard filters (FlateDecode, ASCIIHex, ASCII85, RunLength, CCITT, DCT)</li>
<li><strong>Encryption</strong> — RC4 40-bit, RC4 128-bit, AES-128, AES-256 (password required)</li>
<li><strong>Structure trees</strong> — PDF/UA logical structure reading</li>
<li><strong>Forms</strong> — AcroForm and XFA field extraction (read-only)</li>
<li><strong>Signatures</strong> — Signature metadata extraction (validation not performed)</li>
<li><strong>Attachments</strong> — File attachment extraction</li>
<li><strong>Articles</strong> — Thread extraction for logical reading flows</li>
</ul>
<p>See the <a href="advanced">Advanced Topics</a> section for deep dives into specific features.</p>
<div style="break-before: page; page-break-before: always;"></div>
<h1 id="installation"><a class="header" href="#installation">Installation</a></h1>
<p>pdftract is distributed as a native binary, a Python package, and a Docker image. Choose the installation method that matches your workflow.</p>
<h2 id="install-via-cargo"><a class="header" href="#install-via-cargo">Install via Cargo</a></h2>
<pre><code class="language-bash">cargo install pdftract
</code></pre>
<p>This installs the <code>pdftract</code> binary in <code>~/.cargo/bin/</code>. Make sure <code>~/.cargo/bin</code> is in your <code>PATH</code>.</p>
<h3 id="pre-built-binaries"><a class="header" href="#pre-built-binaries">Pre-built Binaries</a></h3>
<p>Pre-built binaries are available from <a href="https://github.com/jedarden/pdftract/releases">GitHub Releases</a>. Download the archive for your platform, extract, and place the binary in your <code>PATH</code>.</p>
<h3 id="cargo-binstall"><a class="header" href="#cargo-binstall">Cargo Binstall</a></h3>
<p>For faster installation without compiling from source:</p>
<pre><code class="language-bash">cargo binstall pdftract
</code></pre>
<p>This downloads a pre-built binary from the GitHub Release instead of compiling locally.</p>
<h2 id="install-via-pip"><a class="header" href="#install-via-pip">Install via pip</a></h2>
<p>pdftract is distributed on PyPI as a native Python extension with PyO3 bindings.</p>
<pre><code class="language-bash">pip install pdftract
</code></pre>
<p>The Python package includes the same extraction engine as the CLI, accessible via a Python API. See <a href="#python-sdk">Python SDK</a> for usage.</p>
<h3 id="platform-wheels"><a class="header" href="#platform-wheels">Platform Wheels</a></h3>
<p>Wheels are available for:</p>
<ul>
<li>Linux <code>x86_64</code> (manylinux2014, musllinux)</li>
<li>macOS <code>x86_64</code> and <code>arm64</code></li>
<li>Windows <code>x86_64</code></li>
</ul>
<p>If no wheel is available for your platform, pip will fall back to building from source (requires Rust toolchain).</p>
<h2 id="install-via-homebrew"><a class="header" href="#install-via-homebrew">Install via Homebrew</a></h2>
<p><strong>Note:</strong> The Homebrew formula is deferred to v1.1+. In the meantime, use <code>cargo install pdftract</code> or the Docker image.</p>
<p>See the <a href="../../plan/plan.html#non-goals">Non-Goals section</a> in the project plan for the rationale.</p>
<h2 id="install-via-docker"><a class="header" href="#install-via-docker">Install via Docker</a></h2>
<p>Docker images are available on GitHub Container Registry:</p>
<pre><code class="language-bash">docker pull ghcr.io/jedarden/pdftract:latest
docker run --rm -v "$(pwd)":/work ghcr.io/jedarden/pdftract:latest extract /work/document.pdf
</code></pre>
<h3 id="image-variants"><a class="header" href="#image-variants">Image Variants</a></h3>
<div class="table-wrapper">
<table>
<thead>
<tr><th>Tag</th><th>Description</th></tr>
</thead>
<tbody>
<tr><td><code>latest</code></td><td>Default features (vector extraction, basic OCR)</td></tr>
<tr><td><code>ocr</code></td><td>Includes Tesseract for full OCR support</td></tr>
<tr><td><code>full</code></td><td>All features including PDFium for rasterization</td></tr>
</tbody>
</table>
</div>
<p>Multi-arch manifests support <code>amd64</code> and <code>arm64</code> platforms.</p>
<h2 id="platform-support"><a class="header" href="#platform-support">Platform Support</a></h2>
<h3 id="supported-platforms"><a class="header" href="#supported-platforms">Supported Platforms</a></h3>
<div class="table-wrapper">
<table>
<thead>
<tr><th>Platform</th><th>CI Status</th><th>Notes</th></tr>
</thead>
<tbody>
<tr><td>Linux <code>x86_64</code> (glibc)</td><td>Fully CI-tested</td><td>Primary development platform</td></tr>
<tr><td>Linux <code>x86_64</code> (musl)</td><td>Fully CI-tested</td><td>Alpine-compatible</td></tr>
<tr><td>Linux <code>arm64</code> (glibc)</td><td>Fully CI-tested</td><td>ARM64 servers (e.g., Graviton)</td></tr>
<tr><td>Linux <code>arm64</code> (musl)</td><td>Fully CI-tested</td><td>Alpine ARM64</td></tr>
<tr><td>macOS <code>x86_64</code></td><td>Build-tested</td><td>See caveat below</td></tr>
<tr><td>macOS <code>arm64</code></td><td>Build-tested</td><td>See caveat below</td></tr>
<tr><td>Windows <code>x86_64</code></td><td>Build-tested</td><td>See caveat below</td></tr>
</tbody>
</table>
</div>
<h3 id="cross-platform-test-limitation-ku-12"><a class="header" href="#cross-platform-test-limitation-ku-12">Cross-Platform Test Limitation (KU-12)</a></h3>
<blockquote>
<p><strong>Linux is fully CI-tested; macOS and Windows are build-tested and manually smoke-tested per release.</strong></p>
</blockquote>
<p>Per project architecture decision ADR-009, the CI pipeline runs on Linux-only infrastructure (<code>iad-ci</code>). macOS and Windows binaries are <strong>built</strong> via cross-compilation but are never <strong>executed</strong> in automated CI. This is acknowledged as Known Unknown KU-12 with the following mitigation:</p>
<ul>
<li>A manual smoke-test runbook is executed by the release lead before each milestone against at least one physical macOS machine and one Windows VM</li>
<li>User bug reports for platform-specific issues are acknowledged within 48 hours and addressed in the next patch release</li>
<li>No claim of “tested on macOS/Windows” appears in CI status badges</li>
</ul>
<p>If you encounter a platform-specific issue on macOS or Windows, please file a bug report. The project is committed to fixing platform bugs promptly.</p>
<h3 id="minimum-rust-version"><a class="header" href="#minimum-rust-version">Minimum Rust Version</a></h3>
<p>If building from source, pdftract requires Rust 1.78 or later. The MSRV is pinned in <code>Cargo.toml</code> and tested on every PR.</p>
<h2 id="verifying-installation"><a class="header" href="#verifying-installation">Verifying Installation</a></h2>
<p>Run the following command to verify your installation:</p>
<pre><code class="language-bash">pdftract --version
</code></pre>
<p>You should see output like:</p>
<pre><code>pdftract 0.1.0
</code></pre>
<p>For the Python package:</p>
<pre><code class="language-bash">python -c "import pdftract; print(pdftract.__version__)"
</code></pre>
<h3 id="environment-health-check"><a class="header" href="#environment-health-check">Environment Health Check</a></h3>
<p>After installation, verify your environment is properly configured for pdftract:</p>
<pre><code class="language-bash">pdftract doctor
</code></pre>
<p>This validates that all OS-level dependencies (Tesseract, Leptonica, libtiff, etc.) are installed and correctly configured. See the <a href="../../operations/manual-platform-smoke.html">Operations Runbook</a> for detailed troubleshooting of each check.</p>
<h2 id="next-steps"><a class="header" href="#next-steps">Next Steps</a></h2>
<p>Once installed, proceed to the <a href="#quickstart">Quickstart</a> for a five-minute walkthrough of pdftract’s core features.</p>
<div style="break-before: page; page-break-before: always;"></div>
<h1 id="quickstart"><a class="header" href="#quickstart">Quickstart</a></h1>
<p>This five-minute walkthrough covers the core pdftract workflow: extract text from a PDF, inspect the structured JSON output, and try profile-based extraction.</p>
<h2 id="five-minute-walkthrough"><a class="header" href="#five-minute-walkthrough">Five-Minute Walkthrough</a></h2>
<h3 id="prerequisites"><a class="header" href="#prerequisites">Prerequisites</a></h3>
<ul>
<li>pdftract installed (see <a href="#installation">Installation</a>)</li>
<li>A PDF file to extract (any PDF will do)</li>
</ul>
<p>If you don’t have a PDF handy, you can use the sample fixtures from the pdftract repository:</p>
<pre><code class="language-bash">git clone https://github.com/jedarden/pdftract.git
cd pdftract
</code></pre>
<h3 id="verify-your-environment"><a class="header" href="#verify-your-environment">Verify Your Environment</a></h3>
<p>Before extracting, verify your environment is properly configured:</p>
<pre><code class="language-bash">pdftract doctor
</code></pre>
<p>Expected output:</p>
<pre><code>Check                Status   Detail
─────────────────────────────────────────────
pdftract binary      OK       0.1.0 (git: abc1234)
tesseract install    OK       v5.3.0
...
</code></pre>
<p>If any check shows FAIL, see the <a href="../../operations/manual-platform-smoke.html#troubleshooting">Operations Runbook</a> for resolution steps.</p>
<h3 id="extract-your-first-pdf"><a class="header" href="#extract-your-first-pdf">Extract Your First PDF</a></h3>
<p>The simplest extraction outputs plain text to stdout:</p>
<pre><code class="language-bash">pdftract extract path/to/document.pdf --text
</code></pre>
<p>For structured JSON output (the default format), written to a file:</p>
<pre><code class="language-bash">pdftract extract path/to/document.pdf --output result.json
</code></pre>
<p>Or view JSON directly in your terminal (pipe to <code>jq</code> for pretty-printing):</p>
<pre><code class="language-bash">pdftract extract path/to/document.pdf | jq .
</code></pre>
<h3 id="inspect-the-output"><a class="header" href="#inspect-the-output">Inspect the Output</a></h3>
<p>The JSON output contains:</p>
<ul>
<li><strong><code>pages</code></strong> — Array of page objects, each with <code>blocks</code> and <code>spans</code></li>
<li><strong><code>blocks</code></strong> — Semantic elements (headings, paragraphs, lists) with reading order</li>
<li><strong><code>spans</code></strong> — Text fragments with bounding boxes, font metadata, and confidence scores</li>
<li><strong><code>metadata</code></strong> — Document title, author, page count, PDF version</li>
</ul>
<p>Example:</p>
<pre><code class="language-json">{
  "pages": [
    {
      "page": 1,
      "width": 612,
      "height": 792,
      "blocks": [
        {
          "kind": "heading",
          "text": "Introduction",
          "bbox": [72, 680, 400, 700],
          "level": 1
        },
        {
          "kind": "paragraph",
          "text": "This is the first paragraph...",
          "bbox": [72, 640, 540, 670]
        }
      ],
      "spans": [
        {
          "text": "Introduction",
          "bbox": [72, 680, 400, 700],
          "font": "Times-Bold",
          "size": 14.0,
          "confidence": 0.99
        }
      ]
    }
  ],
  "metadata": {
    "title": "Sample Document",
    "author": "John Doe",
    "page_count": 1,
    "pdf_version": "1.4"
  }
}
</code></pre>
<h3 id="try-auto-profile-mode"><a class="header" href="#try-auto-profile-mode">Try Auto-Profile Mode</a></h3>
<p>pdftract includes built-in profiles for common document types (invoices, receipts, contracts, etc.). Use <code>--auto</code> to automatically detect the profile:</p>
<pre><code class="language-bash">pdftract extract invoice.pdf --auto
</code></pre>
<p>The auto-detected profile is logged to stderr:</p>
<pre><code>[INFO] Detected profile: invoice
</code></pre>
<p>Profiles optimize extraction for specific document layouts:</p>
<ul>
<li><strong>invoice</strong> — Extract line items, totals, vendor info</li>
<li><strong>receipt</strong> — Extract merchant, date, line items, tax, total</li>
<li><strong>contract</strong> — Extract parties, effective date, clauses</li>
<li><strong>bank_statement</strong> — Extract account info, statement period, transactions</li>
</ul>
<p>See <a href="#available-profiles">Profiles</a> for the full list.</p>
<h3 id="batch-processing"><a class="header" href="#batch-processing">Batch Processing</a></h3>
<p>To extract multiple PDFs in a folder:</p>
<pre><code class="language-bash">pdftract extract *.pdf --output-dir results/
</code></pre>
<p>Each PDF produces a corresponding JSON file in <code>results/</code>:</p>
<pre><code>results/
  invoice1.pdf.json
  invoice2.pdf.json
  receipt.pdf.json
</code></pre>
<p>To search for text across every PDF in a folder, including its subfolders, use the <code>grep</code> command:</p>
<pre><code class="language-bash">pdftract grep "search term" /path/to/folder
</code></pre>
<p>This outputs matching filenames and page numbers:</p>
<pre><code>invoice.pdf:3: "search term" found on page 3
receipt.pdf:1: "search term" found on page 1
</code></pre>
<h2 id="common-options"><a class="header" href="#common-options">Common Options</a></h2>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Option</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>--output FILE</code></td><td>Write output to file instead of stdout</td></tr>
|
||
<tr><td><code>--text</code></td><td>Output plain text instead of JSON</td></tr>
|
||
<tr><td><code>--output-dir DIR</code></td><td>Directory for batch output when extracting multiple files (e.g. with a <code>*.pdf</code> glob)</td></tr>
|
||
<tr><td><code>--auto</code></td><td>Auto-detect and apply document profile</td></tr>
|
||
<tr><td><code>--profile NAME</code></td><td>Use specific profile (skip auto-detection)</td></tr>
|
||
<tr><td><code>--password PASS</code></td><td>Password for encrypted PDFs</td></tr>
|
||
<tr><td><code>--pages N-M</code></td><td>Extract specific page range</td></tr>
|
||
<tr><td><code>--ocr</code></td><td>Force OCR mode for all pages</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<p>See <a href="#cli-reference">CLI Reference</a> for complete command documentation.</p>
|
||
<h2 id="whats-next"><a class="header" href="#whats-next">What’s Next?</a></h2>
|
||
<ul>
|
||
<li>Explore the <a href="#cli-reference">CLI Reference</a> for advanced options</li>
|
||
<li>Read <a href="#json-schema-reference">JSON Schema Reference</a> for output format details</li>
|
||
<li>Check <a href="#profiles">Profiles</a> for document-type-specific extraction</li>
|
||
<li>Try the <a href="#python-sdk">Python SDK</a> for programmatic access</li>
|
||
</ul>
|
||
<h2 id="troubleshooting"><a class="header" href="#troubleshooting">Troubleshooting</a></h2>
|
||
<p><strong>Extraction fails with “unsupported encryption”</strong></p>
|
||
<p>The PDF is encrypted with a password. Use <code>--password</code>:</p>
|
||
<pre><code class="language-bash">pdftract extract encrypted.pdf --password yourpassword
|
||
</code></pre>
|
||
<p><strong>Output has wrong reading order</strong></p>
|
||
<p>Some PDFs have malformed internal structure. Try <code>--auto</code> to enable profile-based layout recovery, or use <code>--ocr</code> to force OCR-based extraction.</p>
|
||
<p><strong>Poor accuracy on scanned documents</strong></p>
|
||
<p>Ensure the OCR features are enabled. The Docker <code>:ocr</code> and <code>:full</code> images include Tesseract. If building from source, enable the <code>ocr</code> feature:</p>
|
||
<pre><code class="language-bash">cargo install pdftract --features ocr
|
||
</code></pre>
|
||
<p>For more help, see <a href="troubleshooting">Troubleshooting</a>.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="cli-reference"><a class="header" href="#cli-reference">CLI Reference</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This section is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Complete command-line interface documentation.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="global-options"><a class="header" href="#global-options">Global Options</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>See the main pdftract repository for CLI usage details.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="extract"><a class="header" href="#extract">extract</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extract text and structure from a PDF file.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="serve"><a class="header" href="#serve">serve</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Start an HTTP server for PDF extraction.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="grep"><a class="header" href="#grep">grep</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Search for text across multiple PDF files.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="inspect"><a class="header" href="#inspect">inspect</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Inspect PDF structure and metadata.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="mcp"><a class="header" href="#mcp">mcp</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Run pdftract as an MCP (Model Context Protocol) server.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="json-schema-reference"><a class="header" href="#json-schema-reference">JSON Schema Reference</a></h1>
|
||
<blockquote>
|
||
<p><strong>Schema version:</strong> 1.0<br><strong>Schema URL:</strong> https://pdftract.com/schema/v1.0/pdftract.schema.json<br><strong>Source of truth:</strong> <code>docs/schema/v1.0/pdftract.schema.json</code></p>
|
||
</blockquote>
|
||
<p>This page provides a human-readable rendering of the pdftract output schema. The JSON Schema is the authoritative definition (per <a href="../plan/plan.html">INV-11</a>), validated in CI for all test fixtures.</p>
|
||
<h2 id="top-level-structure"><a class="header" href="#top-level-structure">Top-Level Structure</a></h2>
|
||
<pre><code class="language-json">{
|
||
"fingerprint": "pdftract-v1:a7f3c8d9...",
|
||
"pages": [...],
|
||
"metadata": {...},
|
||
"signatures": [...],
|
||
"form_fields": [...]
|
||
}
|
||
</code></pre>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Field</th><th>Type</th><th>Required</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>fingerprint</code></td><td>string</td><td>Yes</td><td>Phase 1.7 fingerprint of the source PDF. Format: <code>"pdftract-v1:" + hex(SHA-256)</code>. Used for receipt verification.</td></tr>
|
||
<tr><td><code>pages</code></td><td>array</td><td>Yes</td><td>Extracted pages, each containing spans and blocks.</td></tr>
|
||
<tr><td><code>metadata</code></td><td>object</td><td>Yes</td><td>ExtractionMetadata object with page count, diagnostics, receipts mode, etc.</td></tr>
|
||
<tr><td><code>signatures</code></td><td>array</td><td>Yes</td><td>Digital signatures extracted from the document. Empty when no signature fields exist.</td></tr>
|
||
<tr><td><code>form_fields</code></td><td>array</td><td>Yes</td><td>Interactive form fields from AcroForm/XFA. Empty when no form fields exist.</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h2 id="document-metadata"><a class="header" href="#document-metadata">Document Metadata</a></h2>
|
||
<p>The <code>metadata</code> object contains extraction-level information:</p>
|
||
<pre><code class="language-json">{
|
||
"page_count": 10,
|
||
"span_count": 842,
|
||
"block_count": 156,
|
||
"error_count": 0,
|
||
"receipts_mode": "off",
|
||
"diagnostics": ["WARN: page 3: low coverage (54%) - possible scanned content"],
|
||
"cache_status": "hit",
|
||
"cache_age_seconds": 1240,
|
||
"reading_order_algorithm": "robust-topo"
|
||
}
|
||
</code></pre>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Field</th><th>Type</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>page_count</code></td><td>integer</td><td>Total number of pages in the document.</td></tr>
|
||
<tr><td><code>span_count</code></td><td>integer</td><td>Number of spans extracted across all pages.</td></tr>
|
||
<tr><td><code>block_count</code></td><td>integer</td><td>Number of blocks extracted across all pages.</td></tr>
|
||
<tr><td><code>error_count</code></td><td>integer</td><td>Number of pages that failed to extract.</td></tr>
|
||
<tr><td><code>receipts_mode</code></td><td>string</td><td>Receipts mode used: <code>"off"</code>, <code>"lite"</code>, or <code>"svg"</code>.</td></tr>
|
||
<tr><td><code>diagnostics</code></td><td>array</td><td>Diagnostic messages emitted during extraction (coverage warnings, etc.).</td></tr>
|
||
<tr><td><code>cache_status</code></td><td>string/null</td><td>Cache status: <code>"hit"</code>, <code>"miss"</code>, or <code>"skipped"</code>.</td></tr>
|
||
<tr><td><code>cache_age_seconds</code></td><td>integer/null</td><td>Cache entry age in seconds (only present when <code>cache_status == "hit"</code>).</td></tr>
|
||
<tr><td><code>reading_order_algorithm</code></td><td>string/null</td><td>Reading order algorithm used for this extraction.</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h2 id="page-result"><a class="header" href="#page-result">Page Result</a></h2>
|
||
<p>Each page in the <code>pages</code> array contains:</p>
|
||
<pre><code class="language-json">{
|
||
"index": 0,
|
||
"spans": [...],
|
||
"blocks": [...],
|
||
"tables": [...],
|
||
"error": null
|
||
}
|
||
</code></pre>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Field</th><th>Type</th><th>Required</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>index</code></td><td>integer</td><td>Yes</td><td>Zero-based page index. This is the canonical identifier for programmatic use.</td></tr>
|
||
<tr><td><code>spans</code></td><td>array</td><td>Yes</td><td>Extracted spans (text fragments with consistent styling).</td></tr>
|
||
<tr><td><code>blocks</code></td><td>array</td><td>Yes</td><td>Extracted blocks (semantic units like paragraphs, headings).</td></tr>
|
||
<tr><td><code>tables</code></td><td>array</td><td>Yes</td><td>Extracted tables with cell-level structure. Empty when no tables detected.</td></tr>
|
||
<tr><td><code>error</code></td><td>string/null</td><td>Yes</td><td>Error message if extraction failed for this page; <code>null</code> when the page was extracted successfully.</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h3 id="span"><a class="header" href="#span">Span</a></h3>
|
||
<p>A span is the smallest unit of extracted text, representing a contiguous run of text with consistent font and styling.</p>
|
||
<pre><code class="language-json">{
|
||
"text": "The quick brown fox",
|
||
"bbox": [72.0, 612.0, 245.5, 624.3],
|
||
"font": "Helvetica-Bold",
|
||
"size": 12.0,
|
||
"column": 0,
|
||
"confidence": 0.98,
|
||
"receipt": null
|
||
}
|
||
</code></pre>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Field</th><th>Type</th><th>Required</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>text</code></td><td>string</td><td>Yes</td><td>The extracted text content.</td></tr>
|
||
<tr><td><code>bbox</code></td><td>array</td><td>Yes</td><td>Bounding box in PDF user-space points. Format: <code>[x0, y0, x1, y1]</code> where (x0, y0) is the bottom-left corner and (x1, y1) is the top-right corner. Units are 1/72 inch.</td></tr>
|
||
<tr><td><code>font</code></td><td>string</td><td>Yes</td><td>Font name or identifier.</td></tr>
|
||
<tr><td><code>size</code></td><td>number</td><td>Yes</td><td>Font size in points.</td></tr>
|
||
<tr><td><code>column</code></td><td>integer/null</td><td>No</td><td>Column index (0-based) assigned by Phase 4.3 column detection. Null for spans outside any detected column.</td></tr>
|
||
<tr><td><code>confidence</code></td><td>number/null</td><td>No</td><td>Confidence score (0.0 to 1.0). Present when OCR is used or extraction has uncertainty.</td></tr>
|
||
<tr><td><code>receipt</code></td><td>object/null</td><td>No</td><td>Cryptographic receipt for verification. Present when <code>--receipts=lite</code> or <code>--receipts=svg</code> is enabled.</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h3 id="block"><a class="header" href="#block">Block</a></h3>
|
||
<p>A block is a higher-level semantic unit composed of one or more spans.</p>
|
||
<pre><code class="language-json">{
|
||
"kind": "paragraph",
|
||
"text": "The quick brown fox jumps over the lazy dog.",
|
||
"bbox": [72.0, 600.0, 540.0, 650.0],
|
||
"level": null,
|
||
"table_index": null
|
||
}
|
||
</code></pre>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Field</th><th>Type</th><th>Required</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>kind</code></td><td>string</td><td>Yes</td><td>The block kind/type. Common values: <code>"paragraph"</code>, <code>"heading"</code>, <code>"list"</code>, <code>"table"</code>, <code>"figure"</code>.</td></tr>
|
||
<tr><td><code>text</code></td><td>string</td><td>Yes</td><td>The concatenated text content of all spans in the block.</td></tr>
|
||
<tr><td><code>bbox</code></td><td>array</td><td>Yes</td><td>Bounding box in PDF user-space points. Same format as spans.</td></tr>
|
||
<tr><td><code>level</code></td><td>integer/null</td><td>No</td><td>Heading level (1-6) for <code>"heading"</code> kind blocks. Null for other block types.</td></tr>
|
||
<tr><td><code>table_index</code></td><td>integer/null</td><td>No</td><td>Table index for <code>"table"</code> kind blocks. Points to the corresponding entry in the page’s <code>tables</code> array.</td></tr>
|
||
<tr><td><code>receipt</code></td><td>object/null</td><td>No</td><td>Cryptographic receipt for verification. Present when receipts are enabled.</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h4 id="block-kind-enum"><a class="header" href="#block-kind-enum">Block Kind Enum</a></h4>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Value</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>paragraph</code></td><td>A paragraph block.</td></tr>
|
||
<tr><td><code>heading</code></td><td>A heading block (with <code>level</code> field 1-6).</td></tr>
|
||
<tr><td><code>list</code></td><td>A list item block.</td></tr>
|
||
<tr><td><code>table</code></td><td>A table block (references <code>tables</code> array via <code>table_index</code>).</td></tr>
|
||
<tr><td><code>figure</code></td><td>A figure or image block.</td></tr>
|
||
<tr><td><code>code</code></td><td>A code block or monospace text.</td></tr>
|
||
<tr><td><code>formula</code></td><td>A mathematical formula.</td></tr>
|
||
<tr><td><code>header</code></td><td>A page header block.</td></tr>
|
||
<tr><td><code>footer</code></td><td>A page footer block.</td></tr>
|
||
<tr><td><code>watermark</code></td><td>A watermark block.</td></tr>
|
||
<tr><td><code>caption</code></td><td>A caption for a figure or table.</td></tr>
|
||
<tr><td><code>quote</code></td><td>A blockquote.</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h3 id="table"><a class="header" href="#table">Table</a></h3>
|
||
<p>Tables provide detailed cell-level structure for table blocks.</p>
|
||
<pre><code class="language-json">{
|
||
"id": "table_0",
|
||
"page_index": 2,
|
||
"bbox": [72.0, 400.0, 540.0, 550.0],
|
||
"detection_method": "line_based",
|
||
"header_rows": 1,
|
||
"continued": false,
|
||
"continued_from_prev": false,
|
||
"rows": [...]
|
||
}
|
||
</code></pre>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Field</th><th>Type</th><th>Required</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>id</code></td><td>string</td><td>Yes</td><td>Unique identifier for this table (e.g., <code>"table_0"</code>).</td></tr>
|
||
<tr><td><code>page_index</code></td><td>integer</td><td>Yes</td><td>Zero-based page index where this table appears.</td></tr>
|
||
<tr><td><code>bbox</code></td><td>array</td><td>Yes</td><td>Bounding box in PDF user-space points.</td></tr>
|
||
<tr><td><code>detection_method</code></td><td>string</td><td>Yes</td><td>Detection method: <code>"line_based"</code> (ruling lines) or <code>"borderless"</code> (x0 alignment heuristics).</td></tr>
|
||
<tr><td><code>header_rows</code></td><td>integer</td><td>Yes</td><td>Number of contiguous header rows at the top of the table.</td></tr>
|
||
<tr><td><code>continued</code></td><td>boolean</td><td>Yes</td><td>Whether this table continues on the next page.</td></tr>
|
||
<tr><td><code>continued_from_prev</code></td><td>boolean</td><td>Yes</td><td>Whether this table is a continuation from the previous page.</td></tr>
|
||
<tr><td><code>rows</code></td><td>array</td><td>Yes</td><td>Rows in this table, ordered top-to-bottom.</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h4 id="row"><a class="header" href="#row">Row</a></h4>
|
||
<p>Each row contains cells ordered left-to-right:</p>
|
||
<pre><code class="language-json">{
|
||
"bbox": [72.0, 520.0, 540.0, 540.0],
|
||
"is_header": true,
|
||
"cells": [...]
|
||
}
|
||
</code></pre>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Field</th><th>Type</th><th>Required</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>bbox</code></td><td>array</td><td>Yes</td><td>Bounding box in PDF user-space points.</td></tr>
|
||
<tr><td><code>is_header</code></td><td>boolean</td><td>Yes</td><td>Whether this row is a header row.</td></tr>
|
||
<tr><td><code>cells</code></td><td>array</td><td>Yes</td><td>Cells in this row, ordered left-to-right.</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h4 id="cell"><a class="header" href="#cell">Cell</a></h4>
|
||
<pre><code class="language-json">{
|
||
"text": "Revenue",
|
||
"bbox": [72.0, 520.0, 180.0, 540.0],
|
||
"row": 0,
|
||
"col": 0,
|
||
"rowspan": 1,
|
||
"colspan": 1,
|
||
"is_header_row": true,
|
||
"spans": [0, 1]
|
||
}
|
||
</code></pre>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Field</th><th>Type</th><th>Required</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>text</code></td><td>string</td><td>Yes</td><td>The concatenated text content of all spans in the cell.</td></tr>
|
||
<tr><td><code>bbox</code></td><td>array</td><td>Yes</td><td>Bounding box in PDF user-space points.</td></tr>
|
||
<tr><td><code>row</code></td><td>integer</td><td>Yes</td><td>Zero-based row index within the table.</td></tr>
|
||
<tr><td><code>col</code></td><td>integer</td><td>Yes</td><td>Zero-based column index within the table.</td></tr>
|
||
<tr><td><code>rowspan</code></td><td>integer</td><td>Yes</td><td>Number of rows this cell spans (default 1).</td></tr>
|
||
<tr><td><code>colspan</code></td><td>integer</td><td>Yes</td><td>Number of columns this cell spans (default 1).</td></tr>
|
||
<tr><td><code>is_header_row</code></td><td>boolean</td><td>Yes</td><td>Whether this cell is in a header row.</td></tr>
|
||
<tr><td><code>spans</code></td><td>array</td><td>Yes</td><td>References to spans in the page’s <code>spans</code> array (indices).</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h2 id="form-fields-phase-74"><a class="header" href="#form-fields-phase-74">Form Fields (Phase 7.4)</a></h2>
|
||
<p>Form fields represent interactive form fields from the PDF’s AcroForm or XFA data.</p>
|
||
<blockquote>
|
||
<p><strong>Note:</strong> Phase 7 placeholders are documented here for forward compatibility. These fields are present in the schema but contain empty arrays until Phase 7 is implemented.</p>
|
||
</blockquote>
|
||
<pre><code class="language-json">{
|
||
"name": "employer_signature",
|
||
"type": "text",
|
||
"value": "John Doe",
|
||
"default": null,
|
||
"read_only": false,
|
||
"required": true,
|
||
"page_index": 2,
|
||
"rect": [72.0, 400.0, 288.0, 420.0],
|
||
"multiline": true,
|
||
"max_length": 100
|
||
}
|
||
</code></pre>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Field</th><th>Type</th><th>Required</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>name</code></td><td>string</td><td>Yes</td><td>The absolute (dot-joined) field name from the AcroForm.</td></tr>
|
||
<tr><td><code>type</code></td><td>string</td><td>Yes</td><td>Field type: <code>"text"</code>, <code>"button"</code>, <code>"choice"</code>, or <code>"signature"</code>.</td></tr>
|
||
<tr><td><code>value</code></td><td>varies</td><td>Yes</td><td>The current value (structure varies by <code>type</code>).</td></tr>
|
||
<tr><td><code>default</code></td><td>varies</td><td>No</td><td>The default value (<code>/DV</code> entry).</td></tr>
|
||
<tr><td><code>read_only</code></td><td>boolean</td><td>Yes</td><td>Whether this field is read-only (bit 1 of <code>/Ff</code> flags).</td></tr>
|
||
<tr><td><code>required</code></td><td>boolean</td><td>Yes</td><td>Whether this field is required (bit 2 of <code>/Ff</code> flags).</td></tr>
|
||
<tr><td><code>page_index</code></td><td>integer/null</td><td>No</td><td>Zero-based page index where this field’s widget appears.</td></tr>
|
||
<tr><td><code>rect</code></td><td>array/null</td><td>No</td><td>Bounding box in PDF user-space points.</td></tr>
|
||
<tr><td><code>multiline</code></td><td>boolean/null</td><td>No</td><td>Whether this text field supports multiple lines (text fields only).</td></tr>
|
||
<tr><td><code>max_length</code></td><td>integer/null</td><td>No</td><td>Maximum length for text fields (<code>/MaxLen</code> entry).</td></tr>
|
||
<tr><td><code>multi_select</code></td><td>boolean/null</td><td>No</td><td>Whether this choice field supports multiple selections.</td></tr>
|
||
<tr><td><code>options</code></td><td>array/null</td><td>No</td><td>Available options for choice fields (<code>[export_value, display_name]</code> pairs).</td></tr>
|
||
<tr><td><code>radio</code></td><td>boolean/null</td><td>No</td><td>Whether this button is a radio button (button fields only).</td></tr>
|
||
<tr><td><code>pushbutton</code></td><td>boolean/null</td><td>No</td><td>Whether this button is a pushbutton (button fields only).</td></tr>
|
||
<tr><td><code>selected</code></td><td>boolean/null</td><td>No</td><td>Selected state for button fields.</td></tr>
|
||
<tr><td><code>state_name</code></td><td>string/null</td><td>No</td><td>Appearance state name for button fields (e.g., <code>"Yes"</code>, <code>"Off"</code>).</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h2 id="signatures-phase-73"><a class="header" href="#signatures-phase-73">Signatures (Phase 7.3)</a></h2>
|
||
<p>Digital signatures extracted from signature fields.</p>
|
||
<pre><code class="language-json">{
|
||
"field_name": "employer_signature",
|
||
"signer_name": "Jane Corporation",
|
||
"signing_date": "2024-03-15T14:23:51Z",
|
||
"location": "New York, NY",
|
||
"reason": "Contract approval",
|
||
"sub_filter": "adbe.pkcs7.detached",
|
||
"byte_range": [0, 12345, 67890, 456],
|
||
"coverage_fraction": 0.95,
|
||
"validation_status": "not_checked"
|
||
}
|
||
</code></pre>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Field</th><th>Type</th><th>Required</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>field_name</code></td><td>string</td><td>Yes</td><td>The absolute (dot-joined) field name from the AcroForm.</td></tr>
|
||
<tr><td><code>signer_name</code></td><td>string</td><td>Yes</td><td>The signer’s name from the <code>/Name</code> entry. Empty string if absent.</td></tr>
|
||
<tr><td><code>validation_status</code></td><td>string</td><td>Yes</td><td>Validation status — always <code>"not_checked"</code> in v1. Future versions may add <code>"valid"</code>, <code>"invalid"</code>, <code>"indeterminate"</code>.</td></tr>
|
||
<tr><td><code>signing_date</code></td><td>string/null</td><td>No</td><td>The signing date as an ISO 8601 string (RFC 3339 format).</td></tr>
|
||
<tr><td><code>location</code></td><td>string/null</td><td>No</td><td>The location of signing from the <code>/Location</code> entry.</td></tr>
|
||
<tr><td><code>reason</code></td><td>string/null</td><td>No</td><td>The reason for signing from the <code>/Reason</code> entry.</td></tr>
|
||
<tr><td><code>sub_filter</code></td><td>string/null</td><td>No</td><td>The signature format/filter from the <code>/SubFilter</code> entry.</td></tr>
|
||
<tr><td><code>byte_range</code></td><td>array/null</td><td>No</td><td>The <code>/ByteRange</code> array defining which bytes of the file are signed.</td></tr>
|
||
<tr><td><code>coverage_fraction</code></td><td>number/null</td><td>No</td><td>Fraction of the file covered by the signature (0.0 to 1.0).</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h2 id="receipts-phase-68"><a class="header" href="#receipts-phase-68">Receipts (Phase 6.8)</a></h2>
|
||
<p>Visual citation receipts provide cryptographic proof that extracted text originated from a specific region in a specific PDF.</p>
|
||
<pre><code class="language-json">{
|
||
"pdf_fingerprint": "pdftract-v1:a7f3c8d9...",
|
||
"page_index": 14,
|
||
"bbox": [220.0, 412.0, 412.0, 432.0],
|
||
"content_hash": "sha256:9b21c4e5...",
|
||
"extraction_version": "1.0.0",
|
||
"svg_clip": null
|
||
}
|
||
</code></pre>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Field</th><th>Type</th><th>Required</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>pdf_fingerprint</code></td><td>string</td><td>Yes</td><td>Phase 1.7 fingerprint of the source PDF.</td></tr>
|
||
<tr><td><code>page_index</code></td><td>integer</td><td>Yes</td><td>Zero-based page index in the source PDF.</td></tr>
|
||
<tr><td><code>bbox</code></td><td>array</td><td>Yes</td><td>Bounding box in PDF user-space points.</td></tr>
|
||
<tr><td><code>content_hash</code></td><td>string</td><td>Yes</td><td>SHA-256 hash of the NFC-normalized text content. Format: <code>"sha256:" + hex(SHA-256)</code>.</td></tr>
|
||
<tr><td><code>extraction_version</code></td><td>string</td><td>Yes</td><td>The pdftract version that produced this receipt (semver string).</td></tr>
|
||
<tr><td><code>svg_clip</code></td><td>string/null</td><td>No</td><td>SVG clip rendering the glyphs (present only in SVG mode).</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h3 id="receipts-mode"><a class="header" href="#receipts-mode">Receipts Mode</a></h3>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Mode</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>off</code></td><td>No receipts generated (default).</td></tr>
|
||
<tr><td><code>lite</code></td><td>Minimal receipts (~120 bytes each) with fingerprint, page index, bbox, and content hash.</td></tr>
|
||
<tr><td><code>svg</code></td><td>Extended receipts that include an SVG clip rendering the glyphs.</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h2 id="phase-7-placeholders"><a class="header" href="#phase-7-placeholders">Phase 7 Placeholders</a></h2>
|
||
<p>The following fields are included in the schema for forward compatibility but are not yet populated in Phase 6. They will be populated in Phase 7:</p>
|
||
<ul>
|
||
<li><strong><code>pages[].annotations</code></strong> - Highlights, stamps, notes, links from <code>/Annots</code> (Phase 7)</li>
|
||
<li><strong><code>attachments</code></strong> - From <code>/EmbeddedFiles</code> name tree (Phase 7.5)</li>
|
||
<li><strong><code>links</code></strong> - Document-scoped URI and internal destination links (Phase 7.6)</li>
|
||
<li><strong><code>threads</code></strong> - Article thread chains (Phase 7.7)</li>
|
||
</ul>
|
||
<p>These fields are present in the schema as empty arrays or null values, so consumers can already account for them and will not break when Phase 7 features are added.</p>
|
||
<h2 id="diagnostics"><a class="header" href="#diagnostics">Diagnostics</a></h2>
|
||
<p>Diagnostic messages provide visibility into extraction quality and issues:</p>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Severity</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>WARN</code></td><td>Warning - extraction succeeded but with potential quality issues (e.g., low coverage suggesting scanned content).</td></tr>
|
||
<tr><td><code>ERROR</code></td><td>Error - extraction failed for a specific page or region.</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<p>Example diagnostics:</p>
|
||
<pre><code class="language-json">[
|
||
"WARN: page 3: low coverage (54%) - possible scanned content",
|
||
"ERROR: page 7: failed to extract - corrupt content stream"
|
||
]
|
||
</code></pre>
|
||
<h2 id="coordinate-system"><a class="header" href="#coordinate-system">Coordinate System</a></h2>
|
||
<p>All <code>bbox</code> values use PDF user-space coordinates:</p>
|
||
<ul>
|
||
<li><strong>Units:</strong> PDF points (1/72 inch, approximately 0.353 mm)</li>
|
||
<li><strong>Origin:</strong> Lower-left corner of the page (x=0, y=0)</li>
|
||
<li><strong>Format:</strong> <code>[x0, y0, x1, y1]</code> where (x0, y0) is bottom-left and (x1, y1) is top-right</li>
|
||
</ul>
|
||
<p>Example: For a US Letter page (8.5 × 11 inches):</p>
|
||
<ul>
|
||
<li>Width: 612 points (8.5 × 72)</li>
|
||
<li>Height: 792 points (11 × 72)</li>
|
||
<li>Full page bbox: <code>[0, 0, 612, 792]</code></li>
|
||
</ul>
|
||
<h2 id="schema-validation"><a class="header" href="#schema-validation">Schema Validation</a></h2>
|
||
<p>Per <a href="../plan/plan.html">INV-11</a>, all JSON output must validate against the schema. CI runs a schema validation step on every fixture:</p>
|
||
<pre><code class="language-bash"># Python validation example
|
||
pip install jsonschema
|
||
jsonschema -i output.json docs/schema/v1.0/pdftract.schema.json
|
||
</code></pre>
|
||
<h2 id="plan-references"><a class="header" href="#plan-references">Plan References</a></h2>
|
||
<ul>
|
||
<li><strong>Phase 6.1</strong> (lines 2018-2051): JSON output full schema implementation</li>
|
||
<li><strong>Phase 6.8</strong> (lines 2400+): Visual citation receipts</li>
|
||
<li><strong>Phase 7.3</strong> (lines 2750+): Digital signatures</li>
|
||
<li><strong>Phase 7.4</strong> (lines 2800+): Form fields</li>
|
||
<li><strong>INV-11</strong> (line 841): Schema validation invariant</li>
|
||
</ul>
|
||
<p>For the complete field-by-field rationale, see the <a href="../research/extraction-output-schema.html">extraction output schema research doc</a>.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="json-schema-reference-1"><a class="header" href="#json-schema-reference-1">JSON Schema Reference</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This section is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Complete JSON output format documentation.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="output-format"><a class="header" href="#output-format">Output Format</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Describes the JSON schema for pdftract output.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="block-types"><a class="header" href="#block-types">Block Types</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Describes the semantic block types (heading, paragraph, list, table, etc.).</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="metadata"><a class="header" href="#metadata">Metadata</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Describes the document metadata fields.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="error-handling"><a class="header" href="#error-handling">Error Handling</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Describes how errors are reported in the JSON output.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="profiles"><a class="header" href="#profiles">Profiles</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This section is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Document-type-specific extraction profiles.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="available-profiles"><a class="header" href="#available-profiles">Available Profiles</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Lists all available document profiles.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="invoice-profile"><a class="header" href="#invoice-profile">invoice Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for invoice documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="receipt-profile"><a class="header" href="#receipt-profile">receipt Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for receipt documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="bank_statement-profile"><a class="header" href="#bank_statement-profile">bank_statement Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for bank statement documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="contract-profile"><a class="header" href="#contract-profile">contract Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for contract documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="legal_filing-profile"><a class="header" href="#legal_filing-profile">legal_filing Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for legal filing documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="form-profile"><a class="header" href="#form-profile">form Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for form documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="scientific_paper-profile"><a class="header" href="#scientific_paper-profile">scientific_paper Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for scientific paper documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="book_chapter-profile"><a class="header" href="#book_chapter-profile">book_chapter Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for book chapter documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="slide_deck-profile"><a class="header" href="#slide_deck-profile">slide_deck Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for slide deck documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="custom-profiles"><a class="header" href="#custom-profiles">Custom Profiles</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>How to create and use custom extraction profiles.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="sdk-quickstarts"><a class="header" href="#sdk-quickstarts">SDK Quickstarts</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This section is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Getting started guides for using pdftract from various programming languages.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="python-sdk"><a class="header" href="#python-sdk">Python SDK</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Using pdftract from Python.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="rust-sdk"><a class="header" href="#rust-sdk">Rust SDK</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Using pdftract from Rust.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="javascripttypescript-sdk"><a class="header" href="#javascripttypescript-sdk">JavaScript/TypeScript SDK</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Using pdftract from JavaScript/TypeScript (Node.js).</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="go-sdk"><a class="header" href="#go-sdk">Go SDK</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Using pdftract from Go.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="advanced-topics"><a class="header" href="#advanced-topics">Advanced Topics</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This section is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Deep dives into pdftract’s internals and advanced configuration.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="ocr-configuration"><a class="header" href="#ocr-configuration">OCR Configuration</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Configuring Tesseract and OCR settings.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="font-encoding-recovery"><a class="header" href="#font-encoding-recovery">Font Encoding Recovery</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>How pdftract recovers text from fonts with broken or missing ToUnicode mappings.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="structure-tree-extraction"><a class="header" href="#structure-tree-extraction">Structure Tree Extraction</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extracting logical structure from tagged PDFs.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="hybrid-routing"><a class="header" href="#hybrid-routing">Hybrid Routing</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>How pdftract routes each page to the optimal extraction pipeline.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="provenance-and-confidence"><a class="header" href="#provenance-and-confidence">Provenance and Confidence</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Understanding bounding boxes, font metadata, and confidence scores.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="troubleshooting-1"><a class="header" href="#troubleshooting-1">Troubleshooting</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This section is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Debugging and performance tuning for pdftract.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="common-issues"><a class="header" href="#common-issues">Common Issues</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Solutions to common extraction problems.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="diagnostics-1"><a class="header" href="#diagnostics-1">Diagnostics</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Using pdftract’s diagnostic features for debugging.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="performance-tuning"><a class="header" href="#performance-tuning">Performance Tuning</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Optimizing extraction speed and memory usage.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="faq"><a class="header" href="#faq">FAQ</a></h1>
|
||
<p>Frequently asked questions about pdftract.</p>
|
||
<h2 id="table-of-contents"><a class="header" href="#table-of-contents">Table of Contents</a></h2>
|
||
<ul>
|
||
<li><a href="#general">General</a>
|
||
<ul>
|
||
<li><a href="#what-is-pdftract">What is pdftract?</a></li>
|
||
<li><a href="#whats-the-difference-between-extract-and-extract_text">What’s the difference between extract and extract_text?</a></li>
|
||
<li><a href="#does-pdftract-execute-javascript-embedded-in-pdfs">Does pdftract execute JavaScript embedded in PDFs?</a></li>
|
||
<li><a href="#how-do-i-cite-an-extracted-snippet">How do I cite an extracted snippet?</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="#installation-and-setup">Installation and Setup</a>
|
||
<ul>
|
||
<li><a href="#how-do-i-install-pdftract">How do I install pdftract?</a></li>
|
||
<li><a href="#how-do-i-run-pdftract-behind-a-corporate-proxy">How do I run pdftract behind a corporate proxy?</a></li>
|
||
<li><a href="#what-are-the-system-requirements">What are the system requirements?</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="#usage">Usage</a>
|
||
<ul>
|
||
<li><a href="#why-is-my-pdf-returning-broken_vector">Why is my PDF returning broken_vector?</a></li>
|
||
<li><a href="#why-is-ocr-slow">Why is OCR slow?</a></li>
|
||
<li><a href="#how-do-i-extract-text-from-a-specific-page-range">How do I extract text from a specific page range?</a></li>
|
||
<li><a href="#how-do-i-extract-images-from-a-pdf">How do I extract images from a PDF?</a></li>
|
||
<li><a href="#can-i-process-multiple-pdfs-at-once">Can I process multiple PDFs at once?</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="#configuration">Configuration</a>
|
||
<ul>
|
||
<li><a href="#how-do-i-add-a-custom-profile">How do I add a custom profile?</a></li>
|
||
<li><a href="#how-do-i-adjust-ocr-accuracy">How do I adjust OCR accuracy?</a></li>
|
||
<li><a href="#how-do-i-disable-ocr-for-faster-processing">How do I disable OCR for faster processing?</a></li>
|
||
<li><a href="#what-are-confidence-scores-and-how-do-i-use-them">What are confidence scores and how do I use them?</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="#output-and-formats">Output and Formats</a>
|
||
<ul>
|
||
<li><a href="#how-do-i-get-output-in-markdown-format">How do I get output in Markdown format?</a></li>
|
||
<li><a href="#how-do-i-preserve-table-structure">How do I preserve table structure?</a></li>
|
||
<li><a href="#can-i-extract-metadata-from-pdfs">Can I extract metadata from PDFs?</a></li>
|
||
<li><a href="#how-do-i-handle-password-protected-pdfs">How do I handle password-protected PDFs?</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="#troubleshooting-2">Troubleshooting</a>
|
||
<ul>
|
||
<li><a href="#why-is-extraction-failing-with-an-error">Why is extraction failing with an error?</a></li>
|
||
<li><a href="#why-is-my-output-empty-or-incomplete">Why is my output empty or incomplete?</a></li>
|
||
<li><a href="#how-do-i-debug-extraction-issues">How do I debug extraction issues?</a></li>
|
||
<li><a href="#why-does-extraction-use-so-much-memory">Why does extraction use so much memory?</a></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<hr>
|
||
<h2 id="general"><a class="header" href="#general">General</a></h2>
|
||
<h3 id="what-is-pdftract"><a class="header" href="#what-is-pdftract">What is pdftract?</a></h3>
|
||
<p>pdftract is a command-line tool and library for extracting text, structure, and content from PDF files. It combines vector text extraction with OCR fallback to handle both well-formed and problematic PDFs. pdftract is written in Rust and provides Python bindings for programmatic use.</p>
|
||
<p>See the <a href="#introduction">Introduction</a> for a complete overview.</p>
|
||
<h3 id="whats-the-difference-between-extract-and-extract_text"><a class="header" href="#whats-the-difference-between-extract-and-extract_text">What’s the difference between extract and extract_text?</a></h3>
|
||
<ul>
|
||
<li>
|
||
<p><strong><code>extract</code></strong>: The primary command that produces structured JSON output with blocks, spans, metadata, and provenance information. Use this when you need the full extraction with layout, reading order, and confidence scores.</p>
|
||
</li>
|
||
<li>
|
||
<p><strong><code>extract_text</code></strong>: A simplified command that outputs plain text only. Use this for quick text extraction when you don’t need the structured JSON output.</p>
|
||
</li>
|
||
</ul>
|
||
<p>Example:</p>
|
||
<pre><code class="language-bash"># Full structured extraction
|
||
pdftract extract document.pdf -o output.json
|
||
|
||
# Plain text only
|
||
pdftract extract_text document.pdf -o output.txt
|
||
</code></pre>
|
||
<h3 id="does-pdftract-execute-javascript-embedded-in-pdfs"><a class="header" href="#does-pdftract-execute-javascript-embedded-in-pdfs">Does pdftract execute JavaScript embedded in PDFs?</a></h3>
|
||
<p><strong>No.</strong> pdftract never executes JavaScript embedded in PDFs. JavaScript is detected during parsing for security analysis, but it is never executed. This design prevents malicious PDFs from exploiting JavaScript vulnerabilities.</p>
|
||
<p>If you need to analyze JavaScript in PDFs, pdftract can detect and report its presence, but execution must be done separately with appropriate sandboxing.</p>
|
||
<h3 id="how-do-i-cite-an-extracted-snippet"><a class="header" href="#how-do-i-cite-an-extracted-snippet">How do I cite an extracted snippet?</a></h3>
|
||
<p>The JSON output from <code>pdftract extract</code> includes provenance information for each text block:</p>
|
||
<pre><code class="language-json">{
|
||
"blocks": [{
|
||
"spans": [{
|
||
"text": "Example snippet",
|
||
"bbox": [100.0, 200.0, 250.0, 215.0],
|
||
"page": 3,
|
||
"confidence": 0.98
|
||
}]
|
||
}],
|
||
"metadata": {
|
||
"path": "/path/to/document.pdf",
|
||
"fingerprint": "sha256:abc123...",
|
||
"extracted_at": "2026-05-25T12:00:00Z"
|
||
}
|
||
}
|
||
</code></pre>
|
||
<p>For academic citations, include:</p>
|
||
<ul>
|
||
<li>Document path and fingerprint</li>
|
||
<li>Page number (from the <code>page</code> field)</li>
|
||
<li>Extraction timestamp</li>
|
||
<li>The pdftract version used</li>
|
||
</ul>
|
||
<hr>
|
||
<h2 id="installation-and-setup"><a class="header" href="#installation-and-setup">Installation and Setup</a></h2>
|
||
<h3 id="how-do-i-install-pdftract"><a class="header" href="#how-do-i-install-pdftract">How do I install pdftract?</a></h3>
|
||
<p>See the <a href="#installation">Installation</a> guide for complete instructions. Quick summary:</p>
|
||
<p><strong>With cargo (Rust toolchain):</strong></p>
|
||
<pre><code class="language-bash">cargo install pdftract
|
||
</code></pre>
|
||
<p><strong>With pip (Python bindings):</strong></p>
|
||
<pre><code class="language-bash">pip install pdftract
|
||
</code></pre>
|
||
<p><strong>Pre-built binaries:</strong> Download from the <a href="https://github.com/your-org/pdftract/releases">releases page</a>.</p>
|
||
<h3 id="how-do-i-run-pdftract-behind-a-corporate-proxy"><a class="header" href="#how-do-i-run-pdftract-behind-a-corporate-proxy">How do I run pdftract behind a corporate proxy?</a></h3>
|
||
<p>pdftract doesn’t have built-in proxy support, but you can use the HTTP serve mode with a reverse proxy:</p>
|
||
<ol>
|
||
<li>Start pdftract in serve mode:</li>
|
||
</ol>
|
||
<pre><code class="language-bash">pdftract serve --port 8080
|
||
</code></pre>
|
||
<ol start="2">
|
||
<li>
|
||
<p>Configure your reverse proxy (nginx, Apache, etc.) to handle authentication and SSL termination.</p>
|
||
</li>
|
||
<li>
|
||
<p>Access pdftract through your proxy endpoint.</p>
|
||
</li>
|
||
</ol>
|
||
<p>See <a href="../operations/serve-deployment.html">Advanced Topics: HTTP Serve</a> for deployment guidance.</p>
|
||
<h3 id="what-are-the-system-requirements"><a class="header" href="#what-are-the-system-requirements">What are the system requirements?</a></h3>
|
||
<ul>
|
||
<li><strong>OS</strong>: Linux, macOS, or Windows</li>
|
||
<li><strong>Rust</strong>: 1.70+ (if building from source)</li>
|
||
<li><strong>Python</strong>: 3.8+ (for Python bindings)</li>
|
||
<li><strong>OCR (optional)</strong>: Tesseract 4.0+ for OCR fallback</li>
|
||
<li><strong>Memory</strong>: 512 MB minimum for typical PDFs; more for large documents</li>
|
||
</ul>
|
||
<hr>
|
||
<h2 id="usage"><a class="header" href="#usage">Usage</a></h2>
|
||
<h3 id="why-is-my-pdf-returning-broken_vector"><a class="header" href="#why-is-my-pdf-returning-broken_vector">Why is my PDF returning broken_vector?</a></h3>
|
||
<p>The <code>broken_vector</code> classification means the PDF’s text layer is unreliable or missing. Common causes:</p>
|
||
<ul>
|
||
<li><strong>Invisible text overlay</strong>: Text with rendering mode 3 (invisible) overlaid on a raster image</li>
|
||
<li><strong>Missing ToUnicode CMap</strong>: Font lacks character-to-Unicode mapping</li>
|
||
<li><strong>Encoding corruption</strong>: Character encodings don’t match the actual glyphs</li>
|
||
</ul>
|
||
<p><strong>Solution</strong>: pdftract automatically routes <code>broken_vector</code> pages to the OCR pipeline (Phase 5.5). If you see <code>broken_vector</code> without OCR output, check that OCR is enabled:</p>
|
||
<pre><code class="language-bash"># Verify OCR is available
|
||
pdftract doctor tesseract-langs
|
||
|
||
# Enable OCR explicitly if needed
|
||
pdftract extract document.pdf --enable-ocr
|
||
</code></pre>
|
||
<p>See <a href="#common-issues">Troubleshooting: Broken Vector</a> for more details.</p>
|
||
<h3 id="why-is-ocr-slow"><a class="header" href="#why-is-ocr-slow">Why is OCR slow?</a></h3>
|
||
<p>OCR performance depends on several factors:</p>
|
||
<ul>
|
||
<li><strong>Image resolution</strong>: Higher DPI images take longer to process</li>
|
||
<li><strong>Tesseract version</strong>: Version 4.0+ is significantly faster than 3.x</li>
|
||
<li><strong>Language data</strong>: Additional language packs increase processing time</li>
|
||
<li><strong>Hardware</strong>: CPU-bound; more cores help with batch processing</li>
|
||
</ul>
|
||
<p><strong>To speed up OCR:</strong></p>
|
||
<pre><code class="language-bash"># Reduce DPI (trade-off: accuracy)
|
||
pdftract extract document.pdf --ocr-dpi 200
|
||
|
||
# Use fewer languages
|
||
pdftract extract document.pdf --ocr-lang eng
|
||
|
||
# Disable OCR for vector-only PDFs
|
||
pdftract extract document.pdf --disable-ocr
|
||
</code></pre>
|
||
<h3 id="how-do-i-extract-text-from-a-specific-page-range"><a class="header" href="#how-do-i-extract-text-from-a-specific-page-range">How do I extract text from a specific page range?</a></h3>
|
||
<p>Use the <code>--pages</code> flag:</p>
|
||
<pre><code class="language-bash"># Single page
|
||
pdftract extract document.pdf --pages 5
|
||
|
||
# Range
|
||
pdftract extract document.pdf --pages 1-10
|
||
|
||
# Multiple ranges
|
||
pdftract extract document.pdf --pages 1-5,10,15-20
|
||
|
||
# All pages from page 5 onward
|
||
pdftract extract document.pdf --pages 5-
|
||
</code></pre>
|
||
<h3 id="how-do-i-extract-images-from-a-pdf"><a class="header" href="#how-do-i-extract-images-from-a-pdf">How do I extract images from a PDF?</a></h3>
|
||
<p>pdftract automatically detects and records image XObjects during content stream processing. The output JSON includes image metadata:</p>
|
||
<pre><code class="language-json">{
|
||
"images": [{
|
||
"bbox": [100.0, 200.0, 400.0, 500.0],
|
||
"xobject_ref": "5 0 R",
|
||
"name": "Im1"
|
||
}]
|
||
}
|
||
</code></pre>
|
||
<p>For actual image extraction, use the <code>serve</code> mode with the <code>/images</code> endpoint or write a custom script using the Python SDK.</p>
|
||
<h3 id="can-i-process-multiple-pdfs-at-once"><a class="header" href="#can-i-process-multiple-pdfs-at-once">Can I process multiple PDFs at once?</a></h3>
|
||
<p>Yes, use shell wildcards or write a batch script:</p>
|
||
<pre><code class="language-bash"># Process all PDFs in a directory
|
||
for file in *.pdf; do
|
||
pdftract extract "$file" -o "output/$(basename "$file" .pdf).json"
|
||
done
|
||
|
||
# With parallel processing (GNU parallel)
|
||
parallel -j 4 pdftract extract {} -o output/{/.}.json ::: *.pdf
|
||
</code></pre>
|
||
<hr>
|
||
<h2 id="configuration"><a class="header" href="#configuration">Configuration</a></h2>
|
||
<h3 id="how-do-i-add-a-custom-profile"><a class="header" href="#how-do-i-add-a-custom-profile">How do I add a custom profile?</a></h3>
|
||
<p>Create a YAML file defining your profile:</p>
|
||
<pre><code class="language-yaml"># custom-profile.yaml
|
||
name: my_custom
|
||
description: "Custom extraction profile"
|
||
|
||
extraction:
|
||
  preserve_tables: true
|
||
  preserve_columns: true
|
||
  ocr_fallback: true
|
||
|
||
output:
|
||
  format: json
|
||
  include_provenance: true
|
||
  confidence_threshold: 0.7
|
||
</code></pre>
|
||
<p>Then use it:</p>
|
||
<pre><code class="language-bash">pdftract extract document.pdf --profile custom-profile.yaml
|
||
</code></pre>
|
||
<p>See <a href="#custom-profiles">Custom Profiles</a> for complete documentation.</p>
|
||
<h3 id="how-do-i-adjust-ocr-accuracy"><a class="header" href="#how-do-i-adjust-ocr-accuracy">How do I adjust OCR accuracy?</a></h3>
|
||
<p>Adjust Tesseract parameters via environment variables or the OCR configuration:</p>
|
||
<pre><code class="language-bash"># Set OCR engine mode
|
||
export TESSERACT_OEM=1 # LSTM only
|
||
export TESSERACT_PSM=6 # Assume single column block of text
|
||
|
||
# Adjust page segmentation mode
|
||
pdftract extract document.pdf --tesseract-psm 6
|
||
</code></pre>
|
||
<p>Higher accuracy settings may slow down processing. See <a href="#ocr-configuration">OCR Configuration</a> for details.</p>
|
||
<h3 id="how-do-i-disable-ocr-for-faster-processing"><a class="header" href="#how-do-i-disable-ocr-for-faster-processing">How do I disable OCR for faster processing?</a></h3>
|
||
<p>If you know your PDFs have reliable text layers:</p>
|
||
<pre><code class="language-bash">pdftract extract document.pdf --disable-ocr
|
||
</code></pre>
|
||
<p>Or set a confidence threshold to skip low-confidence text:</p>
|
||
<pre><code class="language-bash">pdftract extract document.pdf --min-confidence 0.9
|
||
</code></pre>
|
||
<h3 id="what-are-confidence-scores-and-how-do-i-use-them"><a class="header" href="#what-are-confidence-scores-and-how-do-i-use-them">What are confidence scores and how do I use them?</a></h3>
|
||
<p>Each text span has a <code>confidence</code> score (0.0 to 1.0):</p>
|
||
<ul>
|
||
<li><strong>1.0</strong>: High confidence (ToUnicode CMap lookup succeeded)</li>
|
||
<li><strong>0.3</strong>: Medium confidence (encoding + AGL fallback)</li>
|
||
<li><strong>0.0</strong>: No confidence (PositionHint mode or failed resolution)</li>
|
||
</ul>
|
||
<p>Filter by confidence:</p>
|
||
<pre><code class="language-bash">pdftract extract document.pdf --min-confidence 0.5
|
||
</code></pre>
|
||
<p>Or filter in post-processing using jq:</p>
|
||
<pre><code class="language-bash">pdftract extract document.pdf | jq '.blocks[].spans[] | select(.confidence > 0.5)'
|
||
</code></pre>
|
||
<hr>
|
||
<h2 id="output-and-formats"><a class="header" href="#output-and-formats">Output and Formats</a></h2>
|
||
<h3 id="how-do-i-get-output-in-markdown-format"><a class="header" href="#how-do-i-get-output-in-markdown-format">How do I get output in Markdown format?</a></h3>
|
||
<p>Use the <code>--format</code> flag:</p>
|
||
<pre><code class="language-bash">pdftract extract document.pdf --format markdown -o output.md
|
||
</code></pre>
|
||
<p>The Markdown output preserves headings, lists, tables, and code blocks where detected.</p>
|
||
<h3 id="how-do-i-preserve-table-structure"><a class="header" href="#how-do-i-preserve-table-structure">How do I preserve table structure?</a></h3>
|
||
<p>pdftract includes table detection (Phase 4.2). Ensure table preservation is enabled:</p>
|
||
<pre><code class="language-bash">pdftract extract document.pdf --preserve-tables
|
||
</code></pre>
|
||
<p>Tables are output with structured cell information:</p>
|
||
<pre><code class="language-json">{
|
||
"type": "table",
|
||
"rows": 3,
|
||
"columns": 4,
|
||
"cells": [...]
|
||
}
|
||
</code></pre>
|
||
<h3 id="can-i-extract-metadata-from-pdfs"><a class="header" href="#can-i-extract-metadata-from-pdfs">Can I extract metadata from PDFs?</a></h3>
|
||
<p>Yes, metadata is automatically extracted and included in the output:</p>
|
||
<pre><code class="language-json">{
|
||
"metadata": {
|
||
"title": "Document Title",
|
||
"author": "Author Name",
|
||
"subject": "Subject",
|
||
"keywords": ["keyword1", "keyword2"],
|
||
"creator": "Application",
|
||
"producer": "PDF Producer",
|
||
"creation_date": "2026-01-01T00:00:00Z",
|
||
"modified_date": "2026-05-25T12:00:00Z"
|
||
}
|
||
}
|
||
</code></pre>
|
||
<h3 id="how-do-i-handle-password-protected-pdfs"><a class="header" href="#how-do-i-handle-password-protected-pdfs">How do I handle password-protected PDFs?</a></h3>
|
||
<p>Provide the password via the <code>--password</code> flag:</p>
|
||
<pre><code class="language-bash">pdftract extract document.pdf --password secret123
|
||
</code></pre>
|
||
<p>For security, avoid passing passwords on the command line in production. Use environment variables or a config file:</p>
|
||
<pre><code class="language-bash">export PDFTRACT_PASSWORD=secret123
|
||
pdftract extract document.pdf
|
||
</code></pre>
|
||
<hr>
|
||
<h2 id="troubleshooting-2"><a class="header" href="#troubleshooting-2">Troubleshooting</a></h2>
|
||
<h3 id="why-is-extraction-failing-with-an-error"><a class="header" href="#why-is-extraction-failing-with-an-error">Why is extraction failing with an error?</a></h3>
|
||
<p>Check the error message and consult the <a href="#troubleshooting-1">Troubleshooting Guide</a>. Common issues:</p>
|
||
<ul>
|
||
<li><strong>Encrypted PDFs</strong>: Use <code>--password</code> to decrypt</li>
|
||
<li><strong>Corrupted PDFs</strong>: pdftract attempts recovery; check diagnostics</li>
|
||
<li><strong>Missing dependencies</strong>: Verify Tesseract and language packs are installed</li>
|
||
</ul>
|
||
<p>Run diagnostics:</p>
|
||
<pre><code class="language-bash">pdftract doctor
|
||
</code></pre>
|
||
<h3 id="why-is-my-output-empty-or-incomplete"><a class="header" href="#why-is-my-output-empty-or-incomplete">Why is my output empty or incomplete?</a></h3>
|
||
<p>Possible causes:</p>
|
||
<ol>
|
||
<li><strong>No text layer</strong>: PDF may be image-only. Enable OCR.</li>
|
||
<li><strong>Encoding issues</strong>: Check diagnostics for <code>FONT_GLYPH_UNMAPPED</code> warnings</li>
|
||
<li><strong>Page range issue</strong>: Verify your <code>--pages</code> argument</li>
|
||
<li><strong>Confidence filter</strong>: Lower <code>--min-confidence</code> if set too high</li>
|
||
</ol>
|
||
<p>Check diagnostics output:</p>
|
||
<pre><code class="language-bash">pdftract extract document.pdf --verbose
|
||
</code></pre>
|
||
<h3 id="how-do-i-debug-extraction-issues"><a class="header" href="#how-do-i-debug-extraction-issues">How do I debug extraction issues?</a></h3>
|
||
<p>Enable verbose output and diagnostics:</p>
|
||
<pre><code class="language-bash"># Full diagnostic output
|
||
pdftract extract document.pdf --verbose --diagnostics
|
||
|
||
# Save diagnostics for analysis
|
||
pdftract extract document.pdf --diagnostics -o diagnostics.json
|
||
</code></pre>
|
||
<p>Common diagnostic codes:</p>
|
||
<ul>
|
||
<li><code>FONT_GLYPH_UNMAPPED</code>: Glyph couldn’t be mapped to Unicode</li>
|
||
<li><code>STREAM_DECODE_ERROR</code>: Stream decompression failed</li>
|
||
<li><code>STRUCT_INVALID_TYPE</code>: Unexpected object type</li>
|
||
</ul>
|
||
<p>See <a href="#diagnostics-1">Diagnostics Reference</a> for a complete list.</p>
|
||
<h3 id="why-does-extraction-use-so-much-memory"><a class="header" href="#why-does-extraction-use-so-much-memory">Why does extraction use so much memory?</a></h3>
|
||
<p>Memory usage depends on:</p>
|
||
<ul>
|
||
<li><strong>PDF size</strong>: Larger PDFs with many images use more memory</li>
|
||
<li><strong>OCR</strong>: Tesseract loads image data into memory</li>
|
||
<li><strong>Output buffering</strong>: Large JSON outputs are buffered in memory</li>
|
||
</ul>
|
||
<p><strong>To reduce memory usage:</strong></p>
|
||
<pre><code class="language-bash"># Process page-by-page
|
||
for page in {1..100}; do
|
||
pdftract extract document.pdf --pages $page -o "page-$page.json"
|
||
done
|
||
|
||
# Disable OCR if not needed
|
||
pdftract extract document.pdf --disable-ocr
|
||
|
||
# Stream output (if supported)
|
||
pdftract extract document.pdf --stream-output
|
||
</code></pre>
|
||
<hr>
|
||
<h2 id="still-have-questions"><a class="header" href="#still-have-questions">Still have questions?</a></h2>
|
||
<ul>
|
||
<li>Check the <a href="#troubleshooting-1">Troubleshooting Guide</a></li>
|
||
<li>Review the <a href="cli/README.html">CLI Reference</a></li>
|
||
<li>Open an issue on <a href="https://github.com/your-org/pdftract/issues">GitHub</a></li>
|
||
</ul>
|
||
|
||
</main>
|
||
|
||
<nav class="nav-wrapper" aria-label="Page navigation">
|
||
<!-- Mobile navigation buttons -->
|
||
|
||
|
||
<div style="clear: both"></div>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
|
||
<nav class="nav-wide-wrapper" aria-label="Page navigation">
|
||
|
||
</nav>
|
||
|
||
</div>
|
||
|
||
<template id=fa-eye><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 576 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M288 32c-80.8 0-145.5 36.8-192.6 80.6C48.6 156 17.3 208 2.5 243.7c-3.3 7.9-3.3 16.7 0 24.6C17.3 304 48.6 356 95.4 399.4C142.5 443.2 207.2 480 288 480s145.5-36.8 192.6-80.6c46.8-43.5 78.1-95.4 93-131.1c3.3-7.9 3.3-16.7 0-24.6c-14.9-35.7-46.2-87.7-93-131.1C433.5 68.8 368.8 32 288 32zM432 256c0 79.5-64.5 144-144 144s-144-64.5-144-144s64.5-144 144-144s144 64.5 144 144zM288 192c0 35.3-28.7 64-64 64c-11.5 0-22.3-3-31.6-8.4c-.2 2.8-.4 5.5-.4 8.4c0 53 43 96 96 96s96-43 96-96s-43-96-96-96c-2.8 0-5.6 .1-8.4 .4c5.3 9.3 8.4 20.1 8.4 31.6z"/></svg></span></template>
|
||
<template id=fa-eye-slash><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 640 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M38.8 5.1C28.4-3.1 13.3-1.2 5.1 9.2S-1.2 34.7 9.2 42.9l592 464c10.4 8.2 25.5 6.3 33.7-4.1s6.3-25.5-4.1-33.7L525.6 386.7c39.6-40.6 66.4-86.1 79.9-118.4c3.3-7.9 3.3-16.7 0-24.6c-14.9-35.7-46.2-87.7-93-131.1C465.5 68.8 400.8 32 320 32c-68.2 0-125 26.3-169.3 60.8L38.8 5.1zM223.1 149.5C248.6 126.2 282.7 112 320 112c79.5 0 144 64.5 144 144c0 24.9-6.3 48.3-17.4 68.7L408 294.5c5.2-11.8 8-24.8 8-38.5c0-53-43-96-96-96c-2.8 0-5.6 .1-8.4 .4c5.3 9.3 8.4 20.1 8.4 31.6c0 10.2-2.4 19.8-6.6 28.3l-90.3-70.8zm223.1 298L373 389.9c-16.4 6.5-34.3 10.1-53 10.1c-79.5 0-144-64.5-144-144c0-6.9 .5-13.6 1.4-20.2L83.1 161.5C60.3 191.2 44 220.8 34.5 243.7c-3.3 7.9-3.3 16.7 0 24.6c14.9 35.7 46.2 87.7 93 131.1C174.5 443.2 239.2 480 320 480c47.8 0 89.9-12.9 126.2-32.5z"/></svg></span></template>
|
||
<template id=fa-copy><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M502.6 70.63l-61.25-61.25C435.4 3.371 427.2 0 418.7 0H255.1c-35.35 0-64 28.66-64 64l.0195 256C192 355.4 220.7 384 256 384h192c35.2 0 64-28.8 64-64V93.25C512 84.77 508.6 76.63 502.6 70.63zM464 320c0 8.836-7.164 16-16 16H255.1c-8.838 0-16-7.164-16-16L239.1 64.13c0-8.836 7.164-16 16-16h128L384 96c0 17.67 14.33 32 32 32h47.1V320zM272 448c0 8.836-7.164 16-16 16H63.1c-8.838 0-16-7.164-16-16L47.98 192.1c0-8.836 7.164-16 16-16H160V128H63.99c-35.35 0-64 28.65-64 64l.0098 256C.002 483.3 28.66 512 64 512h192c35.2 0 64-28.8 64-64v-32h-47.1L272 448z"/></svg></span></template>
|
||
<template id=fa-play><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 384 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M73 39c-14.8-9.1-33.4-9.4-48.5-.9S0 62.6 0 80V432c0 17.4 9.4 33.4 24.5 41.9s33.7 8.1 48.5-.9L361 297c14.3-8.7 23-24.2 23-41s-8.7-32.2-23-41L73 39z"/></svg></span></template>
|
||
<template id=fa-clock-rotate-left><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M75 75L41 41C25.9 25.9 0 36.6 0 57.9V168c0 13.3 10.7 24 24 24H134.1c21.4 0 32.1-25.9 17-41l-30.8-30.8C155 85.5 203 64 256 64c106 0 192 86 192 192s-86 192-192 192c-40.8 0-78.6-12.7-109.7-34.4c-14.5-10.1-34.4-6.6-44.6 7.9s-6.6 34.4 7.9 44.6C151.2 495 201.7 512 256 512c141.4 0 256-114.6 256-256S397.4 0 256 0C185.3 0 121.3 28.7 75 75zm181 53c-13.3 0-24 10.7-24 24V256c0 6.4 2.5 12.5 7 17l72 72c9.4 9.4 24.6 9.4 33.9 0s9.4-24.6 0-33.9l-65-65V152c0-13.3-10.7-24-24-24z"/></svg></span></template>
|
||
|
||
|
||
|
||
<script>
window.playground_copyable = true;
</script>
|
||
|
||
|
||
<script src="elasticlunr-ef4e11c1.min.js"></script>
|
||
<script src="mark-09e88c2c.min.js"></script>
|
||
<script src="searcher-c2a407aa.js"></script>
|
||
|
||
<script src="clipboard-1626706a.min.js"></script>
|
||
<script src="highlight-abc7f01d.js"></script>
|
||
<script src="book-a0b12cfe.js"></script>
|
||
|
||
<!-- Custom JS scripts -->
|
||
|
||
<script>
window.addEventListener('load', function() {
  window.setTimeout(window.print, 100);
});
</script>
|
||
|
||
|
||
</div>
|
||
</body>
|
||
</html>
|