- book.toml with title, authors, build directory, edit-url-template - src/SUMMARY.md with complete TOC for all planned sections - src/introduction.md: what pdftract does and doesn't do (Non-Goals) - src/installation.md: cargo, pip, Homebrew, Docker; KU-12 caveat verbatim - src/quickstart.md: five-minute walkthrough with executable commands - 39 draft placeholder files for CLI reference, schema, profiles, SDKs, advanced topics, troubleshooting, FAQ mdbook build completes cleanly with zero warnings (linkcheck optional). See notes/pdftract-1g87.md for verification details. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
754 lines
50 KiB
HTML
754 lines
50 KiB
HTML
<!DOCTYPE HTML>
|
||
<html lang="en" class="light sidebar-visible" dir="ltr">
|
||
<head>
|
||
<!-- Book generated using mdBook -->
|
||
<meta charset="UTF-8">
|
||
<title>pdftract User Documentation</title>
|
||
<meta name="robots" content="noindex">
|
||
|
||
|
||
<!-- Custom HTML head -->
|
||
|
||
<meta name="description" content="">
|
||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||
<meta name="theme-color" content="#ffffff">
|
||
|
||
<link rel="icon" href="favicon-de23e50b.svg">
|
||
<link rel="shortcut icon" href="favicon-8114d1fc.png">
|
||
<link rel="stylesheet" href="css/variables-8adf115d.css">
|
||
<link rel="stylesheet" href="css/general-2459343d.css">
|
||
<link rel="stylesheet" href="css/chrome-ae938929.css">
|
||
<link rel="stylesheet" href="css/print-9e4910d8.css" media="print">
|
||
|
||
<!-- Fonts -->
|
||
<link rel="stylesheet" href="fonts/fonts-9644e21d.css">
|
||
|
||
<!-- Highlight.js Stylesheets -->
|
||
<link rel="stylesheet" id="mdbook-highlight-css" href="highlight-493f70e1.css">
|
||
<link rel="stylesheet" id="mdbook-tomorrow-night-css" href="tomorrow-night-4c0ae647.css">
|
||
<link rel="stylesheet" id="mdbook-ayu-highlight-css" href="ayu-highlight-3fdfc3ac.css">
|
||
|
||
<!-- Custom theme stylesheets -->
|
||
|
||
|
||
<!-- Provide site root and default themes to javascript -->
|
||
<script>
    // Expose the search index location on window, then declare the site root
    // and the default light/dark theme names as page-level constants that the
    // later inline scripts on this page read.
    window.path_to_searchindex_js = "searchindex-4b797d79.js";
    const path_to_root = "",
        default_light_theme = "light",
        default_dark_theme = "navy";
</script>
|
||
<!-- Start loading toc.js asap -->
|
||
<script src="toc-9eb73786.js"></script>
|
||
</head>
|
||
<body>
|
||
<div id="mdbook-help-container">
|
||
<div id="mdbook-help-popup">
|
||
<h2 class="mdbook-help-title">Keyboard shortcuts</h2>
|
||
<div>
|
||
<p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
|
||
<p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
|
||
<p>Press <kbd>?</kbd> to show this help</p>
|
||
<p>Press <kbd>Esc</kbd> to hide this help</p>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
<div id="mdbook-body-container">
|
||
<!-- Work around some values being stored in localStorage wrapped in quotes -->
|
||
<script>
    // Strip one pair of wrapping double quotes from each stored preference.
    //
    // Each key is processed independently: a key that has never been set
    // (getItem returns null) must not stop the other key from being repaired.
    // Values shorter than two characters are left alone so that a lone `"`
    // is not rewritten to an empty string.
    try {
        for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
            const stored = localStorage.getItem(key);
            if (typeof stored === 'string'
                    && stored.length >= 2
                    && stored.startsWith('"')
                    && stored.endsWith('"')) {
                localStorage.setItem(key, stored.slice(1, stored.length - 1));
            }
        }
    } catch (e) {
        // Best effort only: accessing localStorage can throw, and in that
        // case there is nothing to repair.
    }
</script>
|
||
|
||
<!-- Set the theme before any content is loaded, prevents flash -->
|
||
<script>
    // Fallback theme follows the OS colour-scheme preference.
    const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches
        ? default_dark_theme
        : default_light_theme;

    // A stored theme wins; a failing localStorage read leaves `theme` unset.
    let theme;
    try {
        theme = localStorage.getItem('mdbook-theme');
    } catch (e) { }
    if (theme == null) {
        theme = default_theme;
    }

    // Swap the static 'light' class on <html> for the chosen theme and mark
    // the document as script-enabled.
    const html = document.documentElement;
    html.classList.remove('light');
    html.classList.add(theme);
    html.classList.add("js");
</script>
|
||
|
||
<input type="checkbox" id="mdbook-sidebar-toggle-anchor" class="hidden">
|
||
|
||
<!-- Hide / unhide sidebar before it is displayed -->
|
||
<script>
    let sidebar = null;
    const sidebar_toggle = document.getElementById("mdbook-sidebar-toggle-anchor");

    if (document.body.clientWidth < 1080) {
        // Narrow viewport: always start with the sidebar hidden.
        sidebar = 'hidden';
        sidebar_toggle.checked = false;
    } else {
        // Wide viewport: use the stored preference, defaulting to visible.
        try {
            sidebar = localStorage.getItem('mdbook-sidebar');
        } catch (e) { }
        if (!sidebar) {
            sidebar = 'visible';
        }
    }

    // Reflect the decision on the toggle checkbox or on the <html> class.
    if (sidebar !== 'visible') {
        html.classList.remove('sidebar-visible');
    } else {
        sidebar_toggle.checked = true;
    }
</script>
|
||
|
||
<nav id="mdbook-sidebar" class="sidebar" aria-label="Table of contents">
|
||
<!-- populated by js -->
|
||
<mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
|
||
<noscript>
|
||
<iframe class="sidebar-iframe-outer" src="toc.html" title="Table of contents"></iframe>
|
||
</noscript>
|
||
<div id="mdbook-sidebar-resize-handle" class="sidebar-resize-handle">
|
||
<div class="sidebar-resize-indicator"></div>
|
||
</div>
|
||
</nav>
|
||
|
||
<div id="mdbook-page-wrapper" class="page-wrapper">
|
||
|
||
<div class="page">
|
||
<div id="mdbook-menu-bar-hover-placeholder"></div>
|
||
<div id="mdbook-menu-bar" class="menu-bar sticky">
|
||
<div class="left-buttons">
|
||
<label id="mdbook-sidebar-toggle" class="icon-button" for="mdbook-sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="mdbook-sidebar">
|
||
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M0 96C0 78.3 14.3 64 32 64H416c17.7 0 32 14.3 32 32s-14.3 32-32 32H32C14.3 128 0 113.7 0 96zM0 256c0-17.7 14.3-32 32-32H416c17.7 0 32 14.3 32 32s-14.3 32-32 32H32c-17.7 0-32-14.3-32-32zM448 416c0 17.7-14.3 32-32 32H32c-17.7 0-32-14.3-32-32s14.3-32 32-32H416c17.7 0 32 14.3 32 32z"/></svg></span>
|
||
</label>
|
||
<button id="mdbook-theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="mdbook-theme-list">
|
||
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 576 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M371.3 367.1c27.3-3.9 51.9-19.4 67.2-42.9L600.2 74.1c12.6-19.5 9.4-45.3-7.6-61.2S549.7-4.4 531.1 9.6L294.4 187.2c-24 18-38.2 46.1-38.4 76.1L371.3 367.1zm-19.6 25.4l-116-104.4C175.9 290.3 128 339.6 128 400c0 3.9 .2 7.8 .6 11.6c1.8 17.5-10.2 36.4-27.8 36.4H96c-17.7 0-32 14.3-32 32s14.3 32 32 32H240c61.9 0 112-50.1 112-112c0-2.5-.1-5-.2-7.5z"/></svg></span>
|
||
</button>
|
||
<ul id="mdbook-theme-list" class="theme-popup" aria-label="Themes" role="menu">
|
||
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-default_theme">Auto</button></li>
|
||
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-light">Light</button></li>
|
||
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-rust">Rust</button></li>
|
||
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-coal">Coal</button></li>
|
||
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-navy">Navy</button></li>
|
||
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-ayu">Ayu</button></li>
|
||
</ul>
|
||
<button id="mdbook-search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="mdbook-searchbar">
|
||
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M416 208c0 45.9-14.9 88.3-40 122.7L502.6 457.4c12.5 12.5 12.5 32.8 0 45.3s-32.8 12.5-45.3 0L330.7 376c-34.4 25.2-76.8 40-122.7 40C93.1 416 0 322.9 0 208S93.1 0 208 0S416 93.1 416 208zM208 352c79.5 0 144-64.5 144-144s-64.5-144-144-144S64 128.5 64 208s64.5 144 144 144z"/></svg></span>
|
||
</button>
|
||
</div>
|
||
|
||
<h1 class="menu-title">pdftract User Documentation</h1>
|
||
|
||
<div class="right-buttons">
|
||
<a href="print.html" title="Print this book" aria-label="Print this book">
|
||
<span class=fa-svg id="print-button"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M128 0C92.7 0 64 28.7 64 64v96h64V64H354.7L384 93.3V160h64V93.3c0-17-6.7-33.3-18.7-45.3L400 18.7C388 6.7 371.7 0 354.7 0H128zM384 352v32 64H128V384 368 352H384zm64 32h32c17.7 0 32-14.3 32-32V256c0-35.3-28.7-64-64-64H64c-35.3 0-64 28.7-64 64v96c0 17.7 14.3 32 32 32H64v64c0 35.3 28.7 64 64 64H384c35.3 0 64-28.7 64-64V384zm-16-88c-13.3 0-24-10.7-24-24s10.7-24 24-24s24 10.7 24 24s-10.7 24-24 24z"/></svg></span>
|
||
</a>
|
||
<a href="https://github.com/jedarden/pdftract" title="Git repository" aria-label="Git repository">
|
||
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 496 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6zm-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3zm44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9zM244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8zM97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1zm-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7zm32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1zm-11.4-14.7c-1.6 1-1.6 3.6 0 5.9 1.6 2.3 4.3 3.3 5.6 2.3 1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2z"/></svg></span>
|
||
</a>
|
||
|
||
</div>
|
||
</div>
|
||
|
||
<div id="mdbook-search-wrapper" class="hidden">
|
||
<form id="mdbook-searchbar-outer" class="searchbar-outer">
|
||
<div class="search-wrapper">
|
||
<input type="search" id="mdbook-searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="mdbook-searchresults-outer" aria-describedby="mdbook-searchresults-header">
|
||
<div class="spinner-wrapper">
|
||
<span class=fa-svg id="fa-spin"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M304 48c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zm0 416c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zM48 304c26.5 0 48-21.5 48-48s-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48zm464-48c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zM142.9 437c18.7-18.7 18.7-49.1 0-67.9s-49.1-18.7-67.9 0s-18.7 49.1 0 67.9s49.1 18.7 67.9 0zm0-294.2c18.7-18.7 18.7-49.1 0-67.9S93.7 56.2 75 75s-18.7 49.1 0 67.9s49.1 18.7 67.9 0zM369.1 437c18.7 18.7 49.1 18.7 67.9 0s18.7-49.1 0-67.9s-49.1-18.7-67.9 0s-18.7 49.1 0 67.9z"/></svg></span>
|
||
</div>
|
||
</div>
|
||
</form>
|
||
<div id="mdbook-searchresults-outer" class="searchresults-outer hidden">
|
||
<div id="mdbook-searchresults-header" class="searchresults-header"></div>
|
||
<ul id="mdbook-searchresults">
|
||
</ul>
|
||
</div>
|
||
</div>
|
||
|
||
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
|
||
<script>
    {
        // Block scope keeps these helpers out of the page's global bindings.
        const sidebarShown = sidebar === 'visible';

        // The toggle advertises the expanded state; the sidebar itself is
        // hidden from assistive technology whenever it is not visible.
        document.getElementById('mdbook-sidebar-toggle').setAttribute('aria-expanded', sidebarShown);
        document.getElementById('mdbook-sidebar').setAttribute('aria-hidden', !sidebarShown);

        // Links inside the sidebar leave the tab order while it is not visible.
        const linkTabIndex = sidebarShown ? 0 : -1;
        for (const link of Array.from(document.querySelectorAll('#mdbook-sidebar a'))) {
            link.setAttribute('tabIndex', linkTabIndex);
        }
    }
</script>
|
||
|
||
<div id="mdbook-content" class="content">
|
||
<main>
|
||
<h1 id="introduction"><a class="header" href="#introduction">Introduction</a></h1>
|
||
<h2 id="what-pdftract-does"><a class="header" href="#what-pdftract-does">What pdftract Does</a></h2>
|
||
<p>pdftract is a PDF text extraction library that gets the hard parts right. Unlike naive PDF parsers that dump text in the order it appears in the PDF file (which is rarely the correct reading order), pdftract understands document layout and recovers the logical structure that humans perceive when reading a page.</p>
|
||
<h3 id="core-features"><a class="header" href="#core-features">Core Features</a></h3>
|
||
<p><strong>Correct reading order</strong> — Layout regions are segmented and sequenced before text is emitted, handling multi-column pages, sidebars, footnotes, and mixed-layout documents without relying on PDF operator order. pdftract groups text into semantic blocks (headings, paragraphs, lists, tables) and outputs them in the order a human would read.</p>
|
||
<p><strong>Font encoding recovery</strong> — When <code>ToUnicode</code> CMaps are absent, wrong, or incomplete (a common problem in PDFs generated by legacy tools), pdftract works through a layered recovery pipeline: glyph name lookup via the Adobe Glyph List, font fingerprinting against known metrics and embedded checksums, and glyph outline shape matching. This means you get readable Unicode text even from broken PDFs.</p>
|
||
<p><strong>Structure tree extraction</strong> — PDF/UA and PDF/A documents encode their logical structure (headings, paragraphs, lists, tables, reading order) in a <code>StructTree</code>. pdftract reads this directly when present, producing accurate semantic output at no extra cost. Tagged PDFs yield near-perfect extraction.</p>
|
||
<p><strong>Per-page hybrid routing</strong> — Each page is independently classified and routed to the appropriate pipeline: vector text extraction (for pages with embedded fonts), full OCR (for scanned pages), or assisted OCR where vector hints improve raster accuracy. This hybrid approach optimizes for both accuracy and speed.</p>
|
||
<p><strong>Structured output with provenance</strong> — The primary output is JSON carrying per-span bounding boxes, font name, size, and confidence score alongside the extracted text, not a flat string dump. You get rich metadata that enables downstream processing: layout analysis, font-aware styling, highlight extraction, and confidence-based filtering.</p>
|
||
<h3 id="what-you-can-extract"><a class="header" href="#what-you-can-extract">What You Can Extract</a></h3>
|
||
<ul>
|
||
<li><strong>Text</strong> — Plain text or structured JSON with per-span provenance</li>
|
||
<li><strong>Layout</strong> — Bounding boxes for blocks, lines, and spans</li>
|
||
<li><strong>Metadata</strong> — Title, author, creation date, page count, PDF version</li>
|
||
<li><strong>Structure</strong> — Headings, paragraphs, lists, tables (when present in the PDF)</li>
|
||
<li><strong>Annotations</strong> — Comments, highlights, form fields (Phase 7)</li>
|
||
</ul>
|
||
<h3 id="what-pdftract-does-not-do"><a class="header" href="#what-pdftract-does-not-do">What pdftract Does Not Do</a></h3>
|
||
<p>pdftract is deliberately scoped. The following features are <strong>not</strong> in scope for v1.0.0:</p>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Non-goal</th><th>Alternative</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td>PDF authoring or writing</td><td><code>lopdf</code>, <code>pdfium-render</code>, <code>printpdf</code></td></tr>
|
||
<tr><td>Full PDF rendering / printing</td><td>PDFium, MuPDF, Poppler</td></tr>
|
||
<tr><td>Cryptographic signature validation</td><td><code>openssl smime</code>, dedicated PKI libraries</td></tr>
|
||
<tr><td>Translation of extracted text</td><td>LibreTranslate, DeepL, Argos</td></tr>
|
||
<tr><td>Summarization</td><td>LLM tools via the MCP server integration</td></tr>
|
||
<tr><td>OCR engine training</td><td>Tesseract’s <code>tesstrain</code> tooling</td></tr>
|
||
<tr><td>Filling out PDF forms</td><td>Form-filling tools with authoring support</td></tr>
|
||
<tr><td>Watermark removal</td><td>Detected and excluded from output, not removed from PDF</td></tr>
|
||
<tr><td>Password cracking</td><td><code>pdfcrack</code>, <code>john</code></td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<p>For the full rationale and scope-lock doctrine, see the <a href="../../plan/plan.html#non-goals">Non-Goals section</a> in the project plan.</p>
|
||
<h2 id="supported-pdf-features"><a class="header" href="#supported-pdf-features">Supported PDF Features</a></h2>
|
||
<p>pdftract supports PDF 1.4 through PDF 2.0, with varying levels of feature coverage:</p>
|
||
<ul>
|
||
<li><strong>Text extraction</strong> — Full support for Type 1, TrueType, OpenType, and CID-keyed fonts</li>
|
||
<li><strong>Compression</strong> — All standard filters (FlateDecode, ASCIIHex, ASCII85, RunLength, CCITT, DCT)</li>
|
||
<li><strong>Encryption</strong> — RC4 40-bit, RC4 128-bit, AES-128, AES-256 (password required)</li>
|
||
<li><strong>Structure trees</strong> — PDF/UA logical structure reading</li>
|
||
<li><strong>Forms</strong> — AcroForm and XFA field extraction (read-only)</li>
|
||
<li><strong>Signatures</strong> — Signature metadata extraction (validation not performed)</li>
|
||
<li><strong>Attachments</strong> — File attachment extraction</li>
|
||
<li><strong>Articles</strong> — Thread extraction for logical reading flows</li>
|
||
</ul>
|
||
<p>See the <a href="advanced">Advanced Topics</a> section for deep dives into specific features.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="installation"><a class="header" href="#installation">Installation</a></h1>
|
||
<p>pdftract is distributed as a native binary, a Python package, and a Docker image. Choose the installation method that matches your workflow.</p>
|
||
<h2 id="install-via-cargo"><a class="header" href="#install-via-cargo">Install via Cargo</a></h2>
|
||
<pre><code class="language-bash">cargo install pdftract
|
||
</code></pre>
|
||
<p>This installs the <code>pdftract</code> binary in <code>~/.cargo/bin/</code>. Make sure <code>~/.cargo/bin</code> is in your <code>PATH</code>.</p>
|
||
<h3 id="pre-built-binaries"><a class="header" href="#pre-built-binaries">Pre-built Binaries</a></h3>
|
||
<p>Pre-built binaries are available from <a href="https://github.com/jedarden/pdftract/releases">GitHub Releases</a>. Download the archive for your platform, extract, and place the binary in your <code>PATH</code>.</p>
|
||
<h3 id="cargo-binstall"><a class="header" href="#cargo-binstall">Cargo Binstall</a></h3>
|
||
<p>For faster installation without compiling from source:</p>
|
||
<pre><code class="language-bash">cargo binstall pdftract
|
||
</code></pre>
|
||
<p>This downloads a pre-built binary from the GitHub Release instead of compiling locally.</p>
|
||
<h2 id="install-via-pip"><a class="header" href="#install-via-pip">Install via pip</a></h2>
|
||
<p>pdftract is distributed on PyPI as a native Python extension with PyO3 bindings.</p>
|
||
<pre><code class="language-bash">pip install pdftract
|
||
</code></pre>
|
||
<p>The Python package includes the same extraction engine as the CLI, accessible via a Python API. See <a href="#python-sdk">Python SDK</a> for usage.</p>
|
||
<h3 id="platform-wheels"><a class="header" href="#platform-wheels">Platform Wheels</a></h3>
|
||
<p>Wheels are available for:</p>
|
||
<ul>
|
||
<li>Linux <code>x86_64</code> (manylinux2014, musllinux)</li>
|
||
<li>macOS <code>x86_64</code> and <code>arm64</code></li>
|
||
<li>Windows <code>x86_64</code></li>
|
||
</ul>
|
||
<p>If no wheel is available for your platform, pip will fall back to building from source (requires Rust toolchain).</p>
|
||
<h2 id="install-via-homebrew"><a class="header" href="#install-via-homebrew">Install via Homebrew</a></h2>
|
||
<p><strong>Note:</strong> The Homebrew formula is deferred to v1.1+. In the meantime, use <code>cargo install pdftract</code> or the Docker image.</p>
|
||
<p>See the <a href="../../plan/plan.html#non-goals">Non-Goals section</a> in the project plan for the rationale.</p>
|
||
<h2 id="install-via-docker"><a class="header" href="#install-via-docker">Install via Docker</a></h2>
|
||
<p>Docker images are available on GitHub Container Registry:</p>
|
||
<pre><code class="language-bash">docker pull ghcr.io/jedarden/pdftract:latest
|
||
docker run --rm -v $(pwd):/work ghcr.io/jedarden/pdftract:latest extract /work/document.pdf
|
||
</code></pre>
|
||
<h3 id="image-variants"><a class="header" href="#image-variants">Image Variants</a></h3>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Tag</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>latest</code></td><td>Default features (vector extraction, basic OCR)</td></tr>
|
||
<tr><td><code>ocr</code></td><td>Includes Tesseract for full OCR support</td></tr>
|
||
<tr><td><code>full</code></td><td>All features including PDFium for rasterization</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<p>Multi-arch manifests support <code>amd64</code> and <code>arm64</code> platforms.</p>
|
||
<h2 id="platform-support"><a class="header" href="#platform-support">Platform Support</a></h2>
|
||
<h3 id="supported-platforms"><a class="header" href="#supported-platforms">Supported Platforms</a></h3>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Platform</th><th>CI Status</th><th>Notes</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td>Linux <code>x86_64</code> (glibc)</td><td>Fully CI-tested</td><td>Primary development platform</td></tr>
|
||
<tr><td>Linux <code>x86_64</code> (musl)</td><td>Fully CI-tested</td><td>Alpine-compatible</td></tr>
|
||
<tr><td>Linux <code>arm64</code> (glibc)</td><td>Fully CI-tested</td><td>ARM64 servers (e.g., Graviton)</td></tr>
|
||
<tr><td>Linux <code>arm64</code> (musl)</td><td>Fully CI-tested</td><td>Alpine ARM64</td></tr>
|
||
<tr><td>macOS <code>x86_64</code></td><td>Build-tested</td><td>See caveat below</td></tr>
|
||
<tr><td>macOS <code>arm64</code></td><td>Build-tested</td><td>See caveat below</td></tr>
|
||
<tr><td>Windows <code>x86_64</code></td><td>Build-tested</td><td>See caveat below</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<h3 id="cross-platform-test-limitation-ku-12"><a class="header" href="#cross-platform-test-limitation-ku-12">Cross-Platform Test Limitation (KU-12)</a></h3>
|
||
<blockquote>
|
||
<p><strong>Linux is fully CI-tested; macOS and Windows are build-tested and manually smoke-tested per release.</strong></p>
|
||
</blockquote>
|
||
<p>Per project architecture decision ADR-009, the CI pipeline runs on Linux-only infrastructure (<code>iad-ci</code>). macOS and Windows binaries are <strong>built</strong> via cross-compilation but are never <strong>executed</strong> in automated CI. This is acknowledged as Known Unknown KU-12 with the following mitigation:</p>
|
||
<ul>
|
||
<li>A manual smoke-test runbook is executed by the release lead before each milestone against at least one physical macOS machine and one Windows VM</li>
|
||
<li>User bug reports for platform-specific issues are acknowledged within 48 hours and addressed in the next patch release</li>
|
||
<li>No claim of “tested on macOS/Windows” appears in CI status badges</li>
|
||
</ul>
|
||
<p>If you encounter a platform-specific issue on macOS or Windows, please file a bug report. The project is committed to fixing platform bugs promptly.</p>
|
||
<h3 id="minimum-rust-version"><a class="header" href="#minimum-rust-version">Minimum Rust Version</a></h3>
|
||
<p>If building from source, pdftract requires Rust 1.78 or later. The MSRV is pinned in <code>Cargo.toml</code> and tested on every PR.</p>
|
||
<h2 id="verifying-installation"><a class="header" href="#verifying-installation">Verifying Installation</a></h2>
|
||
<p>Run the following command to verify your installation:</p>
|
||
<pre><code class="language-bash">pdftract --version
|
||
</code></pre>
|
||
<p>You should see output like:</p>
|
||
<pre><code>pdftract 0.1.0
|
||
</code></pre>
|
||
<p>For the Python package:</p>
|
||
<pre><code class="language-bash">python -c "import pdftract; print(pdftract.__version__)"
|
||
</code></pre>
|
||
<h2 id="next-steps"><a class="header" href="#next-steps">Next Steps</a></h2>
|
||
<p>Once installed, proceed to the <a href="#quickstart">Quickstart</a> for a five-minute walkthrough of pdftract’s core features.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="quickstart"><a class="header" href="#quickstart">Quickstart</a></h1>
|
||
<p>This five-minute walkthrough covers the core pdftract workflow: extract text from a PDF, inspect the structured JSON output, and try profile-based extraction.</p>
|
||
<h2 id="five-minute-walkthrough"><a class="header" href="#five-minute-walkthrough">Five-Minute Walkthrough</a></h2>
|
||
<h3 id="prerequisites"><a class="header" href="#prerequisites">Prerequisites</a></h3>
|
||
<ul>
|
||
<li>pdftract installed (see <a href="#installation">Installation</a>)</li>
|
||
<li>A PDF file to extract (any PDF will do)</li>
|
||
</ul>
|
||
<p>If you don’t have a PDF handy, you can use the sample fixtures from the pdftract repository:</p>
|
||
<pre><code class="language-bash">git clone https://github.com/jedarden/pdftract.git
|
||
cd pdftract
|
||
</code></pre>
|
||
<h3 id="extract-your-first-pdf"><a class="header" href="#extract-your-first-pdf">Extract Your First PDF</a></h3>
|
||
<p>The simplest extraction outputs plain text to stdout:</p>
|
||
<pre><code class="language-bash">pdftract extract path/to/document.pdf --text
|
||
</code></pre>
|
||
<p>For structured JSON output (default):</p>
|
||
<pre><code class="language-bash">pdftract extract path/to/document.pdf --output result.json
|
||
</code></pre>
|
||
<p>Or view JSON directly in your terminal (pipe to <code>jq</code> for pretty-printing):</p>
|
||
<pre><code class="language-bash">pdftract extract path/to/document.pdf | jq .
|
||
</code></pre>
|
||
<h3 id="inspect-the-output"><a class="header" href="#inspect-the-output">Inspect the Output</a></h3>
|
||
<p>The JSON output contains:</p>
|
||
<ul>
|
||
<li><strong><code>pages</code></strong> — Array of page objects, each with <code>blocks</code> and <code>spans</code></li>
|
||
<li><strong><code>blocks</code></strong> — Semantic elements (headings, paragraphs, lists) with reading order</li>
|
||
<li><strong><code>spans</code></strong> — Text fragments with bounding boxes, font metadata, and confidence scores</li>
|
||
<li><strong><code>metadata</code></strong> — Document title, author, page count, PDF version</li>
|
||
</ul>
|
||
<p>Example:</p>
|
||
<pre><code class="language-json">{
|
||
"pages": [
|
||
{
|
||
"page": 1,
|
||
"width": 612,
|
||
"height": 792,
|
||
"blocks": [
|
||
{
|
||
"kind": "heading",
|
||
"text": "Introduction",
|
||
"bbox": [72, 680, 400, 700],
|
||
"level": 1
|
||
},
|
||
{
|
||
"kind": "paragraph",
|
||
"text": "This is the first paragraph...",
|
||
"bbox": [72, 640, 540, 670]
|
||
}
|
||
],
|
||
"spans": [
|
||
{
|
||
"text": "Introduction",
|
||
"bbox": [72, 680, 400, 700],
|
||
"font": "Times-Bold",
|
||
"size": 14.0,
|
||
"confidence": 0.99
|
||
}
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"title": "Sample Document",
|
||
"author": "John Doe",
|
||
"page_count": 1,
|
||
"pdf_version": "1.4"
|
||
}
|
||
}
|
||
</code></pre>
|
||
<h3 id="try-auto-profile-mode"><a class="header" href="#try-auto-profile-mode">Try Auto-Profile Mode</a></h3>
|
||
<p>pdftract includes built-in profiles for common document types (invoices, receipts, contracts, etc.). Use <code>--auto</code> to automatically detect the profile:</p>
|
||
<pre><code class="language-bash">pdftract extract invoice.pdf --auto
|
||
</code></pre>
|
||
<p>The auto-detected profile is logged to stderr:</p>
|
||
<pre><code>[INFO] Detected profile: invoice
|
||
</code></pre>
|
||
<p>Profiles optimize extraction for specific document layouts:</p>
|
||
<ul>
|
||
<li><strong>invoice</strong> — Extract line items, totals, vendor info</li>
|
||
<li><strong>receipt</strong> — Extract merchant, date, line items, tax, total</li>
|
||
<li><strong>contract</strong> — Extract parties, effective date, clauses</li>
|
||
<li><strong>bank_statement</strong> — Extract account info, statement period, transactions</li>
|
||
</ul>
|
||
<p>See <a href="#available-profiles">Profiles</a> for the full list.</p>
|
||
<h3 id="batch-processing"><a class="header" href="#batch-processing">Batch Processing</a></h3>
|
||
<p>To extract multiple PDFs in a folder:</p>
|
||
<pre><code class="language-bash">pdftract extract *.pdf --output-dir results/
|
||
</code></pre>
|
||
<p>Each PDF produces a corresponding JSON file in <code>results/</code>:</p>
|
||
<pre><code>results/
|
||
invoice1.pdf.json
|
||
invoice2.pdf.json
|
||
receipt.pdf.json
|
||
</code></pre>
|
||
<p>To search recursively across every PDF in a folder, use the <code>grep</code> command:</p>
|
||
<pre><code class="language-bash">pdftract grep "search term" /path/to/folder
|
||
</code></pre>
|
||
<p>This outputs matching filenames and page numbers:</p>
|
||
<pre><code>invoice.pdf:3: "search term" found on page 3
|
||
receipt.pdf:1: "search term" found on page 1
|
||
</code></pre>
|
||
<h2 id="common-options"><a class="header" href="#common-options">Common Options</a></h2>
|
||
<div class="table-wrapper">
|
||
<table>
|
||
<thead>
|
||
<tr><th>Option</th><th>Description</th></tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr><td><code>--output FILE</code></td><td>Write output to file instead of stdout</td></tr>
|
||
<tr><td><code>--text</code></td><td>Output plain text instead of JSON</td></tr>
|
||
<tr><td><code>--output-dir DIR</code></td><td>Directory for batch output (with <code>*</code> glob)</td></tr>
|
||
<tr><td><code>--auto</code></td><td>Auto-detect and apply document profile</td></tr>
|
||
<tr><td><code>--profile NAME</code></td><td>Use specific profile (skip auto-detection)</td></tr>
|
||
<tr><td><code>--password PASS</code></td><td>Password for encrypted PDFs</td></tr>
|
||
<tr><td><code>--pages N-M</code></td><td>Extract specific page range</td></tr>
|
||
<tr><td><code>--ocr</code></td><td>Force OCR mode for all pages</td></tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<p>See <a href="cli">CLI Reference</a> for complete command documentation.</p>
|
||
<h2 id="whats-next"><a class="header" href="#whats-next">What’s Next?</a></h2>
|
||
<ul>
|
||
<li>Explore the <a href="cli">CLI Reference</a> for advanced options</li>
|
||
<li>Read <a href="schema">JSON Schema Reference</a> for output format details</li>
|
||
<li>Check <a href="#profiles">Profiles</a> for document-type-specific extraction</li>
|
||
<li>Try the <a href="#python-sdk">Python SDK</a> for programmatic access</li>
|
||
</ul>
|
||
<h2 id="troubleshooting"><a class="header" href="#troubleshooting">Troubleshooting</a></h2>
|
||
<p><strong>Extraction fails with “unsupported encryption”</strong></p>
|
||
<p>The PDF is encrypted with a password. Use <code>--password</code>:</p>
|
||
<pre><code class="language-bash">pdftract extract encrypted.pdf --password yourpassword
|
||
</code></pre>
|
||
<p><strong>Output has wrong reading order</strong></p>
|
||
<p>Some PDFs have malformed internal structure. Try <code>--auto</code> to enable profile-based layout recovery, or use <code>--ocr</code> to force OCR-based extraction.</p>
|
||
<p><strong>Poor accuracy on scanned documents</strong></p>
|
||
<p>Ensure the OCR features are enabled. The Docker <code>:ocr</code> and <code>:full</code> images include Tesseract. If building from source, enable the <code>ocr</code> feature:</p>
|
||
<pre><code class="language-bash">cargo install pdftract --features ocr
|
||
</code></pre>
|
||
<p>For more help, see the <a href="#troubleshooting-1">Troubleshooting</a> section.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="cli-reference"><a class="header" href="#cli-reference">CLI Reference</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This section is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Complete command-line interface documentation.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="global-options"><a class="header" href="#global-options">Global Options</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>See the main pdftract repository for CLI usage details.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="extract"><a class="header" href="#extract">extract</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extract text and structure from a PDF file.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="serve"><a class="header" href="#serve">serve</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Start an HTTP server for PDF extraction.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="grep"><a class="header" href="#grep">grep</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Search for text across multiple PDF files.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="inspect"><a class="header" href="#inspect">inspect</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Inspect PDF structure and metadata.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="mcp"><a class="header" href="#mcp">mcp</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Run pdftract as an MCP (Model Context Protocol) server.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="json-schema-reference"><a class="header" href="#json-schema-reference">JSON Schema Reference</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This section is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Complete JSON output format documentation.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="output-format"><a class="header" href="#output-format">Output Format</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Describes the JSON schema for pdftract output.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="block-types"><a class="header" href="#block-types">Block Types</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Describes the semantic block types (heading, paragraph, list, table, etc.).</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="metadata"><a class="header" href="#metadata">Metadata</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Describes the document metadata fields.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="error-handling"><a class="header" href="#error-handling">Error Handling</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Describes how errors are reported in the JSON output.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="profiles"><a class="header" href="#profiles">Profiles</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This section is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Document-type-specific extraction profiles.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="available-profiles"><a class="header" href="#available-profiles">Available Profiles</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Lists all available document profiles.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="invoice-profile"><a class="header" href="#invoice-profile">invoice Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for invoice documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="receipt-profile"><a class="header" href="#receipt-profile">receipt Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for receipt documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="bank_statement-profile"><a class="header" href="#bank_statement-profile">bank_statement Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for bank statement documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="contract-profile"><a class="header" href="#contract-profile">contract Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for contract documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="legal_filing-profile"><a class="header" href="#legal_filing-profile">legal_filing Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for legal filing documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="form-profile"><a class="header" href="#form-profile">form Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for form documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="scientific_paper-profile"><a class="header" href="#scientific_paper-profile">scientific_paper Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for scientific paper documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="book_chapter-profile"><a class="header" href="#book_chapter-profile">book_chapter Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for book chapter documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="slide_deck-profile"><a class="header" href="#slide_deck-profile">slide_deck Profile</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extraction configuration for slide deck documents.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="custom-profiles"><a class="header" href="#custom-profiles">Custom Profiles</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>How to create and use custom extraction profiles.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="sdk-quickstarts"><a class="header" href="#sdk-quickstarts">SDK Quickstarts</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This section is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Getting started guides for using pdftract from various programming languages.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="python-sdk"><a class="header" href="#python-sdk">Python SDK</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Using pdftract from Python.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="rust-sdk"><a class="header" href="#rust-sdk">Rust SDK</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Using pdftract from Rust.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="javascripttypescript-sdk"><a class="header" href="#javascripttypescript-sdk">JavaScript/TypeScript SDK</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Using pdftract from JavaScript/TypeScript (Node.js).</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="go-sdk"><a class="header" href="#go-sdk">Go SDK</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Using pdftract from Go.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="advanced-topics"><a class="header" href="#advanced-topics">Advanced Topics</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This section is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Deep dives into pdftract’s internals and advanced configuration.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="ocr-configuration"><a class="header" href="#ocr-configuration">OCR Configuration</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Configuring Tesseract and OCR settings.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="font-encoding-recovery"><a class="header" href="#font-encoding-recovery">Font Encoding Recovery</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>How pdftract recovers text from fonts with broken or missing ToUnicode mappings.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="structure-tree-extraction"><a class="header" href="#structure-tree-extraction">Structure Tree Extraction</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Extracting logical structure from tagged PDFs.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="hybrid-routing"><a class="header" href="#hybrid-routing">Hybrid Routing</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>How pdftract routes each page to the optimal extraction pipeline.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="provenance-and-confidence"><a class="header" href="#provenance-and-confidence">Provenance and Confidence</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Understanding bounding boxes, font metadata, and confidence scores.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="troubleshooting-1"><a class="header" href="#troubleshooting-1">Troubleshooting</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This section is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Debugging and performance tuning for pdftract.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="common-issues"><a class="header" href="#common-issues">Common Issues</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Solutions to common extraction problems.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="diagnostics"><a class="header" href="#diagnostics">Diagnostics</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Using pdftract’s diagnostic features for debugging.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="performance-tuning"><a class="header" href="#performance-tuning">Performance Tuning</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Optimizing extraction speed and memory usage.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div>
|
||
<h1 id="faq"><a class="header" href="#faq">FAQ</a></h1>
|
||
<blockquote>
|
||
<p><strong>Draft</strong> — This page is a placeholder for future content.</p>
|
||
</blockquote>
|
||
<p>Frequently asked questions about pdftract.</p>
|
||
|
||
</main>
|
||
|
||
<nav class="nav-wrapper" aria-label="Page navigation">
|
||
<!-- Mobile navigation buttons -->
|
||
|
||
|
||
<div style="clear: both"></div>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
|
||
<nav class="nav-wide-wrapper" aria-label="Page navigation">
|
||
|
||
</nav>
|
||
|
||
</div>
|
||
|
||
<template id=fa-eye><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 576 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M288 32c-80.8 0-145.5 36.8-192.6 80.6C48.6 156 17.3 208 2.5 243.7c-3.3 7.9-3.3 16.7 0 24.6C17.3 304 48.6 356 95.4 399.4C142.5 443.2 207.2 480 288 480s145.5-36.8 192.6-80.6c46.8-43.5 78.1-95.4 93-131.1c3.3-7.9 3.3-16.7 0-24.6c-14.9-35.7-46.2-87.7-93-131.1C433.5 68.8 368.8 32 288 32zM432 256c0 79.5-64.5 144-144 144s-144-64.5-144-144s64.5-144 144-144s144 64.5 144 144zM288 192c0 35.3-28.7 64-64 64c-11.5 0-22.3-3-31.6-8.4c-.2 2.8-.4 5.5-.4 8.4c0 53 43 96 96 96s96-43 96-96s-43-96-96-96c-2.8 0-5.6 .1-8.4 .4c5.3 9.3 8.4 20.1 8.4 31.6z"/></svg></span></template>
|
||
<template id=fa-eye-slash><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 640 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M38.8 5.1C28.4-3.1 13.3-1.2 5.1 9.2S-1.2 34.7 9.2 42.9l592 464c10.4 8.2 25.5 6.3 33.7-4.1s6.3-25.5-4.1-33.7L525.6 386.7c39.6-40.6 66.4-86.1 79.9-118.4c3.3-7.9 3.3-16.7 0-24.6c-14.9-35.7-46.2-87.7-93-131.1C465.5 68.8 400.8 32 320 32c-68.2 0-125 26.3-169.3 60.8L38.8 5.1zM223.1 149.5C248.6 126.2 282.7 112 320 112c79.5 0 144 64.5 144 144c0 24.9-6.3 48.3-17.4 68.7L408 294.5c5.2-11.8 8-24.8 8-38.5c0-53-43-96-96-96c-2.8 0-5.6 .1-8.4 .4c5.3 9.3 8.4 20.1 8.4 31.6c0 10.2-2.4 19.8-6.6 28.3l-90.3-70.8zm223.1 298L373 389.9c-16.4 6.5-34.3 10.1-53 10.1c-79.5 0-144-64.5-144-144c0-6.9 .5-13.6 1.4-20.2L83.1 161.5C60.3 191.2 44 220.8 34.5 243.7c-3.3 7.9-3.3 16.7 0 24.6c14.9 35.7 46.2 87.7 93 131.1C174.5 443.2 239.2 480 320 480c47.8 0 89.9-12.9 126.2-32.5z"/></svg></span></template>
|
||
<template id=fa-copy><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M502.6 70.63l-61.25-61.25C435.4 3.371 427.2 0 418.7 0H255.1c-35.35 0-64 28.66-64 64l.0195 256C192 355.4 220.7 384 256 384h192c35.2 0 64-28.8 64-64V93.25C512 84.77 508.6 76.63 502.6 70.63zM464 320c0 8.836-7.164 16-16 16H255.1c-8.838 0-16-7.164-16-16L239.1 64.13c0-8.836 7.164-16 16-16h128L384 96c0 17.67 14.33 32 32 32h47.1V320zM272 448c0 8.836-7.164 16-16 16H63.1c-8.838 0-16-7.164-16-16L47.98 192.1c0-8.836 7.164-16 16-16H160V128H63.99c-35.35 0-64 28.65-64 64l.0098 256C.002 483.3 28.66 512 64 512h192c35.2 0 64-28.8 64-64v-32h-47.1L272 448z"/></svg></span></template>
|
||
<template id=fa-play><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 384 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M73 39c-14.8-9.1-33.4-9.4-48.5-.9S0 62.6 0 80V432c0 17.4 9.4 33.4 24.5 41.9s33.7 8.1 48.5-.9L361 297c14.3-8.7 23-24.2 23-41s-8.7-32.2-23-41L73 39z"/></svg></span></template>
|
||
<template id=fa-clock-rotate-left><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M75 75L41 41C25.9 25.9 0 36.6 0 57.9V168c0 13.3 10.7 24 24 24H134.1c21.4 0 32.1-25.9 17-41l-30.8-30.8C155 85.5 203 64 256 64c106 0 192 86 192 192s-86 192-192 192c-40.8 0-78.6-12.7-109.7-34.4c-14.5-10.1-34.4-6.6-44.6 7.9s-6.6 34.4 7.9 44.6C151.2 495 201.7 512 256 512c141.4 0 256-114.6 256-256S397.4 0 256 0C185.3 0 121.3 28.7 75 75zm181 53c-13.3 0-24 10.7-24 24V256c0 6.4 2.5 12.5 7 17l72 72c9.4 9.4 24.6 9.4 33.9 0s9.4-24.6 0-33.9l-65-65V152c0-13.3-10.7-24-24-24z"/></svg></span></template>
|
||
|
||
|
||
|
||
<script>
|
||
window.playground_copyable = true;
|
||
</script>
|
||
|
||
|
||
<script src="elasticlunr-ef4e11c1.min.js"></script>
|
||
<script src="mark-09e88c2c.min.js"></script>
|
||
<script src="searcher-c2a407aa.js"></script>
|
||
|
||
<script src="clipboard-1626706a.min.js"></script>
|
||
<script src="highlight-abc7f01d.js"></script>
|
||
<script src="book-a0b12cfe.js"></script>
|
||
|
||
<!-- Custom JS scripts -->
|
||
|
||
<script>
|
||
window.addEventListener('load', function() {
|
||
window.setTimeout(window.print, 100);
|
||
});
|
||
</script>
|
||
|
||
|
||
</div>
|
||
</body>
|
||
</html>
|