pdftract/docs/user-docs/build/user-docs/faq.html
jedarden 6000c654ce fix: resolve compilation errors across codebase
- Fixed missing fields in BlockJson, SpanJson, ExtractionOptions initializations
- Added feature gates to ocr_integration tests for conditional compilation
- Fixed McpServerState::new calls to include audit writer argument
- Fixed CCITTFaxDecoder::decode calls to use instance method
- Fixed type casts for ObjRef::new calls
- Fixed serde_json::Value method calls (is_some -> !is_null)
- Fixed ProfileType test feature gates
- Worked around lifetime issues in schema roundtrip tests

These changes fix numerous compilation errors that were blocking the
codebase from building. The main library and tests now compile successfully.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 08:38:04 -04:00

599 lines
40 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE HTML>
<html lang="en" class="light sidebar-visible" dir="ltr">
<head>
<!-- Book generated using mdBook -->
<meta charset="UTF-8">
<title>FAQ - pdftract User Documentation</title>
<!-- Custom HTML head -->
<meta name="description" content="">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#ffffff">
<link rel="icon" href="favicon-de23e50b.svg">
<link rel="shortcut icon" href="favicon-8114d1fc.png">
<link rel="stylesheet" href="css/variables-8adf115d.css">
<link rel="stylesheet" href="css/general-2459343d.css">
<link rel="stylesheet" href="css/chrome-ae938929.css">
<link rel="stylesheet" href="css/print-9e4910d8.css" media="print">
<!-- Fonts -->
<link rel="stylesheet" href="fonts/fonts-9644e21d.css">
<!-- Highlight.js Stylesheets -->
<link rel="stylesheet" id="mdbook-highlight-css" href="highlight-493f70e1.css">
<link rel="stylesheet" id="mdbook-tomorrow-night-css" href="tomorrow-night-4c0ae647.css">
<link rel="stylesheet" id="mdbook-ayu-highlight-css" href="ayu-highlight-3fdfc3ac.css">
<!-- Custom theme stylesheets -->
<!-- Provide site root and default themes to javascript -->
<script>
// Page-level configuration exposed as globals for the scripts that follow.
// Relative prefix from this page to the site root (an empty string on this page).
const path_to_root = "";
// Theme class names used as fallbacks when no theme is stored; the theme
// script further down picks one based on the prefers-color-scheme media query.
const default_light_theme = "light";
const default_dark_theme = "navy";
// File name of the search index script.
// NOTE(review): presumably loaded on demand by the search code — confirm.
window.path_to_searchindex_js = "searchindex-fc6d8bf8.js";
</script>
<!-- Start loading toc.js asap -->
<script src="toc-d0f907c9.js"></script>
</head>
<body>
<div id="mdbook-help-container">
<div id="mdbook-help-popup">
<h2 class="mdbook-help-title">Keyboard shortcuts</h2>
<div>
<p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
<p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
<p>Press <kbd>?</kbd> to show this help</p>
<p>Press <kbd>Esc</kbd> to hide this help</p>
</div>
</div>
</div>
<div id="mdbook-body-container">
<!-- Work around some values being stored in localStorage wrapped in quotes -->
<script>
// Some values were historically stored in localStorage wrapped in double
// quotes (e.g. '"navy"'). Strip that wrapping so later scripts can compare the
// stored values directly.
//
// Each key is handled independently: localStorage.getItem returns null for a
// key that was never set, and calling startsWith on null would throw and abort
// the clean-up of the remaining key.
try {
    for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
        const stored = localStorage.getItem(key);
        // Require at least two characters so a lone '"' is not mistaken for a
        // quoted value and rewritten to an empty string.
        if (stored !== null && stored.length >= 2 && stored.startsWith('"') && stored.endsWith('"')) {
            localStorage.setItem(key, stored.slice(1, stored.length - 1));
        }
    }
} catch (e) {
    // Storage can be unavailable (e.g. blocked by the browser); this clean-up
    // is best-effort, so failures are deliberately ignored.
}
</script>
<!-- Set the theme before any content is loaded, prevents flash -->
<script>
// Pick the theme class before any content renders so the page does not flash
// in the wrong colours: a stored preference wins, otherwise the OS-level
// light/dark setting decides.
const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches
    ? default_dark_theme
    : default_light_theme;
let theme;
try {
    theme = localStorage.getItem('mdbook-theme');
} catch (e) {
    // Storage unavailable: `theme` stays undefined and the default applies.
}
// Loose equality covers both null (key absent) and undefined (read failed).
if (theme == null) {
    theme = default_theme;
}
const html = document.documentElement;
// Swap the 'light' class hard-coded on <html> for the selected theme and mark
// the document as script-enabled.
html.classList.remove('light');
html.classList.add(theme);
html.classList.add("js");
</script>
<input type="checkbox" id="mdbook-sidebar-toggle-anchor" class="hidden">
<!-- Hide / unhide sidebar before it is displayed -->
<script>
// Decide whether the sidebar starts open before it is painted.
// `sidebar` ends up as a state string; anything other than 'visible' is
// treated as hidden below.
let sidebar = null;
const sidebar_toggle = document.getElementById("mdbook-sidebar-toggle-anchor");
if (document.body.clientWidth < 1080) {
    // Narrow viewport: always start collapsed, ignoring any stored preference.
    sidebar = 'hidden';
    sidebar_toggle.checked = false;
} else {
    // Wide viewport: honour the stored preference, defaulting to visible when
    // nothing usable is stored or storage cannot be read.
    try {
        sidebar = localStorage.getItem('mdbook-sidebar');
    } catch (e) { }
    if (!sidebar) {
        sidebar = 'visible';
    }
}
if (sidebar !== 'visible') {
    // Drop the class that is hard-coded on <html> so the layout starts collapsed.
    html.classList.remove('sidebar-visible');
} else {
    sidebar_toggle.checked = true;
}
</script>
<nav id="mdbook-sidebar" class="sidebar" aria-label="Table of contents">
<!-- populated by js -->
<mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
<noscript>
<!-- No-JS fallback for the table of contents; the title gives the frame an accessible name. -->
<iframe class="sidebar-iframe-outer" src="toc.html" title="Table of contents"></iframe>
</noscript>
<div id="mdbook-sidebar-resize-handle" class="sidebar-resize-handle">
<div class="sidebar-resize-indicator"></div>
</div>
</nav>
<div id="mdbook-page-wrapper" class="page-wrapper">
<div class="page">
<div id="mdbook-menu-bar-hover-placeholder"></div>
<div id="mdbook-menu-bar" class="menu-bar sticky">
<div class="left-buttons">
<label id="mdbook-sidebar-toggle" class="icon-button" for="mdbook-sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="mdbook-sidebar">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M0 96C0 78.3 14.3 64 32 64H416c17.7 0 32 14.3 32 32s-14.3 32-32 32H32C14.3 128 0 113.7 0 96zM0 256c0-17.7 14.3-32 32-32H416c17.7 0 32 14.3 32 32s-14.3 32-32 32H32c-17.7 0-32-14.3-32-32zM448 416c0 17.7-14.3 32-32 32H32c-17.7 0-32-14.3-32-32s14.3-32 32-32H416c17.7 0 32 14.3 32 32z"/></svg></span>
</label>
<button id="mdbook-theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="mdbook-theme-list">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 576 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M371.3 367.1c27.3-3.9 51.9-19.4 67.2-42.9L600.2 74.1c12.6-19.5 9.4-45.3-7.6-61.2S549.7-4.4 531.1 9.6L294.4 187.2c-24 18-38.2 46.1-38.4 76.1L371.3 367.1zm-19.6 25.4l-116-104.4C175.9 290.3 128 339.6 128 400c0 3.9 .2 7.8 .6 11.6c1.8 17.5-10.2 36.4-27.8 36.4H96c-17.7 0-32 14.3-32 32s14.3 32 32 32H240c61.9 0 112-50.1 112-112c0-2.5-.1-5-.2-7.5z"/></svg></span>
</button>
<ul id="mdbook-theme-list" class="theme-popup" aria-label="Themes" role="menu">
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-default_theme">Auto</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-light">Light</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-rust">Rust</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-coal">Coal</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-navy">Navy</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-ayu">Ayu</button></li>
</ul>
<button id="mdbook-search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="mdbook-searchbar">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M416 208c0 45.9-14.9 88.3-40 122.7L502.6 457.4c12.5 12.5 12.5 32.8 0 45.3s-32.8 12.5-45.3 0L330.7 376c-34.4 25.2-76.8 40-122.7 40C93.1 416 0 322.9 0 208S93.1 0 208 0S416 93.1 416 208zM208 352c79.5 0 144-64.5 144-144s-64.5-144-144-144S64 128.5 64 208s64.5 144 144 144z"/></svg></span>
</button>
</div>
<h1 class="menu-title">pdftract User Documentation</h1>
<div class="right-buttons">
<a href="print.html" title="Print this book" aria-label="Print this book">
<span class=fa-svg id="print-button"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M128 0C92.7 0 64 28.7 64 64v96h64V64H354.7L384 93.3V160h64V93.3c0-17-6.7-33.3-18.7-45.3L400 18.7C388 6.7 371.7 0 354.7 0H128zM384 352v32 64H128V384 368 352H384zm64 32h32c17.7 0 32-14.3 32-32V256c0-35.3-28.7-64-64-64H64c-35.3 0-64 28.7-64 64v96c0 17.7 14.3 32 32 32H64v64c0 35.3 28.7 64 64 64H384c35.3 0 64-28.7 64-64V384zm-16-88c-13.3 0-24-10.7-24-24s10.7-24 24-24s24 10.7 24 24s-10.7 24-24 24z"/></svg></span>
</a>
<a href="https://github.com/jedarden/pdftract" title="Git repository" aria-label="Git repository">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 496 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6zm-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3zm44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9zM244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8zM97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1zm-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7zm32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1zm-11.4-14.7c-1.6 1-1.6 3.6 0 5.9 1.6 2.3 4.3 3.3 5.6 2.3 1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2z"/></svg></span>
</a>
<a href="https://github.com/jedarden/pdftract/edit/main/docs/user-docs/src/src/faq.md" title="Suggest an edit" aria-label="Suggest an edit" rel="edit">
<span class=fa-svg id="git-edit-button"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M421.7 220.3l-11.3 11.3-22.6 22.6-205 205c-6.6 6.6-14.8 11.5-23.8 14.1L30.8 511c-8.4 2.5-17.5 .2-23.7-6.1S-1.5 489.7 1 481.2L38.7 353.1c2.6-9 7.5-17.2 14.1-23.8l205-205 22.6-22.6 11.3-11.3 33.9 33.9 62.1 62.1 33.9 33.9zM96 353.9l-9.3 9.3c-.9 .9-1.6 2.1-2 3.4l-25.3 86 86-25.3c1.3-.4 2.5-1.1 3.4-2l9.3-9.3H112c-8.8 0-16-7.2-16-16V353.9zM453.3 19.3l39.4 39.4c25 25 25 65.5 0 90.5l-14.5 14.5-22.6 22.6-11.3 11.3-33.9-33.9-62.1-62.1L314.3 67.7l11.3-11.3 22.6-22.6 14.5-14.5c25-25 65.5-25 90.5 0z"/></svg></span>
</a>
</div>
</div>
<div id="mdbook-search-wrapper" class="hidden">
<form id="mdbook-searchbar-outer" class="searchbar-outer">
<div class="search-wrapper">
<!-- aria-describedby must reference the id of the results header element (mdbook-searchresults-header). -->
<input type="search" id="mdbook-searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="mdbook-searchresults-outer" aria-describedby="mdbook-searchresults-header">
<div class="spinner-wrapper">
<span class=fa-svg id="fa-spin"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M304 48c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zm0 416c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zM48 304c26.5 0 48-21.5 48-48s-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48zm464-48c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zM142.9 437c18.7-18.7 18.7-49.1 0-67.9s-49.1-18.7-67.9 0s-18.7 49.1 0 67.9s49.1 18.7 67.9 0zm0-294.2c18.7-18.7 18.7-49.1 0-67.9S93.7 56.2 75 75s-18.7 49.1 0 67.9s49.1 18.7 67.9 0zM369.1 437c18.7 18.7 49.1 18.7 67.9 0s18.7-49.1 0-67.9s-49.1-18.7-67.9 0s-18.7 49.1 0 67.9z"/></svg></span>
</div>
</div>
</form>
<div id="mdbook-searchresults-outer" class="searchresults-outer hidden">
<div id="mdbook-searchresults-header" class="searchresults-header"></div>
<ul id="mdbook-searchresults">
</ul>
</div>
</div>
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
<script>
// Mirror the initial sidebar state into ARIA attributes and link focusability.
// The braces keep the helper constant out of the global scope shared by the
// page's other scripts.
{
    const sidebarShown = sidebar === 'visible';
    const toggleButton = document.getElementById('mdbook-sidebar-toggle');
    toggleButton.setAttribute('aria-expanded', sidebarShown);
    const sidebarNav = document.getElementById('mdbook-sidebar');
    sidebarNav.setAttribute('aria-hidden', !sidebarShown);
    // Links inside a hidden sidebar are taken out of the tab order.
    Array.from(document.querySelectorAll('#mdbook-sidebar a')).forEach((anchor) => {
        anchor.setAttribute('tabIndex', sidebarShown ? 0 : -1);
    });
}
</script>
<div id="mdbook-content" class="content">
<main>
<h1 id="faq"><a class="header" href="#faq">FAQ</a></h1>
<p>Frequently asked questions about pdftract.</p>
<h2 id="table-of-contents"><a class="header" href="#table-of-contents">Table of Contents</a></h2>
<ul>
<li><a href="#general">General</a>
<ul>
<li><a href="#what-is-pdftract">What is pdftract?</a></li>
<li><a href="#whats-the-difference-between-extract-and-extract_text">What's the difference between extract and extract_text?</a></li>
<li><a href="#does-pdftract-execute-javascript-embedded-in-pdfs">Does pdftract execute JavaScript embedded in PDFs?</a></li>
<li><a href="#how-do-i-cite-an-extracted-snippet">How do I cite an extracted snippet?</a></li>
</ul>
</li>
<li><a href="#installation-and-setup">Installation and Setup</a>
<ul>
<li><a href="#how-do-i-install-pdftract">How do I install pdftract?</a></li>
<li><a href="#how-do-i-run-pdftract-behind-a-corporate-proxy">How do I run pdftract behind a corporate proxy?</a></li>
<li><a href="#what-are-the-system-requirements">What are the system requirements?</a></li>
</ul>
</li>
<li><a href="#usage">Usage</a>
<ul>
<li><a href="#why-is-my-pdf-returning-broken_vector">Why is my PDF returning broken_vector?</a></li>
<li><a href="#why-is-ocr-slow">Why is OCR slow?</a></li>
<li><a href="#how-do-i-extract-text-from-a-specific-page-range">How do I extract text from a specific page range?</a></li>
<li><a href="#how-do-i-extract-images-from-a-pdf">How do I extract images from a PDF?</a></li>
<li><a href="#can-i-process-multiple-pdfs-at-once">Can I process multiple PDFs at once?</a></li>
</ul>
</li>
<li><a href="#configuration">Configuration</a>
<ul>
<li><a href="#how-do-i-add-a-custom-profile">How do I add a custom profile?</a></li>
<li><a href="#how-do-i-adjust-ocr-accuracy">How do I adjust OCR accuracy?</a></li>
<li><a href="#how-do-i-disable-ocr-for-faster-processing">How do I disable OCR for faster processing?</a></li>
<li><a href="#what-are-confidence-scores-and-how-do-i-use-them">What are confidence scores and how do I use them?</a></li>
</ul>
</li>
<li><a href="#output-and-formats">Output and Formats</a>
<ul>
<li><a href="#how-do-i-get-output-in-markdown-format">How do I get output in Markdown format?</a></li>
<li><a href="#how-do-i-preserve-table-structure">How do I preserve table structure?</a></li>
<li><a href="#can-i-extract-metadata-from-pdfs">Can I extract metadata from PDFs?</a></li>
<li><a href="#how-do-i-handle-password-protected-pdfs">How do I handle password-protected PDFs?</a></li>
</ul>
</li>
<li><a href="#troubleshooting">Troubleshooting</a>
<ul>
<li><a href="#why-is-extraction-failing-with-an-error">Why is extraction failing with an error?</a></li>
<li><a href="#why-is-my-output-empty-or-incomplete">Why is my output empty or incomplete?</a></li>
<li><a href="#how-do-i-debug-extraction-issues">How do I debug extraction issues?</a></li>
<li><a href="#why-does-extraction-use-so-much-memory">Why does extraction use so much memory?</a></li>
</ul>
</li>
</ul>
<hr>
<h2 id="general"><a class="header" href="#general">General</a></h2>
<h3 id="what-is-pdftract"><a class="header" href="#what-is-pdftract">What is pdftract?</a></h3>
<p>pdftract is a command-line tool and library for extracting text, structure, and content from PDF files. It combines vector text extraction with OCR fallback to handle both well-formed and problematic PDFs. pdftract is written in Rust and provides Python bindings for programmatic use.</p>
<p>See the <a href="introduction.html">Introduction</a> for a complete overview.</p>
<h3 id="whats-the-difference-between-extract-and-extract_text"><a class="header" href="#whats-the-difference-between-extract-and-extract_text">What's the difference between extract and extract_text?</a></h3>
<ul>
<li>
<p><strong><code>extract</code></strong>: The primary command that produces structured JSON output with blocks, spans, metadata, and provenance information. Use this when you need the full extraction with layout, reading order, and confidence scores.</p>
</li>
<li>
<p><strong><code>extract_text</code></strong>: A simplified command that outputs plain text only. Use this for quick text extraction when you don't need the structured JSON output.</p>
</li>
</ul>
<p>Example:</p>
<pre><code class="language-bash"># Full structured extraction
pdftract extract document.pdf -o output.json
# Plain text only
pdftract extract_text document.pdf -o output.txt
</code></pre>
<h3 id="does-pdftract-execute-javascript-embedded-in-pdfs"><a class="header" href="#does-pdftract-execute-javascript-embedded-in-pdfs">Does pdftract execute JavaScript embedded in PDFs?</a></h3>
<p><strong>No.</strong> pdftract never executes JavaScript embedded in PDFs. JavaScript is detected during parsing for security analysis, but it is never executed. This design prevents malicious PDFs from exploiting JavaScript vulnerabilities.</p>
<p>If you need to analyze JavaScript in PDFs, pdftract can detect and report its presence, but execution must be done separately with appropriate sandboxing.</p>
<h3 id="how-do-i-cite-an-extracted-snippet"><a class="header" href="#how-do-i-cite-an-extracted-snippet">How do I cite an extracted snippet?</a></h3>
<p>The JSON output from <code>pdftract extract</code> includes provenance information for each text block:</p>
<pre><code class="language-json">{
"blocks": [{
"spans": [{
"text": "Example snippet",
"bbox": [100.0, 200.0, 250.0, 215.0],
"page": 3,
"confidence": 0.98
}]
}],
"metadata": {
"path": "/path/to/document.pdf",
"fingerprint": "sha256:abc123...",
"extracted_at": "2026-05-25T12:00:00Z"
}
}
</code></pre>
<p>For academic citations, include:</p>
<ul>
<li>Document path and fingerprint</li>
<li>Page number (from the <code>page</code> field)</li>
<li>Extraction timestamp</li>
<li>The pdftract version used</li>
</ul>
<hr>
<h2 id="installation-and-setup"><a class="header" href="#installation-and-setup">Installation and Setup</a></h2>
<h3 id="how-do-i-install-pdftract"><a class="header" href="#how-do-i-install-pdftract">How do I install pdftract?</a></h3>
<p>See the <a href="installation.html">Installation</a> guide for complete instructions. Quick summary:</p>
<p><strong>With cargo (Rust toolchain):</strong></p>
<pre><code class="language-bash">cargo install pdftract
</code></pre>
<p><strong>With pip (Python bindings):</strong></p>
<pre><code class="language-bash">pip install pdftract
</code></pre>
<p><strong>Pre-built binaries:</strong> Download from the <a href="https://github.com/jedarden/pdftract/releases">releases page</a>.</p>
<h3 id="how-do-i-run-pdftract-behind-a-corporate-proxy"><a class="header" href="#how-do-i-run-pdftract-behind-a-corporate-proxy">How do I run pdftract behind a corporate proxy?</a></h3>
<p>pdftract doesn't have built-in proxy support, but you can use the HTTP serve mode with a reverse proxy:</p>
<ol>
<li>Start pdftract in serve mode:</li>
</ol>
<pre><code class="language-bash">pdftract serve --port 8080
</code></pre>
<ol start="2">
<li>
<p>Configure your reverse proxy (nginx, Apache, etc.) to handle authentication and SSL termination.</p>
</li>
<li>
<p>Access pdftract through your proxy endpoint.</p>
</li>
</ol>
<p>See <a href="../operations/serve-deployment.html">Advanced Topics: HTTP Serve</a> for deployment guidance.</p>
<h3 id="what-are-the-system-requirements"><a class="header" href="#what-are-the-system-requirements">What are the system requirements?</a></h3>
<ul>
<li><strong>OS</strong>: Linux, macOS, or Windows</li>
<li><strong>Rust</strong>: 1.70+ (if building from source)</li>
<li><strong>Python</strong>: 3.8+ (for Python bindings)</li>
<li><strong>OCR (optional)</strong>: Tesseract 4.0+ for OCR fallback</li>
<li><strong>Memory</strong>: 512 MB minimum for typical PDFs; more for large documents</li>
</ul>
<hr>
<h2 id="usage"><a class="header" href="#usage">Usage</a></h2>
<h3 id="why-is-my-pdf-returning-broken_vector"><a class="header" href="#why-is-my-pdf-returning-broken_vector">Why is my PDF returning broken_vector?</a></h3>
<p>The <code>broken_vector</code> classification means the PDF's text layer is unreliable or missing. Common causes:</p>
<ul>
<li><strong>Invisible text overlay</strong>: Text with rendering mode 3 (invisible) overlaid on a raster image</li>
<li><strong>Missing ToUnicode CMap</strong>: Font lacks character-to-Unicode mapping</li>
<li><strong>Encoding corruption</strong>: Character encodings don't match the actual glyphs</li>
</ul>
<p><strong>Solution</strong>: pdftract automatically routes <code>broken_vector</code> pages to the OCR pipeline (Phase 5.5). If you see <code>broken_vector</code> without OCR output, check that OCR is enabled:</p>
<pre><code class="language-bash"># Verify OCR is available
pdftract doctor tesseract-langs
# Enable OCR explicitly if needed
pdftract extract document.pdf --enable-ocr
</code></pre>
<p>See <a href="troubleshooting/common-issues.html">Troubleshooting: Broken Vector</a> for more details.</p>
<h3 id="why-is-ocr-slow"><a class="header" href="#why-is-ocr-slow">Why is OCR slow?</a></h3>
<p>OCR performance depends on several factors:</p>
<ul>
<li><strong>Image resolution</strong>: Higher DPI images take longer to process</li>
<li><strong>Tesseract version</strong>: Version 4.0+ is significantly faster than 3.x</li>
<li><strong>Language data</strong>: Additional language packs increase processing time</li>
<li><strong>Hardware</strong>: CPU-bound; more cores help with batch processing</li>
</ul>
<p><strong>To speed up OCR:</strong></p>
<pre><code class="language-bash"># Reduce DPI (trade-off: accuracy)
pdftract extract document.pdf --ocr-dpi 200
# Use fewer languages
pdftract extract document.pdf --ocr-lang eng
# Disable OCR for vector-only PDFs
pdftract extract document.pdf --disable-ocr
</code></pre>
<h3 id="how-do-i-extract-text-from-a-specific-page-range"><a class="header" href="#how-do-i-extract-text-from-a-specific-page-range">How do I extract text from a specific page range?</a></h3>
<p>Use the <code>--pages</code> flag:</p>
<pre><code class="language-bash"># Single page
pdftract extract document.pdf --pages 5
# Range
pdftract extract document.pdf --pages 1-10
# Multiple ranges
pdftract extract document.pdf --pages 1-5,10,15-20
# All pages from page 5 onward
pdftract extract document.pdf --pages 5-
</code></pre>
<h3 id="how-do-i-extract-images-from-a-pdf"><a class="header" href="#how-do-i-extract-images-from-a-pdf">How do I extract images from a PDF?</a></h3>
<p>pdftract automatically detects and records image XObjects during content stream processing. The output JSON includes image metadata:</p>
<pre><code class="language-json">{
"images": [{
"bbox": [100.0, 200.0, 400.0, 500.0],
"xobject_ref": "5 0 R",
"name": "Im1"
}]
}
</code></pre>
<p>For actual image extraction, use the <code>serve</code> mode with the <code>/images</code> endpoint or write a custom script using the Python SDK.</p>
<h3 id="can-i-process-multiple-pdfs-at-once"><a class="header" href="#can-i-process-multiple-pdfs-at-once">Can I process multiple PDFs at once?</a></h3>
<p>Yes, use shell wildcards or write a batch script:</p>
<pre><code class="language-bash"># Process all PDFs in a directory
for file in *.pdf; do
pdftract extract "$file" -o "output/$(basename "$file" .pdf).json"
done
# With parallel processing (GNU parallel)
ls *.pdf | parallel -j 4 pdftract extract {} -o output/{/.}.json
</code></pre>
<hr>
<h2 id="configuration"><a class="header" href="#configuration">Configuration</a></h2>
<h3 id="how-do-i-add-a-custom-profile"><a class="header" href="#how-do-i-add-a-custom-profile">How do I add a custom profile?</a></h3>
<p>Create a YAML file defining your profile:</p>
<pre><code class="language-yaml"># custom-profile.yaml
name: my_custom
description: "Custom extraction profile"
extraction:
preserve_tables: true
preserve_columns: true
ocr_fallback: true
output:
format: json
include_provenance: true
confidence_threshold: 0.7
</code></pre>
<p>Then use it:</p>
<pre><code class="language-bash">pdftract extract document.pdf --profile custom-profile.yaml
</code></pre>
<p>See <a href="profiles/custom.html">Custom Profiles</a> for complete documentation.</p>
<h3 id="how-do-i-adjust-ocr-accuracy"><a class="header" href="#how-do-i-adjust-ocr-accuracy">How do I adjust OCR accuracy?</a></h3>
<p>Adjust Tesseract parameters via environment variables or the OCR configuration:</p>
<pre><code class="language-bash"># Set OCR engine mode
export TESSERACT_OEM=1 # LSTM only
export TESSERACT_PSM=6 # Assume single column block of text
# Adjust page segmentation mode
pdftract extract document.pdf --tesseract-psm 6
</code></pre>
<p>Higher accuracy settings may slow down processing. See <a href="advanced/ocr.html">OCR Configuration</a> for details.</p>
<h3 id="how-do-i-disable-ocr-for-faster-processing"><a class="header" href="#how-do-i-disable-ocr-for-faster-processing">How do I disable OCR for faster processing?</a></h3>
<p>If you know your PDFs have reliable text layers:</p>
<pre><code class="language-bash">pdftract extract document.pdf --disable-ocr
</code></pre>
<p>Or set a confidence threshold to skip low-confidence text:</p>
<pre><code class="language-bash">pdftract extract document.pdf --min-confidence 0.9
</code></pre>
<h3 id="what-are-confidence-scores-and-how-do-i-use-them"><a class="header" href="#what-are-confidence-scores-and-how-do-i-use-them">What are confidence scores and how do I use them?</a></h3>
<p>Each text span has a <code>confidence</code> score (0.0 to 1.0):</p>
<ul>
<li><strong>1.0</strong>: High confidence (ToUnicode CMap lookup succeeded)</li>
<li><strong>0.3</strong>: Medium confidence (encoding + AGL fallback)</li>
<li><strong>0.0</strong>: No confidence (PositionHint mode or failed resolution)</li>
</ul>
<p>Filter by confidence:</p>
<pre><code class="language-bash">pdftract extract document.pdf --min-confidence 0.5
</code></pre>
<p>Or filter in post-processing using jq:</p>
<pre><code class="language-bash">pdftract extract document.pdf | jq '.blocks[].spans[] | select(.confidence &gt; 0.5)'
</code></pre>
<hr>
<h2 id="output-and-formats"><a class="header" href="#output-and-formats">Output and Formats</a></h2>
<h3 id="how-do-i-get-output-in-markdown-format"><a class="header" href="#how-do-i-get-output-in-markdown-format">How do I get output in Markdown format?</a></h3>
<p>Use the <code>--format</code> flag:</p>
<pre><code class="language-bash">pdftract extract document.pdf --format markdown -o output.md
</code></pre>
<p>The Markdown output preserves headings, lists, tables, and code blocks where detected.</p>
<h3 id="how-do-i-preserve-table-structure"><a class="header" href="#how-do-i-preserve-table-structure">How do I preserve table structure?</a></h3>
<p>pdftract includes table detection (Phase 4.2). Ensure table preservation is enabled:</p>
<pre><code class="language-bash">pdftract extract document.pdf --preserve-tables
</code></pre>
<p>Tables are output with structured cell information:</p>
<pre><code class="language-json">{
"type": "table",
"rows": 3,
"columns": 4,
"cells": [...]
}
</code></pre>
<h3 id="can-i-extract-metadata-from-pdfs"><a class="header" href="#can-i-extract-metadata-from-pdfs">Can I extract metadata from PDFs?</a></h3>
<p>Yes, metadata is automatically extracted and included in the output:</p>
<pre><code class="language-json">{
"metadata": {
"title": "Document Title",
"author": "Author Name",
"subject": "Subject",
"keywords": ["keyword1", "keyword2"],
"creator": "Application",
"producer": "PDF Producer",
"creation_date": "2026-01-01T00:00:00Z",
"modified_date": "2026-05-25T12:00:00Z"
}
}
</code></pre>
<h3 id="how-do-i-handle-password-protected-pdfs"><a class="header" href="#how-do-i-handle-password-protected-pdfs">How do I handle password-protected PDFs?</a></h3>
<p>Provide the password via the <code>--password</code> flag:</p>
<pre><code class="language-bash">pdftract extract document.pdf --password secret123
</code></pre>
<p>For security, avoid passing passwords on the command line in production. Use environment variables or a config file:</p>
<pre><code class="language-bash">export PDFTRACT_PASSWORD=secret123
pdftract extract document.pdf
</code></pre>
<hr>
<h2 id="troubleshooting"><a class="header" href="#troubleshooting">Troubleshooting</a></h2>
<h3 id="why-is-extraction-failing-with-an-error"><a class="header" href="#why-is-extraction-failing-with-an-error">Why is extraction failing with an error?</a></h3>
<p>Check the error message and consult the <a href="troubleshooting/README.html">Troubleshooting Guide</a>. Common issues:</p>
<ul>
<li><strong>Encrypted PDFs</strong>: Use <code>--password</code> to decrypt</li>
<li><strong>Corrupted PDFs</strong>: pdftract attempts recovery; check diagnostics</li>
<li><strong>Missing dependencies</strong>: Verify Tesseract and language packs are installed</li>
</ul>
<p>Run diagnostics:</p>
<pre><code class="language-bash">pdftract doctor
</code></pre>
<h3 id="why-is-my-output-empty-or-incomplete"><a class="header" href="#why-is-my-output-empty-or-incomplete">Why is my output empty or incomplete?</a></h3>
<p>Possible causes:</p>
<ol>
<li><strong>No text layer</strong>: PDF may be image-only. Enable OCR.</li>
<li><strong>Encoding issues</strong>: Check diagnostics for <code>FONT_GLYPH_UNMAPPED</code> warnings</li>
<li><strong>Page range issue</strong>: Verify your <code>--pages</code> argument</li>
<li><strong>Confidence filter</strong>: Lower <code>--min-confidence</code> if set too high</li>
</ol>
<p>Check diagnostics output:</p>
<pre><code class="language-bash">pdftract extract document.pdf --verbose
</code></pre>
<h3 id="how-do-i-debug-extraction-issues"><a class="header" href="#how-do-i-debug-extraction-issues">How do I debug extraction issues?</a></h3>
<p>Enable verbose output and diagnostics:</p>
<pre><code class="language-bash"># Full diagnostic output
pdftract extract document.pdf --verbose --diagnostics
# Save diagnostics for analysis
pdftract extract document.pdf --diagnostics -o diagnostics.json
</code></pre>
<p>Common diagnostic codes:</p>
<ul>
<li><code>FONT_GLYPH_UNMAPPED</code>: Glyph couldn't be mapped to Unicode</li>
<li><code>STREAM_DECODE_ERROR</code>: Stream decompression failed</li>
<li><code>STRUCT_INVALID_TYPE</code>: Unexpected object type</li>
</ul>
<p>See <a href="troubleshooting/diagnostics.html">Diagnostics Reference</a> for a complete list.</p>
<h3 id="why-does-extraction-use-so-much-memory"><a class="header" href="#why-does-extraction-use-so-much-memory">Why does extraction use so much memory?</a></h3>
<p>Memory usage depends on:</p>
<ul>
<li><strong>PDF size</strong>: Larger PDFs with many images use more memory</li>
<li><strong>OCR</strong>: Tesseract loads image data into memory</li>
<li><strong>Output buffering</strong>: Large JSON outputs are buffered in memory</li>
</ul>
<p><strong>To reduce memory usage:</strong></p>
<pre><code class="language-bash"># Process page-by-page
for page in {1..100}; do
pdftract extract document.pdf --pages $page -o "page-$page.json"
done
# Disable OCR if not needed
pdftract extract document.pdf --disable-ocr
# Stream output (if supported)
pdftract extract document.pdf --stream-output
</code></pre>
<hr>
<h2 id="still-have-questions"><a class="header" href="#still-have-questions">Still have questions?</a></h2>
<ul>
<li>Check the <a href="troubleshooting/README.html">Troubleshooting Guide</a></li>
<li>Review the <a href="cli/README.html">CLI Reference</a></li>
<li>Open an issue on <a href="https://github.com/your-org/pdftract/issues">GitHub</a></li>
</ul>
</main>
<nav class="nav-wrapper" aria-label="Page navigation">
<!-- Mobile navigation buttons -->
<a rel="prev" href="troubleshooting/performance.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 320 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M41.4 233.4c-12.5 12.5-12.5 32.8 0 45.3l160 160c12.5 12.5 32.8 12.5 45.3 0s12.5-32.8 0-45.3L109.3 256 246.6 118.6c12.5-12.5 12.5-32.8 0-45.3s-32.8-12.5-45.3 0l-160 160z"/></svg></span>
</a>
<div style="clear: both"></div>
</nav>
</div>
</div>
<nav class="nav-wide-wrapper" aria-label="Page navigation">
<a rel="prev" href="troubleshooting/performance.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 320 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M41.4 233.4c-12.5 12.5-12.5 32.8 0 45.3l160 160c12.5 12.5 32.8 12.5 45.3 0s12.5-32.8 0-45.3L109.3 256 246.6 118.6c12.5-12.5 12.5-32.8 0-45.3s-32.8-12.5-45.3 0l-160 160z"/></svg></span>
</a>
</nav>
</div>
<template id=fa-eye><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 576 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M288 32c-80.8 0-145.5 36.8-192.6 80.6C48.6 156 17.3 208 2.5 243.7c-3.3 7.9-3.3 16.7 0 24.6C17.3 304 48.6 356 95.4 399.4C142.5 443.2 207.2 480 288 480s145.5-36.8 192.6-80.6c46.8-43.5 78.1-95.4 93-131.1c3.3-7.9 3.3-16.7 0-24.6c-14.9-35.7-46.2-87.7-93-131.1C433.5 68.8 368.8 32 288 32zM432 256c0 79.5-64.5 144-144 144s-144-64.5-144-144s64.5-144 144-144s144 64.5 144 144zM288 192c0 35.3-28.7 64-64 64c-11.5 0-22.3-3-31.6-8.4c-.2 2.8-.4 5.5-.4 8.4c0 53 43 96 96 96s96-43 96-96s-43-96-96-96c-2.8 0-5.6 .1-8.4 .4c5.3 9.3 8.4 20.1 8.4 31.6z"/></svg></span></template>
<template id=fa-eye-slash><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 640 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M38.8 5.1C28.4-3.1 13.3-1.2 5.1 9.2S-1.2 34.7 9.2 42.9l592 464c10.4 8.2 25.5 6.3 33.7-4.1s6.3-25.5-4.1-33.7L525.6 386.7c39.6-40.6 66.4-86.1 79.9-118.4c3.3-7.9 3.3-16.7 0-24.6c-14.9-35.7-46.2-87.7-93-131.1C465.5 68.8 400.8 32 320 32c-68.2 0-125 26.3-169.3 60.8L38.8 5.1zM223.1 149.5C248.6 126.2 282.7 112 320 112c79.5 0 144 64.5 144 144c0 24.9-6.3 48.3-17.4 68.7L408 294.5c5.2-11.8 8-24.8 8-38.5c0-53-43-96-96-96c-2.8 0-5.6 .1-8.4 .4c5.3 9.3 8.4 20.1 8.4 31.6c0 10.2-2.4 19.8-6.6 28.3l-90.3-70.8zm223.1 298L373 389.9c-16.4 6.5-34.3 10.1-53 10.1c-79.5 0-144-64.5-144-144c0-6.9 .5-13.6 1.4-20.2L83.1 161.5C60.3 191.2 44 220.8 34.5 243.7c-3.3 7.9-3.3 16.7 0 24.6c14.9 35.7 46.2 87.7 93 131.1C174.5 443.2 239.2 480 320 480c47.8 0 89.9-12.9 126.2-32.5z"/></svg></span></template>
<template id=fa-copy><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M502.6 70.63l-61.25-61.25C435.4 3.371 427.2 0 418.7 0H255.1c-35.35 0-64 28.66-64 64l.0195 256C192 355.4 220.7 384 256 384h192c35.2 0 64-28.8 64-64V93.25C512 84.77 508.6 76.63 502.6 70.63zM464 320c0 8.836-7.164 16-16 16H255.1c-8.838 0-16-7.164-16-16L239.1 64.13c0-8.836 7.164-16 16-16h128L384 96c0 17.67 14.33 32 32 32h47.1V320zM272 448c0 8.836-7.164 16-16 16H63.1c-8.838 0-16-7.164-16-16L47.98 192.1c0-8.836 7.164-16 16-16H160V128H63.99c-35.35 0-64 28.65-64 64l.0098 256C.002 483.3 28.66 512 64 512h192c35.2 0 64-28.8 64-64v-32h-47.1L272 448z"/></svg></span></template>
<template id=fa-play><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 384 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M73 39c-14.8-9.1-33.4-9.4-48.5-.9S0 62.6 0 80V432c0 17.4 9.4 33.4 24.5 41.9s33.7 8.1 48.5-.9L361 297c14.3-8.7 23-24.2 23-41s-8.7-32.2-23-41L73 39z"/></svg></span></template>
<template id=fa-clock-rotate-left><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M75 75L41 41C25.9 25.9 0 36.6 0 57.9V168c0 13.3 10.7 24 24 24H134.1c21.4 0 32.1-25.9 17-41l-30.8-30.8C155 85.5 203 64 256 64c106 0 192 86 192 192s-86 192-192 192c-40.8 0-78.6-12.7-109.7-34.4c-14.5-10.1-34.4-6.6-44.6 7.9s-6.6 34.4 7.9 44.6C151.2 495 201.7 512 256 512c141.4 0 256-114.6 256-256S397.4 0 256 0C185.3 0 121.3 28.7 75 75zm181 53c-13.3 0-24 10.7-24 24V256c0 6.4 2.5 12.5 7 17l72 72c9.4 9.4 24.6 9.4 33.9 0s9.4-24.6 0-33.9l-65-65V152c0-13.3-10.7-24-24-24z"/></svg></span></template>
<script>
window.playground_copyable = true;
</script>
<script src="elasticlunr-ef4e11c1.min.js"></script>
<script src="mark-09e88c2c.min.js"></script>
<script src="searcher-c2a407aa.js"></script>
<script src="clipboard-1626706a.min.js"></script>
<script src="highlight-abc7f01d.js"></script>
<script src="book-a0b12cfe.js"></script>
<!-- Custom JS scripts -->
</div>
</body>
</html>