pdftract/docs/user-docs/build/user-docs/faq.html

<!DOCTYPE HTML>
<html lang="en" class="light sidebar-visible" dir="ltr">
    <head>
        <!-- Book generated using mdBook -->
        <meta charset="UTF-8">
        <title>FAQ - pdftract User Documentation</title>


        <!-- Custom HTML head -->

        <meta name="description" content="">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <meta name="theme-color" content="#ffffff">

        <link rel="icon" href="favicon-de23e50b.svg">
        <link rel="shortcut icon" href="favicon-8114d1fc.png">
        <link rel="stylesheet" href="css/variables-8adf115d.css">
        <link rel="stylesheet" href="css/general-2459343d.css">
        <link rel="stylesheet" href="css/chrome-ae938929.css">
        <link rel="stylesheet" href="css/print-9e4910d8.css" media="print">

        <!-- Fonts -->
        <link rel="stylesheet" href="fonts/fonts-9644e21d.css">

        <!-- Highlight.js Stylesheets -->
        <link rel="stylesheet" id="mdbook-highlight-css" href="highlight-493f70e1.css">
        <link rel="stylesheet" id="mdbook-tomorrow-night-css" href="tomorrow-night-4c0ae647.css">
        <link rel="stylesheet" id="mdbook-ayu-highlight-css" href="ayu-highlight-3fdfc3ac.css">

        <!-- Custom theme stylesheets -->


        <!-- Provide site root and default themes to javascript -->
        <script>
            // Relative prefix from this page to the book root; empty for this page.
            // NOTE(review): presumably prepended to asset/page URLs by the book's scripts — confirm.
            const path_to_root = "";
            // Theme names used when no theme is saved: the bootstrap script further
            // down picks the dark one if the OS prefers a dark colour scheme,
            // otherwise the light one.
            const default_light_theme = "light";
            const default_dark_theme = "navy";
            // File name of the search index script, exposed on window for other scripts.
            // NOTE(review): presumably loaded on demand by the search code — not visible here.
            window.path_to_searchindex_js = "searchindex-fc6d8bf8.js";
        </script>
        <!-- Start loading toc.js asap -->
        <script src="toc-d0f907c9.js"></script>
    </head>
    <body>
    <div id="mdbook-help-container">
        <div id="mdbook-help-popup">
            <h2 class="mdbook-help-title">Keyboard shortcuts</h2>
            <div>
                <p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
                <p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
                <p>Press <kbd>?</kbd> to show this help</p>
                <p>Press <kbd>Esc</kbd> to hide this help</p>
            </div>
        </div>
    </div>
    <div id="mdbook-body-container">
        <!-- Work around some values being stored in localStorage wrapped in quotes -->
        <script>
            // Strip a wrapping pair of double quotes from values saved in
            // localStorage. Each key is handled on its own, and a missing key
            // (getItem returns null) is skipped, so an unset theme cannot prevent
            // the sidebar value from being cleaned up.
            try {
                for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
                    const stored = localStorage.getItem(key);
                    if (stored !== null && stored.startsWith('"') && stored.endsWith('"')) {
                        localStorage.setItem(key, stored.slice(1, stored.length - 1));
                    }
                }
            } catch (e) {
                // Best effort: if storage cannot be accessed, leave the values untouched.
            }
        </script>

        <!-- Set the theme before any content is loaded, prevents flash -->
        <script>
            // Resolve the theme to apply: a value saved in localStorage wins,
            // otherwise follow the OS colour-scheme preference.
            const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches
                ? default_dark_theme
                : default_light_theme;
            let theme;
            try {
                theme = localStorage.getItem('mdbook-theme');
            } catch(e) {
                // Storage could not be read: theme stays undefined and falls back below.
            }
            if (theme == null) {
                // Loose equality covers both null (nothing saved) and undefined (read failed).
                theme = default_theme;
            }
            // Swap the static 'light' class from the markup for the chosen theme
            // and mark the document as script-enabled.
            const html = document.documentElement;
            html.classList.remove('light');
            html.classList.add(theme);
            html.classList.add("js");
        </script>

        <input type="checkbox" id="mdbook-sidebar-toggle-anchor" class="hidden">

        <!-- Hide / unhide sidebar before it is displayed -->
        <script>
            // Decide the initial sidebar state ('visible' or 'hidden') and mirror it
            // onto the toggle checkbox and the root element's class list.
            let sidebar = null;
            const sidebar_toggle = document.getElementById("mdbook-sidebar-toggle-anchor");
            if (document.body.clientWidth < 1080) {
                // Narrow viewport: always start collapsed, ignoring any saved state.
                sidebar = 'hidden';
                sidebar_toggle.checked = false;
            } else {
                try {
                    sidebar = localStorage.getItem('mdbook-sidebar');
                } catch(e) { }
                // Nothing saved (or storage unreadable): default to showing the sidebar.
                if (!sidebar) {
                    sidebar = 'visible';
                }
            }
            if (sidebar !== 'visible') {
                html.classList.remove('sidebar-visible');
            } else {
                sidebar_toggle.checked = true;
            }
        </script>

        <nav id="mdbook-sidebar" class="sidebar" aria-label="Table of contents">
            <!-- populated by js -->
            <mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
            <noscript>
                <iframe class="sidebar-iframe-outer" src="toc.html" title="Table of contents"></iframe>
            </noscript>
            <div id="mdbook-sidebar-resize-handle" class="sidebar-resize-handle">
                <div class="sidebar-resize-indicator"></div>
            </div>
        </nav>

        <div id="mdbook-page-wrapper" class="page-wrapper">

            <div class="page">
                <div id="mdbook-menu-bar-hover-placeholder"></div>
                <div id="mdbook-menu-bar" class="menu-bar sticky">
                    <div class="left-buttons">
                        <label id="mdbook-sidebar-toggle" class="icon-button" for="mdbook-sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="mdbook-sidebar">
                            <span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M0 96C0 78.3 14.3 64 32 64H416c17.7 0 32 14.3 32 32s-14.3 32-32 32H32C14.3 128 0 113.7 0 96zM0 256c0-17.7 14.3-32 32-32H416c17.7 0 32 14.3 32 32s-14.3 32-32 32H32c-17.7 0-32-14.3-32-32zM448 416c0 17.7-14.3 32-32 32H32c-17.7 0-32-14.3-32-32s14.3-32 32-32H416c17.7 0 32 14.3 32 32z"/></svg></span>
                        </label>
                        <button id="mdbook-theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="mdbook-theme-list">
                            <span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 576 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M371.3 367.1c27.3-3.9 51.9-19.4 67.2-42.9L600.2 74.1c12.6-19.5 9.4-45.3-7.6-61.2S549.7-4.4 531.1 9.6L294.4 187.2c-24 18-38.2 46.1-38.4 76.1L371.3 367.1zm-19.6 25.4l-116-104.4C175.9 290.3 128 339.6 128 400c0 3.9 .2 7.8 .6 11.6c1.8 17.5-10.2 36.4-27.8 36.4H96c-17.7 0-32 14.3-32 32s14.3 32 32 32H240c61.9 0 112-50.1 112-112c0-2.5-.1-5-.2-7.5z"/></svg></span>
                        </button>
                        <ul id="mdbook-theme-list" class="theme-popup" aria-label="Themes" role="menu">
                            <li role="none"><button role="menuitem" class="theme" id="mdbook-theme-default_theme">Auto</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="mdbook-theme-light">Light</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="mdbook-theme-rust">Rust</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="mdbook-theme-coal">Coal</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="mdbook-theme-navy">Navy</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="mdbook-theme-ayu">Ayu</button></li>
                        </ul>
                        <button id="mdbook-search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="mdbook-searchbar">
                            <span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M416 208c0 45.9-14.9 88.3-40 122.7L502.6 457.4c12.5 12.5 12.5 32.8 0 45.3s-32.8 12.5-45.3 0L330.7 376c-34.4 25.2-76.8 40-122.7 40C93.1 416 0 322.9 0 208S93.1 0 208 0S416 93.1 416 208zM208 352c79.5 0 144-64.5 144-144s-64.5-144-144-144S64 128.5 64 208s64.5 144 144 144z"/></svg></span>
                        </button>
                    </div>

                    <h1 class="menu-title">pdftract User Documentation</h1>

                    <div class="right-buttons">
                        <a href="print.html" title="Print this book" aria-label="Print this book">
                            <span class=fa-svg id="print-button"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M128 0C92.7 0 64 28.7 64 64v96h64V64H354.7L384 93.3V160h64V93.3c0-17-6.7-33.3-18.7-45.3L400 18.7C388 6.7 371.7 0 354.7 0H128zM384 352v32 64H128V384 368 352H384zm64 32h32c17.7 0 32-14.3 32-32V256c0-35.3-28.7-64-64-64H64c-35.3 0-64 28.7-64 64v96c0 17.7 14.3 32 32 32H64v64c0 35.3 28.7 64 64 64H384c35.3 0 64-28.7 64-64V384zm-16-88c-13.3 0-24-10.7-24-24s10.7-24 24-24s24 10.7 24 24s-10.7 24-24 24z"/></svg></span>
                        </a>
                        <a href="https://github.com/jedarden/pdftract" title="Git repository" aria-label="Git repository">
                            <span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 496 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6zm-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3zm44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9zM244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8zM97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1zm-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7zm32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1zm-11.4-14.7c-1.6 1-1.6 3.6 0 5.9 1.6 2.3 4.3 3.3 5.6 2.3 1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2z"/></svg></span>
                        </a>
                        <a href="https://github.com/jedarden/pdftract/edit/main/docs/user-docs/src/src/faq.md" title="Suggest an edit" aria-label="Suggest an edit" rel="edit">
                            <span class=fa-svg id="git-edit-button"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M421.7 220.3l-11.3 11.3-22.6 22.6-205 205c-6.6 6.6-14.8 11.5-23.8 14.1L30.8 511c-8.4 2.5-17.5 .2-23.7-6.1S-1.5 489.7 1 481.2L38.7 353.1c2.6-9 7.5-17.2 14.1-23.8l205-205 22.6-22.6 11.3-11.3 33.9 33.9 62.1 62.1 33.9 33.9zM96 353.9l-9.3 9.3c-.9 .9-1.6 2.1-2 3.4l-25.3 86 86-25.3c1.3-.4 2.5-1.1 3.4-2l9.3-9.3H112c-8.8 0-16-7.2-16-16V353.9zM453.3 19.3l39.4 39.4c25 25 25 65.5 0 90.5l-14.5 14.5-22.6 22.6-11.3 11.3-33.9-33.9-62.1-62.1L314.3 67.7l11.3-11.3 22.6-22.6 14.5-14.5c25-25 65.5-25 90.5 0z"/></svg></span>
                        </a>

                    </div>
                </div>

                <div id="mdbook-search-wrapper" class="hidden">
                    <form id="mdbook-searchbar-outer" class="searchbar-outer">
                        <div class="search-wrapper">
                            <input type="search" id="mdbook-searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="mdbook-searchresults-outer" aria-describedby="mdbook-searchresults-header">
                            <div class="spinner-wrapper">
                                <span class=fa-svg id="fa-spin"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M304 48c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zm0 416c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zM48 304c26.5 0 48-21.5 48-48s-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48zm464-48c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zM142.9 437c18.7-18.7 18.7-49.1 0-67.9s-49.1-18.7-67.9 0s-18.7 49.1 0 67.9s49.1 18.7 67.9 0zm0-294.2c18.7-18.7 18.7-49.1 0-67.9S93.7 56.2 75 75s-18.7 49.1 0 67.9s49.1 18.7 67.9 0zM369.1 437c18.7 18.7 49.1 18.7 67.9 0s18.7-49.1 0-67.9s-49.1-18.7-67.9 0s-18.7 49.1 0 67.9z"/></svg></span>
                            </div>
                        </div>
                    </form>
                    <div id="mdbook-searchresults-outer" class="searchresults-outer hidden">
                        <div id="mdbook-searchresults-header" class="searchresults-header"></div>
                        <ul id="mdbook-searchresults">
                        </ul>
                    </div>
                </div>

                <!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
                <script>
                    // Reflect the initial sidebar state in ARIA: the toggle reports whether
                    // the sidebar is expanded, the sidebar is hidden from assistive tech
                    // when collapsed, and its links leave the tab order while it is hidden.
                    document.getElementById('mdbook-sidebar-toggle').setAttribute('aria-expanded', sidebar === 'visible');
                    document.getElementById('mdbook-sidebar').setAttribute('aria-hidden', sidebar !== 'visible');
                    for (const link of Array.from(document.querySelectorAll('#mdbook-sidebar a'))) {
                        link.setAttribute('tabIndex', sidebar === 'visible' ? 0 : -1);
                    }
                </script>

                <div id="mdbook-content" class="content">
                    <main>
                        <h1 id="faq"><a class="header" href="#faq">FAQ</a></h1>
<p>Frequently asked questions about pdftract.</p>
<h2 id="table-of-contents"><a class="header" href="#table-of-contents">Table of Contents</a></h2>
<ul>
<li><a href="#general">General</a>
<ul>
<li><a href="#what-is-pdftract">What is pdftract?</a></li>
<li><a href="#whats-the-difference-between-extract-and-extract_text">What’s the difference between extract and extract_text?</a></li>
<li><a href="#does-pdftract-execute-javascript-embedded-in-pdfs">Does pdftract execute JavaScript embedded in PDFs?</a></li>
<li><a href="#how-do-i-cite-an-extracted-snippet">How do I cite an extracted snippet?</a></li>
</ul>
</li>
<li><a href="#installation-and-setup">Installation and Setup</a>
<ul>
<li><a href="#how-do-i-install-pdftract">How do I install pdftract?</a></li>
<li><a href="#how-do-i-run-pdftract-behind-a-corporate-proxy">How do I run pdftract behind a corporate proxy?</a></li>
<li><a href="#what-are-the-system-requirements">What are the system requirements?</a></li>
</ul>
</li>
<li><a href="#usage">Usage</a>
<ul>
<li><a href="#why-is-my-pdf-returning-broken_vector">Why is my PDF returning broken_vector?</a></li>
<li><a href="#why-is-ocr-slow">Why is OCR slow?</a></li>
<li><a href="#how-do-i-extract-text-from-a-specific-page-range">How do I extract text from a specific page range?</a></li>
<li><a href="#how-do-i-extract-images-from-a-pdf">How do I extract images from a PDF?</a></li>
<li><a href="#can-i-process-multiple-pdfs-at-once">Can I process multiple PDFs at once?</a></li>
</ul>
</li>
<li><a href="#configuration">Configuration</a>
<ul>
<li><a href="#how-do-i-add-a-custom-profile">How do I add a custom profile?</a></li>
<li><a href="#how-do-i-adjust-ocr-accuracy">How do I adjust OCR accuracy?</a></li>
<li><a href="#how-do-i-disable-ocr-for-faster-processing">How do I disable OCR for faster processing?</a></li>
<li><a href="#what-are-confidence-scores-and-how-do-i-use-them">What are confidence scores and how do I use them?</a></li>
</ul>
</li>
<li><a href="#output-and-formats">Output and Formats</a>
<ul>
<li><a href="#how-do-i-get-output-in-markdown-format">How do I get output in Markdown format?</a></li>
<li><a href="#how-do-i-preserve-table-structure">How do I preserve table structure?</a></li>
<li><a href="#can-i-extract-metadata-from-pdfs">Can I extract metadata from PDFs?</a></li>
<li><a href="#how-do-i-handle-password-protected-pdfs">How do I handle password-protected PDFs?</a></li>
</ul>
</li>
<li><a href="#troubleshooting">Troubleshooting</a>
<ul>
<li><a href="#why-is-extraction-failing-with-an-error">Why is extraction failing with an error?</a></li>
<li><a href="#why-is-my-output-empty-or-incomplete">Why is my output empty or incomplete?</a></li>
<li><a href="#how-do-i-debug-extraction-issues">How do I debug extraction issues?</a></li>
<li><a href="#why-does-extraction-use-so-much-memory">Why does extraction use so much memory?</a></li>
</ul>
</li>
</ul>
<hr>
<h2 id="general"><a class="header" href="#general">General</a></h2>
<h3 id="what-is-pdftract"><a class="header" href="#what-is-pdftract">What is pdftract?</a></h3>
<p>pdftract is a command-line tool and library for extracting text, structure, and content from PDF files. It combines vector text extraction with OCR fallback to handle both well-formed and problematic PDFs. pdftract is written in Rust and provides Python bindings for programmatic use.</p>
<p>See the <a href="introduction.html">Introduction</a> for a complete overview.</p>
<h3 id="whats-the-difference-between-extract-and-extract_text"><a class="header" href="#whats-the-difference-between-extract-and-extract_text">What’s the difference between extract and extract_text?</a></h3>
<ul>
<li>
<p><strong><code>extract</code></strong>: The primary command that produces structured JSON output with blocks, spans, metadata, and provenance information. Use this when you need the full extraction with layout, reading order, and confidence scores.</p>
</li>
<li>
<p><strong><code>extract_text</code></strong>: A simplified command that outputs plain text only. Use this for quick text extraction when you don’t need the structured JSON output.</p>
</li>
</ul>
<p>Example:</p>
<pre><code class="language-bash"># Full structured extraction
pdftract extract document.pdf -o output.json

# Plain text only
pdftract extract_text document.pdf -o output.txt
</code></pre>
<h3 id="does-pdftract-execute-javascript-embedded-in-pdfs"><a class="header" href="#does-pdftract-execute-javascript-embedded-in-pdfs">Does pdftract execute JavaScript embedded in PDFs?</a></h3>
<p><strong>No.</strong> pdftract never executes JavaScript embedded in PDFs. JavaScript is detected during parsing for security analysis, but it is never executed. This design prevents malicious PDFs from exploiting JavaScript vulnerabilities.</p>
<p>If you need to analyze JavaScript in PDFs, pdftract can detect and report its presence, but execution must be done separately with appropriate sandboxing.</p>
<h3 id="how-do-i-cite-an-extracted-snippet"><a class="header" href="#how-do-i-cite-an-extracted-snippet">How do I cite an extracted snippet?</a></h3>
<p>The JSON output from <code>pdftract extract</code> includes provenance information for each text block:</p>
<pre><code class="language-json">{
  "blocks": [{
    "spans": [{
      "text": "Example snippet",
      "bbox": [100.0, 200.0, 250.0, 215.0],
      "page": 3,
      "confidence": 0.98
    }]
  }],
  "metadata": {
    "path": "/path/to/document.pdf",
    "fingerprint": "sha256:abc123...",
    "extracted_at": "2026-05-25T12:00:00Z"
  }
}
</code></pre>
<p>For academic citations, include:</p>
<ul>
<li>Document path and fingerprint</li>
<li>Page number (from the <code>page</code> field)</li>
<li>Extraction timestamp</li>
<li>The pdftract version used</li>
</ul>
<hr>
<h2 id="installation-and-setup"><a class="header" href="#installation-and-setup">Installation and Setup</a></h2>
<h3 id="how-do-i-install-pdftract"><a class="header" href="#how-do-i-install-pdftract">How do I install pdftract?</a></h3>
<p>See the <a href="installation.html">Installation</a> guide for complete instructions. Quick summary:</p>
<p><strong>With cargo (Rust toolchain):</strong></p>
<pre><code class="language-bash">cargo install pdftract
</code></pre>
<p><strong>With pip (Python bindings):</strong></p>
<pre><code class="language-bash">pip install pdftract
</code></pre>
<p><strong>Pre-built binaries:</strong> Download from the <a href="https://github.com/jedarden/pdftract/releases">releases page</a>.</p>
<h3 id="how-do-i-run-pdftract-behind-a-corporate-proxy"><a class="header" href="#how-do-i-run-pdftract-behind-a-corporate-proxy">How do I run pdftract behind a corporate proxy?</a></h3>
<p>pdftract doesn’t have built-in proxy support, but you can use the HTTP serve mode with a reverse proxy:</p>
<ol>
<li>Start pdftract in serve mode:</li>
</ol>
<pre><code class="language-bash">pdftract serve --port 8080
</code></pre>
<ol start="2">
<li>
<p>Configure your reverse proxy (nginx, Apache, etc.) to handle authentication and SSL termination.</p>
</li>
<li>
<p>Access pdftract through your proxy endpoint.</p>
</li>
</ol>
<p>See <a href="../operations/serve-deployment.html">Advanced Topics: HTTP Serve</a> for deployment guidance.</p>
<h3 id="what-are-the-system-requirements"><a class="header" href="#what-are-the-system-requirements">What are the system requirements?</a></h3>
<ul>
<li><strong>OS</strong>: Linux, macOS, or Windows</li>
<li><strong>Rust</strong>: 1.70+ (if building from source)</li>
<li><strong>Python</strong>: 3.8+ (for Python bindings)</li>
<li><strong>OCR (optional)</strong>: Tesseract 4.0+ for OCR fallback</li>
<li><strong>Memory</strong>: 512 MB minimum for typical PDFs; more for large documents</li>
</ul>
<hr>
<h2 id="usage"><a class="header" href="#usage">Usage</a></h2>
<h3 id="why-is-my-pdf-returning-broken_vector"><a class="header" href="#why-is-my-pdf-returning-broken_vector">Why is my PDF returning broken_vector?</a></h3>
<p>The <code>broken_vector</code> classification means the PDF’s text layer is unreliable or missing. Common causes:</p>
<ul>
<li><strong>Invisible text overlay</strong>: Text with rendering mode 3 (invisible) overlaid on a raster image</li>
<li><strong>Missing ToUnicode CMap</strong>: Font lacks character-to-Unicode mapping</li>
<li><strong>Encoding corruption</strong>: Character encodings don’t match the actual glyphs</li>
</ul>
<p><strong>Solution</strong>: pdftract automatically routes <code>broken_vector</code> pages to the OCR pipeline (Phase 5.5). If you see <code>broken_vector</code> without OCR output, check that OCR is enabled:</p>
<pre><code class="language-bash"># Verify OCR is available
pdftract doctor tesseract-langs

# Enable OCR explicitly if needed
pdftract extract document.pdf --enable-ocr
</code></pre>
<p>See <a href="troubleshooting/common-issues.html">Troubleshooting: Broken Vector</a> for more details.</p>
<h3 id="why-is-ocr-slow"><a class="header" href="#why-is-ocr-slow">Why is OCR slow?</a></h3>
<p>OCR performance depends on several factors:</p>
<ul>
<li><strong>Image resolution</strong>: Higher DPI images take longer to process</li>
<li><strong>Tesseract version</strong>: Version 4.0+ is significantly faster than 3.x</li>
<li><strong>Language data</strong>: Additional language packs increase processing time</li>
<li><strong>Hardware</strong>: CPU-bound; more cores help with batch processing</li>
</ul>
<p><strong>To speed up OCR:</strong></p>
<pre><code class="language-bash"># Reduce DPI (trade-off: accuracy)
pdftract extract document.pdf --ocr-dpi 200

# Use fewer languages
pdftract extract document.pdf --ocr-lang eng

# Disable OCR for vector-only PDFs
pdftract extract document.pdf --disable-ocr
</code></pre>
<h3 id="how-do-i-extract-text-from-a-specific-page-range"><a class="header" href="#how-do-i-extract-text-from-a-specific-page-range">How do I extract text from a specific page range?</a></h3>
<p>Use the <code>--pages</code> flag:</p>
<pre><code class="language-bash"># Single page
pdftract extract document.pdf --pages 5

# Range
pdftract extract document.pdf --pages 1-10

# Multiple ranges
pdftract extract document.pdf --pages 1-5,10,15-20

# All pages from page 5 onward
pdftract extract document.pdf --pages 5-
</code></pre>
<h3 id="how-do-i-extract-images-from-a-pdf"><a class="header" href="#how-do-i-extract-images-from-a-pdf">How do I extract images from a PDF?</a></h3>
<p>pdftract automatically detects and records image XObjects during content stream processing. The output JSON includes image metadata:</p>
<pre><code class="language-json">{
  "images": [{
    "bbox": [100.0, 200.0, 400.0, 500.0],
    "xobject_ref": "5 0 R",
    "name": "Im1"
  }]
}
</code></pre>
<p>For actual image extraction, use the <code>serve</code> mode with the <code>/images</code> endpoint or write a custom script using the Python SDK.</p>
<h3 id="can-i-process-multiple-pdfs-at-once"><a class="header" href="#can-i-process-multiple-pdfs-at-once">Can I process multiple PDFs at once?</a></h3>
<p>Yes, use shell wildcards or write a batch script:</p>
<pre><code class="language-bash"># Process all PDFs in a directory
for file in *.pdf; do
    pdftract extract "$file" -o "output/$(basename "$file" .pdf).json"
done

# With parallel processing (GNU parallel)
ls *.pdf | parallel -j 4 pdftract extract {} -o output/{/.}.json
</code></pre>
<hr>
<h2 id="configuration"><a class="header" href="#configuration">Configuration</a></h2>
<h3 id="how-do-i-add-a-custom-profile"><a class="header" href="#how-do-i-add-a-custom-profile">How do I add a custom profile?</a></h3>
<p>Create a YAML file defining your profile:</p>
<pre><code class="language-yaml"># custom-profile.yaml
name: my_custom
description: "Custom extraction profile"

extraction:
  preserve_tables: true
  preserve_columns: true
  ocr_fallback: true

output:
  format: json
  include_provenance: true
  confidence_threshold: 0.7
</code></pre>
<p>Then use it:</p>
<pre><code class="language-bash">pdftract extract document.pdf --profile custom-profile.yaml
</code></pre>
<p>See <a href="profiles/custom.html">Custom Profiles</a> for complete documentation.</p>
<h3 id="how-do-i-adjust-ocr-accuracy"><a class="header" href="#how-do-i-adjust-ocr-accuracy">How do I adjust OCR accuracy?</a></h3>
<p>Adjust Tesseract parameters via environment variables or the OCR configuration:</p>
<pre><code class="language-bash"># Set OCR engine mode
export TESSERACT_OEM=1  # LSTM only
export TESSERACT_PSM=6  # Assume a single uniform block of text

# Adjust page segmentation mode
pdftract extract document.pdf --tesseract-psm 6
</code></pre>
<p>Higher accuracy settings may slow down processing. See <a href="advanced/ocr.html">OCR Configuration</a> for details.</p>
<h3 id="how-do-i-disable-ocr-for-faster-processing"><a class="header" href="#how-do-i-disable-ocr-for-faster-processing">How do I disable OCR for faster processing?</a></h3>
<p>If you know your PDFs have reliable text layers:</p>
<pre><code class="language-bash">pdftract extract document.pdf --disable-ocr
</code></pre>
<p>Or set a confidence threshold to skip low-confidence text:</p>
<pre><code class="language-bash">pdftract extract document.pdf --min-confidence 0.9
</code></pre>
<h3 id="what-are-confidence-scores-and-how-do-i-use-them"><a class="header" href="#what-are-confidence-scores-and-how-do-i-use-them">What are confidence scores and how do I use them?</a></h3>
<p>Each text span has a <code>confidence</code> score (0.0 to 1.0):</p>
<ul>
<li><strong>1.0</strong>: High confidence (ToUnicode CMap lookup succeeded)</li>
<li><strong>0.3</strong>: Medium confidence (encoding + AGL fallback)</li>
<li><strong>0.0</strong>: No confidence (PositionHint mode or failed resolution)</li>
</ul>
<p>Filter by confidence:</p>
<pre><code class="language-bash">pdftract extract document.pdf --min-confidence 0.5
</code></pre>
<p>Or filter in post-processing using jq:</p>
<pre><code class="language-bash">pdftract extract document.pdf | jq '.blocks[].spans[] | select(.confidence &gt; 0.5)'
</code></pre>
<hr>
<h2 id="output-and-formats"><a class="header" href="#output-and-formats">Output and Formats</a></h2>
<h3 id="how-do-i-get-output-in-markdown-format"><a class="header" href="#how-do-i-get-output-in-markdown-format">How do I get output in Markdown format?</a></h3>
<p>Use the <code>--format</code> flag:</p>
<pre><code class="language-bash">pdftract extract document.pdf --format markdown -o output.md
</code></pre>
<p>The Markdown output preserves headings, lists, tables, and code blocks where detected.</p>
<h3 id="how-do-i-preserve-table-structure"><a class="header" href="#how-do-i-preserve-table-structure">How do I preserve table structure?</a></h3>
<p>pdftract includes table detection (Phase 4.2). Ensure table preservation is enabled:</p>
<pre><code class="language-bash">pdftract extract document.pdf --preserve-tables
</code></pre>
<p>Tables are output with structured cell information:</p>
<pre><code class="language-json">{
  "type": "table",
  "rows": 3,
  "columns": 4,
  "cells": [...]
}
</code></pre>
<h3 id="can-i-extract-metadata-from-pdfs"><a class="header" href="#can-i-extract-metadata-from-pdfs">Can I extract metadata from PDFs?</a></h3>
<p>Yes, metadata is automatically extracted and included in the output:</p>
<pre><code class="language-json">{
  "metadata": {
    "title": "Document Title",
    "author": "Author Name",
    "subject": "Subject",
    "keywords": ["keyword1", "keyword2"],
    "creator": "Application",
    "producer": "PDF Producer",
    "creation_date": "2026-01-01T00:00:00Z",
    "modified_date": "2026-05-25T12:00:00Z"
  }
}
</code></pre>
<h3 id="how-do-i-handle-password-protected-pdfs"><a class="header" href="#how-do-i-handle-password-protected-pdfs">How do I handle password-protected PDFs?</a></h3>
<p>Provide the password via the <code>--password</code> flag:</p>
<pre><code class="language-bash">pdftract extract document.pdf --password secret123
</code></pre>
<p>For security, avoid passing passwords on the command line in production. Use environment variables or a config file:</p>
<pre><code class="language-bash">export PDFTRACT_PASSWORD=secret123
pdftract extract document.pdf
</code></pre>
<hr>
<h2 id="troubleshooting"><a class="header" href="#troubleshooting">Troubleshooting</a></h2>
<h3 id="why-is-extraction-failing-with-an-error"><a class="header" href="#why-is-extraction-failing-with-an-error">Why is extraction failing with an error?</a></h3>
<p>Check the error message and consult the <a href="troubleshooting/README.html">Troubleshooting Guide</a>. Common issues:</p>
<ul>
<li><strong>Encrypted PDFs</strong>: Use <code>--password</code> to decrypt</li>
<li><strong>Corrupted PDFs</strong>: pdftract attempts recovery; check diagnostics</li>
<li><strong>Missing dependencies</strong>: Verify Tesseract and language packs are installed</li>
</ul>
<p>Run diagnostics:</p>
<pre><code class="language-bash">pdftract doctor
</code></pre>
<h3 id="why-is-my-output-empty-or-incomplete"><a class="header" href="#why-is-my-output-empty-or-incomplete">Why is my output empty or incomplete?</a></h3>
<p>Possible causes:</p>
<ol>
<li><strong>No text layer</strong>: PDF may be image-only. Enable OCR.</li>
<li><strong>Encoding issues</strong>: Check diagnostics for <code>FONT_GLYPH_UNMAPPED</code> warnings</li>
<li><strong>Page range issue</strong>: Verify your <code>--pages</code> argument</li>
<li><strong>Confidence filter</strong>: Lower <code>--min-confidence</code> if set too high</li>
</ol>
<p>Check diagnostics output:</p>
<pre><code class="language-bash">pdftract extract document.pdf --verbose
</code></pre>
<h3 id="how-do-i-debug-extraction-issues"><a class="header" href="#how-do-i-debug-extraction-issues">How do I debug extraction issues?</a></h3>
<p>Enable verbose output and diagnostics:</p>
<pre><code class="language-bash"># Full diagnostic output
pdftract extract document.pdf --verbose --diagnostics

# Save diagnostics for analysis
pdftract extract document.pdf --diagnostics -o diagnostics.json
</code></pre>
<p>Common diagnostic codes:</p>
<ul>
<li><code>FONT_GLYPH_UNMAPPED</code>: Glyph couldn’t be mapped to Unicode</li>
<li><code>STREAM_DECODE_ERROR</code>: Stream decompression failed</li>
<li><code>STRUCT_INVALID_TYPE</code>: Unexpected object type</li>
</ul>
<p>See <a href="troubleshooting/diagnostics.html">Diagnostics Reference</a> for a complete list.</p>
<h3 id="why-does-extraction-use-so-much-memory"><a class="header" href="#why-does-extraction-use-so-much-memory">Why does extraction use so much memory?</a></h3>
<p>Memory usage depends on:</p>
<ul>
<li><strong>PDF size</strong>: Larger PDFs with many images use more memory</li>
<li><strong>OCR</strong>: Tesseract loads image data into memory</li>
<li><strong>Output buffering</strong>: Large JSON outputs are buffered in memory</li>
</ul>
<p><strong>To reduce memory usage:</strong></p>
<pre><code class="language-bash"># Process page-by-page
for page in {1..100}; do
    pdftract extract document.pdf --pages $page -o "page-$page.json"
done

# Disable OCR if not needed
pdftract extract document.pdf --disable-ocr

# Stream output (if supported)
pdftract extract document.pdf --stream-output
</code></pre>
<hr>
<h2 id="still-have-questions"><a class="header" href="#still-have-questions">Still have questions?</a></h2>
<ul>
<li>Check the <a href="troubleshooting/README.html">Troubleshooting Guide</a></li>
<li>Review the <a href="cli/README.html">CLI Reference</a></li>
<li>Open an issue on <a href="https://github.com/your-org/pdftract/issues">GitHub</a></li>
</ul>

                    </main>

                    <nav class="nav-wrapper" aria-label="Page navigation">
                        <!-- Mobile navigation buttons -->
                            <a rel="prev" href="troubleshooting/performance.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
                                <span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 320 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M41.4 233.4c-12.5 12.5-12.5 32.8 0 45.3l160 160c12.5 12.5 32.8 12.5 45.3 0s12.5-32.8 0-45.3L109.3 256 246.6 118.6c12.5-12.5 12.5-32.8 0-45.3s-32.8-12.5-45.3 0l-160 160z"/></svg></span>
                            </a>


                        <div style="clear: both"></div>
                    </nav>
                </div>
            </div>

            <nav class="nav-wide-wrapper" aria-label="Page navigation">
                    <a rel="prev" href="troubleshooting/performance.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
                        <span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 320 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M41.4 233.4c-12.5 12.5-12.5 32.8 0 45.3l160 160c12.5 12.5 32.8 12.5 45.3 0s12.5-32.8 0-45.3L109.3 256 246.6 118.6c12.5-12.5 12.5-32.8 0-45.3s-32.8-12.5-45.3 0l-160 160z"/></svg></span>
                    </a>

            </nav>

        </div>

        <template id=fa-eye><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 576 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M288 32c-80.8 0-145.5 36.8-192.6 80.6C48.6 156 17.3 208 2.5 243.7c-3.3 7.9-3.3 16.7 0 24.6C17.3 304 48.6 356 95.4 399.4C142.5 443.2 207.2 480 288 480s145.5-36.8 192.6-80.6c46.8-43.5 78.1-95.4 93-131.1c3.3-7.9 3.3-16.7 0-24.6c-14.9-35.7-46.2-87.7-93-131.1C433.5 68.8 368.8 32 288 32zM432 256c0 79.5-64.5 144-144 144s-144-64.5-144-144s64.5-144 144-144s144 64.5 144 144zM288 192c0 35.3-28.7 64-64 64c-11.5 0-22.3-3-31.6-8.4c-.2 2.8-.4 5.5-.4 8.4c0 53 43 96 96 96s96-43 96-96s-43-96-96-96c-2.8 0-5.6 .1-8.4 .4c5.3 9.3 8.4 20.1 8.4 31.6z"/></svg></span></template>
        <template id=fa-eye-slash><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 640 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M38.8 5.1C28.4-3.1 13.3-1.2 5.1 9.2S-1.2 34.7 9.2 42.9l592 464c10.4 8.2 25.5 6.3 33.7-4.1s6.3-25.5-4.1-33.7L525.6 386.7c39.6-40.6 66.4-86.1 79.9-118.4c3.3-7.9 3.3-16.7 0-24.6c-14.9-35.7-46.2-87.7-93-131.1C465.5 68.8 400.8 32 320 32c-68.2 0-125 26.3-169.3 60.8L38.8 5.1zM223.1 149.5C248.6 126.2 282.7 112 320 112c79.5 0 144 64.5 144 144c0 24.9-6.3 48.3-17.4 68.7L408 294.5c5.2-11.8 8-24.8 8-38.5c0-53-43-96-96-96c-2.8 0-5.6 .1-8.4 .4c5.3 9.3 8.4 20.1 8.4 31.6c0 10.2-2.4 19.8-6.6 28.3l-90.3-70.8zm223.1 298L373 389.9c-16.4 6.5-34.3 10.1-53 10.1c-79.5 0-144-64.5-144-144c0-6.9 .5-13.6 1.4-20.2L83.1 161.5C60.3 191.2 44 220.8 34.5 243.7c-3.3 7.9-3.3 16.7 0 24.6c14.9 35.7 46.2 87.7 93 131.1C174.5 443.2 239.2 480 320 480c47.8 0 89.9-12.9 126.2-32.5z"/></svg></span></template>
        <template id=fa-copy><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M502.6 70.63l-61.25-61.25C435.4 3.371 427.2 0 418.7 0H255.1c-35.35 0-64 28.66-64 64l.0195 256C192 355.4 220.7 384 256 384h192c35.2 0 64-28.8 64-64V93.25C512 84.77 508.6 76.63 502.6 70.63zM464 320c0 8.836-7.164 16-16 16H255.1c-8.838 0-16-7.164-16-16L239.1 64.13c0-8.836 7.164-16 16-16h128L384 96c0 17.67 14.33 32 32 32h47.1V320zM272 448c0 8.836-7.164 16-16 16H63.1c-8.838 0-16-7.164-16-16L47.98 192.1c0-8.836 7.164-16 16-16H160V128H63.99c-35.35 0-64 28.65-64 64l.0098 256C.002 483.3 28.66 512 64 512h192c35.2 0 64-28.8 64-64v-32h-47.1L272 448z"/></svg></span></template>
        <template id=fa-play><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 384 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M73 39c-14.8-9.1-33.4-9.4-48.5-.9S0 62.6 0 80V432c0 17.4 9.4 33.4 24.5 41.9s33.7 8.1 48.5-.9L361 297c14.3-8.7 23-24.2 23-41s-8.7-32.2-23-41L73 39z"/></svg></span></template>
        <template id=fa-clock-rotate-left><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M75 75L41 41C25.9 25.9 0 36.6 0 57.9V168c0 13.3 10.7 24 24 24H134.1c21.4 0 32.1-25.9 17-41l-30.8-30.8C155 85.5 203 64 256 64c106 0 192 86 192 192s-86 192-192 192c-40.8 0-78.6-12.7-109.7-34.4c-14.5-10.1-34.4-6.6-44.6 7.9s-6.6 34.4 7.9 44.6C151.2 495 201.7 512 256 512c141.4 0 256-114.6 256-256S397.4 0 256 0C185.3 0 121.3 28.7 75 75zm181 53c-13.3 0-24 10.7-24 24V256c0 6.4 2.5 12.5 7 17l72 72c9.4 9.4 24.6 9.4 33.9 0s9.4-24.6 0-33.9l-65-65V152c0-13.3-10.7-24-24-24z"/></svg></span></template>


        <script>
            window.playground_copyable = true;
        </script>


        <script src="elasticlunr-ef4e11c1.min.js"></script>
        <script src="mark-09e88c2c.min.js"></script>
        <script src="searcher-c2a407aa.js"></script>

        <script src="clipboard-1626706a.min.js"></script>
        <script src="highlight-abc7f01d.js"></script>
        <script src="book-a0b12cfe.js"></script>

        <!-- Custom JS scripts -->


    </div>
    </body>
</html>