Implements the Python SDK with all 9 contract methods, 8 exception classes, type definitions, asyncio wrappers, and subprocess fallback.

Changes:
- Add Python wrapper module with extract, extract_text, extract_markdown, extract_stream, search, get_metadata, hash, classify, verify_receipt
- Add exception hierarchy: PdftractError base class with 7 subclasses
- Add dataclass type definitions: Document, Page, Span, Block, Match, Fingerprint, Classification, Metadata
- Add asyncio module with async wrappers for 4 long-running methods
- Add subprocess fallback for when native module fails to import
- Add conformance test runner under tests/test_conformance.py
- Update pyproject.toml with dynamic version from Cargo

Closes: pdftract-2nu0s
89 lines
2 KiB
Python
89 lines
2 KiB
Python
"""Exception hierarchy for pdftract.

All pdftract exceptions inherit from PdftractError.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
class PdftractError(Exception):
    """Base exception for all pdftract errors.

    This is raised when extraction fails for reasons not covered
    by more specific exception types.
    """
|
|
|
|
|
|
class CorruptPdfError(PdftractError):
    """Raised when the PDF file is corrupted or malformed.

    This indicates the PDF structure is invalid or the file
    is not a valid PDF document.
    """
|
|
|
|
|
|
class EncryptionError(PdftractError):
    """Raised when a PDF is encrypted and the password is missing or wrong.

    This covers both the case where no password was provided and the
    case where the provided password is incorrect.

    Supply the correct password via the `password` option:

        pdftract.extract("encrypted.pdf", password="secret")
    """
|
|
|
|
|
|
class SourceUnreachableError(PdftractError):
    """Raised when the PDF source (file or URL) cannot be accessed.

    For files: check the path and file permissions.
    For URLs: check network connectivity and URL validity.
    """
|
|
|
|
|
|
class RemoteFetchInterruptedError(PdftractError):
    """Raised when a remote fetch is interrupted.

    This can happen due to network timeouts, connection drops,
    or server issues during URL fetching.
    """
|
|
|
|
|
|
class TlsError(PdftractError):
    """Raised when TLS/SSL certificate validation fails.

    This indicates a problem with the HTTPS connection,
    such as an invalid certificate or TLS protocol mismatch.
    """
|
|
|
|
|
|
class ReceiptVerifyError(PdftractError):
    """Raised when receipt verification fails.

    This can happen when:

    - The PDF fingerprint doesn't match
    - No span has sufficient bbox overlap
    - The content hash doesn't match
    """
|
|
|
|
|
|
class UnsupportedOperationError(PdftractError):
    """Raised when calling a method not supported by the binary version.

    This can happen when using features added in newer binary versions
    with an older binary.
    """
|