pdftract/crates/pdftract-py/python/pdftract/asyncio.py
jedarden fca8966f45 feat(pdftract-2nu0s): implement Python SDK contract conformance
Implements the Python SDK with all 9 contract methods, 8 exception
classes, type definitions, asyncio wrappers, and subprocess fallback.

Changes:
- Add Python wrapper module with extract, extract_text, extract_markdown,
  extract_stream, search, get_metadata, hash, classify, verify_receipt
- Add exception hierarchy: PdftractError base class with 7 subclasses
- Add dataclass type definitions: Document, Page, Span, Block, Match,
  Fingerprint, Classification, Metadata
- Add asyncio module with async wrappers for 4 long-running methods
- Add subprocess fallback for when native module fails to import
- Add conformance test runner under tests/test_conformance.py
- Update pyproject.toml with dynamic version from Cargo

Closes: pdftract-2nu0s
2026-05-24 08:55:11 -04:00

264 lines
7.7 KiB
Python

"""Asyncio wrappers for pdftract.
This module provides async versions of the long-running pdftract methods
using asyncio.to_thread to offload work to a thread pool.
"""
from __future__ import annotations
import asyncio
from typing import Any, Iterator, Optional
from pdftract.types import Document, Fingerprint, Match, Metadata, Page
class AsyncExtractor:
    """Async facade over the synchronous ``pdftract`` API.

    Every method hands the matching synchronous ``pdftract`` call to
    ``asyncio.to_thread`` so the event loop is not blocked while it runs.
    Positional arguments and keyword options are forwarded unchanged.
    """

    def __init__(self):
        """Bind the synchronous ``pdftract`` package to this instance."""
        # NOTE(review): imported here instead of at module top, presumably to
        # avoid a circular import with the parent package -- confirm.
        import pdftract

        self._pdftract = pdftract

    async def extract(self, source: str, **options) -> Document:
        """Async version of pdftract.extract.

        Args:
            source: Path to PDF file or URL
            **options: Extraction options

        Returns:
            Document: Extracted document
        """
        return await asyncio.to_thread(self._pdftract.extract, source, **options)

    async def extract_text(self, source: str, **options) -> str:
        """Async version of pdftract.extract_text.

        Args:
            source: Path to PDF file or URL
            **options: Extraction options

        Returns:
            str: Extracted text
        """
        return await asyncio.to_thread(
            self._pdftract.extract_text, source, **options
        )

    async def extract_markdown(self, source: str, **options) -> str:
        """Async version of pdftract.extract_markdown.

        Args:
            source: Path to PDF file or URL
            **options: Extraction options

        Returns:
            str: Extracted Markdown
        """
        return await asyncio.to_thread(
            self._pdftract.extract_markdown, source, **options
        )

    async def extract_stream(self, source: str, **options) -> AsyncPageIterator:
        """Async version of pdftract.extract_stream.

        The coroutine resolves to an async iterator, so callers write
        ``async for page in await extractor.extract_stream(...)``.

        Args:
            source: Path to PDF file or URL
            **options: Extraction options

        Returns:
            AsyncPageIterator: Async iterator yielding pages
        """
        # Build the iterator off the event loop as well: whatever work the
        # synchronous call does before returning must not stall the loop.
        sync_iterator = await asyncio.to_thread(
            self._pdftract.extract_stream, source, **options
        )
        return AsyncPageIterator(sync_iterator)

    async def search(self, source: str, pattern: str, **options) -> AsyncMatchIterator:
        """Async version of pdftract.search.

        The coroutine resolves to an async iterator, so callers write
        ``async for match in await extractor.search(...)``.

        Args:
            source: Path to PDF file or URL
            pattern: Regex pattern to search for
            **options: Extraction options

        Returns:
            AsyncMatchIterator: Async iterator yielding matches
        """
        # Same reasoning as extract_stream: create the iterator in a worker
        # thread so the synchronous call never runs on the event loop.
        sync_iterator = await asyncio.to_thread(
            self._pdftract.search, source, pattern, **options
        )
        return AsyncMatchIterator(sync_iterator)

    async def get_metadata(self, source: str, **options) -> Metadata:
        """Async version of pdftract.get_metadata.

        Args:
            source: Path to PDF file or URL
            **options: Extraction options

        Returns:
            Metadata: Document metadata
        """
        return await asyncio.to_thread(
            self._pdftract.get_metadata, source, **options
        )

    async def hash(self, source: str, **options) -> Fingerprint:
        """Async version of pdftract.hash.

        Args:
            source: Path to PDF file or URL
            **options: Extraction options

        Returns:
            Fingerprint: Document fingerprint
        """
        return await asyncio.to_thread(self._pdftract.hash, source, **options)

    async def classify(self, source: str) -> Any:
        """Async version of pdftract.classify.

        Args:
            source: Path to PDF file or URL

        Returns:
            Classification result
        """
        return await asyncio.to_thread(self._pdftract.classify, source)

    async def verify_receipt(self, path: str, receipt: dict) -> bool:
        """Async version of pdftract.verify_receipt.

        Args:
            path: Path to PDF file
            receipt: Receipt dict

        Returns:
            bool: True if receipt verifies
        """
        return await asyncio.to_thread(
            self._pdftract.verify_receipt, path, receipt
        )
class AsyncPageIterator:
    """Async iterator wrapper for sync page iterators.

    Each ``__anext__`` call advances the wrapped iterator in a worker thread
    via ``asyncio.to_thread`` and ends the async iteration once the wrapped
    iterator is exhausted.
    """

    # Default handed to ``next`` so exhaustion comes back as a value.
    # A StopIteration raised inside the worker thread cannot be delivered
    # through an asyncio future, so catching it around the await never works.
    _EXHAUSTED = object()

    def __init__(self, sync_iterator: "Iterator[Page]"):
        """Initialize the async iterator.

        Args:
            sync_iterator: Synchronous page iterator
        """
        self._sync_iterator = sync_iterator

    def __aiter__(self) -> "AsyncPageIterator":
        """Return self as async iterator."""
        return self

    async def __anext__(self) -> "Page":
        """Get the next page asynchronously.

        Returns:
            The next item produced by the wrapped iterator.

        Raises:
            StopAsyncIteration: When the wrapped iterator is exhausted.
        """
        item = await asyncio.to_thread(
            next, self._sync_iterator, self._EXHAUSTED
        )
        # Identity check: a legitimate item (even None) is never the sentinel.
        if item is self._EXHAUSTED:
            raise StopAsyncIteration
        return item
class AsyncMatchIterator:
    """Async iterator wrapper for sync match iterators.

    Each ``__anext__`` call advances the wrapped iterator in a worker thread
    via ``asyncio.to_thread`` and ends the async iteration once the wrapped
    iterator is exhausted.
    """

    # Default handed to ``next`` so exhaustion comes back as a value.
    # A StopIteration raised inside the worker thread cannot be delivered
    # through an asyncio future, so catching it around the await never works.
    _EXHAUSTED = object()

    def __init__(self, sync_iterator: "Iterator[Match]"):
        """Initialize the async iterator.

        Args:
            sync_iterator: Synchronous match iterator
        """
        self._sync_iterator = sync_iterator

    def __aiter__(self) -> "AsyncMatchIterator":
        """Return self as async iterator."""
        return self

    async def __anext__(self) -> "Match":
        """Get the next match asynchronously.

        Returns:
            The next item produced by the wrapped iterator.

        Raises:
            StopAsyncIteration: When the wrapped iterator is exhausted.
        """
        item = await asyncio.to_thread(
            next, self._sync_iterator, self._EXHAUSTED
        )
        # Identity check: a legitimate item (even None) is never the sentinel.
        if item is self._EXHAUSTED:
            raise StopAsyncIteration
        return item
# Shared extractor used by the module-level coroutines; created on first use.
_extractor: Optional[AsyncExtractor] = None


def _get_async_extractor() -> AsyncExtractor:
    """Return the shared ``AsyncExtractor``, creating it on the first call.

    The instance is stored in the module-level ``_extractor`` variable so
    later calls reuse it.
    """
    global _extractor
    if _extractor is not None:
        return _extractor
    _extractor = AsyncExtractor()
    return _extractor
# Export async functions
async def extract(source: str, **options) -> Document:
    """Async version of pdftract.extract.

    Delegates to ``AsyncExtractor.extract`` on the shared module-level
    extractor.

    Args:
        source: Path to PDF file or URL
        **options: Extraction options, forwarded unchanged

    Returns:
        Document: Extracted document
    """
    shared = _get_async_extractor()
    return await shared.extract(source, **options)
async def extract_text(source: str, **options) -> str:
    """Async version of pdftract.extract_text.

    Delegates to ``AsyncExtractor.extract_text`` on the shared module-level
    extractor.

    Args:
        source: Path to PDF file or URL
        **options: Extraction options, forwarded unchanged

    Returns:
        str: Extracted text
    """
    shared = _get_async_extractor()
    return await shared.extract_text(source, **options)
async def extract_markdown(source: str, **options) -> str:
    """Async version of pdftract.extract_markdown.

    Delegates to ``AsyncExtractor.extract_markdown`` on the shared
    module-level extractor.

    Args:
        source: Path to PDF file or URL
        **options: Extraction options, forwarded unchanged

    Returns:
        str: Extracted Markdown
    """
    shared = _get_async_extractor()
    return await shared.extract_markdown(source, **options)
async def extract_stream(source: str, **options) -> AsyncPageIterator:
    """Async version of pdftract.extract_stream.

    Delegates to ``AsyncExtractor.extract_stream`` on the shared module-level
    extractor. The coroutine must be awaited to obtain the async iterator.

    Args:
        source: Path to PDF file or URL
        **options: Extraction options, forwarded unchanged

    Returns:
        AsyncPageIterator: Async iterator yielding pages
    """
    shared = _get_async_extractor()
    return await shared.extract_stream(source, **options)
async def search(source: str, pattern: str, **options) -> AsyncMatchIterator:
    """Async version of pdftract.search.

    Delegates to ``AsyncExtractor.search`` on the shared module-level
    extractor. The coroutine must be awaited to obtain the async iterator.

    Args:
        source: Path to PDF file or URL
        pattern: Regex pattern to search for
        **options: Extraction options, forwarded unchanged

    Returns:
        AsyncMatchIterator: Async iterator yielding matches
    """
    shared = _get_async_extractor()
    return await shared.search(source, pattern, **options)
async def get_metadata(source: str, **options) -> Metadata:
    """Async version of pdftract.get_metadata.

    Delegates to ``AsyncExtractor.get_metadata`` on the shared module-level
    extractor.

    Args:
        source: Path to PDF file or URL
        **options: Extraction options, forwarded unchanged

    Returns:
        Metadata: Document metadata
    """
    shared = _get_async_extractor()
    return await shared.get_metadata(source, **options)
async def hash(source: str, **options) -> Fingerprint:
    """Async version of pdftract.hash.

    Delegates to ``AsyncExtractor.hash`` on the shared module-level
    extractor.

    Args:
        source: Path to PDF file or URL
        **options: Extraction options, forwarded unchanged

    Returns:
        Fingerprint: Document fingerprint
    """
    shared = _get_async_extractor()
    return await shared.hash(source, **options)
async def classify(source: str) -> Any:
    """Async version of pdftract.classify.

    Delegates to ``AsyncExtractor.classify`` on the shared module-level
    extractor.

    Args:
        source: Path to PDF file or URL

    Returns:
        Classification result
    """
    shared = _get_async_extractor()
    return await shared.classify(source)
async def verify_receipt(path: str, receipt: dict) -> bool:
    """Async version of pdftract.verify_receipt.

    Delegates to ``AsyncExtractor.verify_receipt`` on the shared module-level
    extractor.

    Args:
        path: Path to PDF file
        receipt: Receipt dict

    Returns:
        bool: True if receipt verifies
    """
    shared = _get_async_extractor()
    return await shared.verify_receipt(path, receipt)
# Public API of this module: the extractor class, the two async iterator
# wrappers, and the module-level coroutine helpers defined above.
__all__ = [
"AsyncExtractor",
"AsyncPageIterator",
"AsyncMatchIterator",
"extract",
"extract_text",
"extract_markdown",
"extract_stream",
"search",
"get_metadata",
"hash",
"classify",
"verify_receipt",
]