"""Conformance tests for pdftract Python SDK. This module runs the shared conformance suite via the Python API and reports per-case pass/fail results. """ from __future__ import annotations import json import os import sys import time from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import Any import pytest # Import pdftract try: import pdftract from pdftract import ( Document, EncryptionError, Page, PdftractError, extract, extract_text, ) _native_available = True except ImportError as e: pytest.skip(f"pdftract not available: {e}", allow_module_level=True) _native_available = False # Test fixtures directory FIXTURES_DIR = Path(__file__).parent.parent.parent / "tests" / "fixtures" class TestConformance: """Conformance tests for the pdftract Python SDK.""" def test_extract_basic(self): """Test basic extraction returns a Document with correct structure.""" fixture_path = FIXTURES_DIR / "valid-minimal.pdf" if not fixture_path.exists(): pytest.skip(f"Fixture not found: {fixture_path}") result = pdftract.extract(str(fixture_path)) # Should return a Document object (not a raw dict) assert isinstance(result, Document), f"Expected Document, got {type(result)}" # Should have metadata assert hasattr(result, "metadata") assert result.metadata.page_count >= 1 # Should have pages assert hasattr(result, "pages") assert len(result.pages) >= 1 # Each page should be a Page object for page in result.pages: assert isinstance(page, Page), f"Expected Page, got {type(page)}" assert hasattr(page, "page_index") assert hasattr(page, "spans") assert hasattr(page, "blocks") def test_extract_text_returns_string(self): """Test extract_text returns a plain-text string.""" fixture_path = FIXTURES_DIR / "valid-minimal.pdf" if not fixture_path.exists(): pytest.skip(f"Fixture not found: {fixture_path}") result = pdftract.extract_text(str(fixture_path)) # Should return a string assert isinstance(result, str), f"Expected str, got {type(result)}" # Should not be empty for valid PDF # (minimal.pdf may have no text, so we just check it doesn't error) assert isinstance(result, str) def test_extract_nonexistent_raises_error(self): """Test extract with nonexistent path raises PdftractError.""" with pytest.raises(PdftractError): pdftract.extract("/nonexistent/path/that/does/not/exist.pdf") def test_exception_hierarchy(self): """Test that all exception classes are defined and inherit correctly.""" # Base exception assert hasattr(pdftract, "PdftractError") assert issubclass(pdftract.PdftractError, Exception) # Specific exceptions should inherit from PdftractError assert hasattr(pdftract, "CorruptPdfError") assert issubclass(pdftract.CorruptPdfError, pdftract.PdftractError) assert hasattr(pdftract, "EncryptionError") assert issubclass(pdftract.EncryptionError, pdftract.PdftractError) assert hasattr(pdftract, "SourceUnreachableError") assert issubclass(pdftract.SourceUnreachableError, pdftract.PdftractError) assert hasattr(pdftract, "RemoteFetchInterruptedError") assert issubclass(pdftract.RemoteFetchInterruptedError, pdftract.PdftractError) assert hasattr(pdftract, "TlsError") assert issubclass(pdftract.TlsError, pdftract.PdftractError) assert hasattr(pdftract, "ReceiptVerifyError") assert issubclass(pdftract.ReceiptVerifyError, pdftract.PdftractError) assert hasattr(pdftract, "UnsupportedOperationError") assert issubclass(pdftract.UnsupportedOperationError, pdftract.PdftractError) def test_types_are_dataclasses(self): """Test that type definitions are frozen dataclasses.""" from dataclasses import is_dataclass # Document type assert hasattr(pdftract, "Document") assert is_dataclass(pdftract.Document) # Page type assert hasattr(pdftract, "Page") assert is_dataclass(pdftract.Page) # Span type assert hasattr(pdftract, "Span") assert is_dataclass(pdftract.Span) # Block type assert hasattr(pdftract, "Block") assert is_dataclass(pdftract.Block) # Match type assert hasattr(pdftract, "Match") assert is_dataclass(pdftract.Match) # Fingerprint type assert hasattr(pdftract, "Fingerprint") assert is_dataclass(pdftract.Fingerprint) # Classification type assert hasattr(pdftract, "Classification") assert is_dataclass(pdftract.Classification) # Metadata type assert hasattr(pdftract, "Metadata") assert is_dataclass(pdftract.Metadata) def test_extract_stream_returns_iterator(self): """Test extract_stream returns an iterator of Page objects.""" fixture_path = FIXTURES_DIR / "valid-minimal.pdf" if not fixture_path.exists(): pytest.skip(f"Fixture not found: {fixture_path}") result = pdftract.extract_stream(str(fixture_path)) # Should return an iterator assert hasattr(result, "__iter__") # Should yield Page objects pages = list(result) assert len(pages) >= 1 assert all(isinstance(p, Page) for p in pages) def test_extract_with_options(self): """Test extract with various options.""" fixture_path = FIXTURES_DIR / "valid-minimal.pdf" if not fixture_path.exists(): pytest.skip(f"Fixture not found: {fixture_path}") # Test with boolean option result = pdftract.extract(str(fixture_path), include_invisible=True) assert isinstance(result, Document) # Test with list option result = pdftract.extract(str(fixture_path), ocr_language=["eng"]) assert isinstance(result, Document) # Test with numeric option result = pdftract.extract(str(fixture_path), max_decompress_gb=2) assert isinstance(result, Document) def test_asyncio_module_exists(self): """Test that asyncio module is available.""" assert hasattr(pdftract, "asyncio") # Check for key async functions assert hasattr(pdftract.asyncio, "extract") assert hasattr(pdftract.asyncio, "extract_text") assert hasattr(pdftract.asyncio, "extract_stream") @pytest.mark.asyncio async def test_asyncio_extract(self): """Test asyncio.extract works.""" fixture_path = FIXTURES_DIR / "valid-minimal.pdf" if not fixture_path.exists(): pytest.skip(f"Fixture not found: {fixture_path}") result = await pdftract.asyncio.extract(str(fixture_path)) assert isinstance(result, Document) def test_version_defined(self): """Test that __version__ is defined.""" assert hasattr(pdftract, "__version__") assert isinstance(pdftract.__version__, str) def test_gil_released_during_extraction(self): """Critical test #5 (plan line 2093): Python threading test. 4 threads each extracting different PDFs simultaneously; no deadlock. Wallclock time should be < (4 * single-extract-time) / 2 to prove parallelism. """ # Find some test PDFs test_pdfs = [ FIXTURES_DIR / "tagged-suspects-true-high-coverage.pdf", FIXTURES_DIR / "tagged-suspects-false.pdf", FIXTURES_DIR / "page_class" / "vector_pure" / "source.pdf", FIXTURES_DIR / "page_class" / "hybrid_header_body" / "source.pdf", ] # Filter to only existing PDFs existing_pdfs = [p for p in test_pdfs if p.exists()] if len(existing_pdfs) < 2: pytest.skip(f"Need at least 2 PDFs for parallelism test, found {len(existing_pdfs)}") # Measure single-threaded time (sequential) start = time.time() for pdf_path in existing_pdfs: pdftract.extract(str(pdf_path)) sequential_time = time.time() - start # Measure multi-threaded time (parallel) start = time.time() with ThreadPoolExecutor(max_workers=len(existing_pdfs)) as executor: list(executor.map(lambda p: pdftract.extract(str(p)), existing_pdfs)) parallel_time = time.time() - start # Parallel time should be significantly less than sequential time # For 4 PDFs, ideal parallelism is 4x, so we expect at least 2x speedup # The criterion is: parallel_time < (4 * sequential_time) / 2 = 2 * sequential_time # This is a very weak check (basically just ensuring we're not 4x slower) max_expected_time = 2.0 * sequential_time speedup = sequential_time / parallel_time if parallel_time > 0 else 0 assert parallel_time < max_expected_time, ( f"GIL not properly released: parallel_time={parallel_time:.3f}s, " f"sequential_time={sequential_time:.3f}s, max_expected={max_expected_time:.3f}s, " f"speedup={speedup:.2f}x" ) print(f"GIL release test: sequential={sequential_time:.3f}s, parallel={parallel_time:.3f}s, speedup={speedup:.2f}x") class TestSubprocessFallback: """Tests for subprocess fallback when native module is unavailable.""" def test_fallback_module_exists(self): """Test that fallback module can be imported.""" from pdftract.fallback import SubprocessExtractor assert SubprocessExtractor is not None def test_fallback_extractor_finds_cli(self): """Test that SubprocessExtractor can find the CLI binary.""" from pdftract.fallback import SubprocessExtractor # This may fail if pdftract is not installed, but we test # the logic works try: extractor = SubprocessExtractor() assert extractor.cli_path is not None except PdftractError: # CLI not found, which is OK for this test pass def run_conformance_suite() -> dict[str, Any]: """Run the conformance suite and return results. Returns: Dict with pass/fail counts and details """ import traceback results = { "total": 0, "passed": 0, "failed": 0, "skipped": 0, "tests": [], } # Get all test methods test_class = TestConformance test_methods = [ getattr(test_class, name) for name in dir(test_class) if name.startswith("test_") and callable(getattr(test_class, name)) ] for test_method in test_methods: test_name = test_method.__name__ results["total"] += 1 try: test_instance = test_class() test_method() results["passed"] += 1 results["tests"].append({"name": test_name, "status": "PASS"}) except pytest.skip.Exception as e: results["skipped"] += 1 results["tests"].append({"name": test_name, "status": "SKIP", "reason": str(e)}) except Exception as e: results["failed"] += 1 results["tests"].append( { "name": test_name, "status": "FAIL", "error": str(e), "traceback": traceback.format_exc(), } ) return results if __name__ == "__main__": # Run conformance suite when executed directly print("Running pdftract Python SDK conformance suite...") print() results = run_conformance_suite() print(f"Results: {results['passed']}/{results['total']} passed") print(f" Passed: {results['passed']}") print(f" Failed: {results['failed']}") print(f" Skipped: {results['skipped']}") print() # Print failed tests if results["failed"] > 0: print("Failed tests:") for test in results["tests"]: if test["status"] == "FAIL": print(f" - {test['name']}: {test.get('error', 'Unknown error')}") print() # Print summary as JSON for CI print(json.dumps(results, indent=2)) # Exit with error code if any tests failed sys.exit(0 if results["failed"] == 0 else 1)