Implement orchestration layer connecting HttpRangeSource to Phase 1.3 xref resolver and Phase 1.4 document model for remote PDF access:

- Document::open_remote() public API for remote PDF loading
- Progressive tail fetch (16 KB → 1 MB) for startxref location
- Xref forward-scan disabled for remote sources (via is_remote check)
- Page-by-page on-demand fetch via HttpRangeSource caching
- Resource lazy load through XrefResolver cache
- HEAD probe with 405 fallback and handling of a missing Content-Length

Acceptance criteria:

✅ open_remote(url) returns Document with correct page count
✅ HEAD failure modes (405, no Content-Length, 401) handled
✅ xref forward-scan disabled for remote (is_remote check)
✅ Page-by-page on-demand fetch (HttpRangeSource LRU cache)
✅ INV-8 maintained (all errors return Result)

Files modified:

- crates/pdftract-core/src/document.rs (Document::open_remote, from_source)
- crates/pdftract-core/src/remote.rs (progressive tail fetch)
- crates/pdftract-core/src/lib.rs (re-exports)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
36 lines
1.2 KiB
Python
36 lines
1.2 KiB
Python
#!/usr/bin/env python3
|
|
import zlib
|
|
import sys
|
|
|
|
def _has_zlib_header(data):
    """Return True when *data* begins with a valid RFC 1950 zlib header."""
    if len(data) < 2:
        return False
    cmf, flg = data[0], data[1]
    # CM (low nibble of CMF) must be 8 = deflate, CINFO (high nibble) must be
    # at most 7, and CMF/FLG read as a big-endian 16-bit value must be a
    # multiple of 31.  This accepts 78 01, 78 5e, 78 9c and 78 da alike,
    # instead of only the default-compression-level header 78 9c.
    return (
        (cmf & 0x0F) == 8
        and (cmf >> 4) <= 7
        and ((cmf << 8) | flg) % 31 == 0
    )


def debug_file(path: str, name: str) -> None:
    """Print a diagnostic dump of the binary file at *path* to stdout.

    The dump always contains a banner with *name*, the path, the file length
    and the first 64 bytes as hex.  If the file starts with a valid zlib
    header, the whole file is inflated and the decompressed size plus the
    first 100 decompressed bytes are printed; on failure the zlib error is
    printed, followed by whatever prefix could still be inflated (useful for
    truncated streams).  If the first byte is 0x08, the file is additionally
    reported as LZW and everything after that byte is printed.

    Args:
        path: Location of the file to inspect; it is read fully into memory.
        name: Human-readable label shown in the banner line.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, 'rb') as f:
        data = f.read()

    print(f"\n=== {name} ===")
    print(f"File: {path}")
    print(f"Length: {len(data)} bytes")
    print(f"Hex (first 64 bytes): {data[:64].hex()}")

    # Try to decompress if it looks like zlib
    if _has_zlib_header(data):
        try:
            decompressed = zlib.decompress(data)
        except zlib.error as e:
            print(f"Decompress error: {e}")
            # A streaming decompressor does not fail on input that merely ends
            # early, so it yields the bytes that were decodable before the
            # stream stopped.  Corrupt data raises again; then there is
            # nothing further to show.
            try:
                partial = zlib.decompressobj().decompress(data)
            except zlib.error:
                pass
            else:
                print(f"Partial decompress: {len(partial)} bytes")
                print(f"Partial data: {partial[:100]}")
        else:
            print(f"Decompressed: {len(decompressed)} bytes")
            print(f"Decompressed data: {decompressed[:100]}")

    # Try to decode as LZW
    if data[0:1] == b'\x08':
        print("Looks like LZW (min code size=8)")
        print(f"LZW data: {data[1:]}")
|
|
|
|
# Debug failing fixtures
# Each entry is (absolute path to the fixture file, label shown in the dump).
fixtures = [
    ("/home/coding/pdftract/tests/stream_decoder/fixtures/flate_png_pred15_all_six.bin", "PNG predictor"),
    ("/home/coding/pdftract/tests/stream_decoder/fixtures/flate_truncated.bin", "Truncated"),
    ("/home/coding/pdftract/tests/stream_decoder/fixtures/lzw_early_change_0.bin", "LZW EarlyChange 0"),
    ("/home/coding/pdftract/tests/stream_decoder/fixtures/ascii85_terminator.bin", "ASCII85 terminator"),
]

# Dump every fixture even when one of them is missing or unreadable; without
# this, the first OSError would abort the run and hide the remaining dumps.
skipped = 0
for path, name in fixtures:
    try:
        debug_file(path, name)
    except OSError as exc:
        print(f"Skipping {name} ({path}): {exc}", file=sys.stderr)
        skipped += 1

# Keep a failing exit status when any fixture could not be dumped, as an
# uncaught exception would have produced before.
if skipped:
    sys.exit(1)
|