Implement orchestration layer connecting HttpRangeSource to the Phase 1.3 xref resolver and the Phase 1.4 document model for remote PDF access:
- Document::open_remote() public API for remote PDF loading
- Progressive tail fetch (16 KB → 1 MB) to locate startxref
- Xref forward-scan disabled for remote sources (via is_remote check)
- Page-by-page on-demand fetch via HttpRangeSource caching
- Lazy resource loading through the XrefResolver cache
- HEAD probe with fallbacks for a 405 response and for a missing Content-Length header

Acceptance criteria:
✅ open_remote(url) returns Document with correct page count
✅ HEAD failure modes (405, no Content-Length, 401) handled
✅ xref forward-scan disabled for remote (is_remote check)
✅ Page-by-page on-demand fetch (HttpRangeSource LRU cache)
✅ INV-8 maintained (all errors return Result)

Files modified:
- crates/pdftract-core/src/document.rs (Document::open_remote, from_source)
- crates/pdftract-core/src/remote.rs (progressive tail fetch)
- crates/pdftract-core/src/lib.rs (re-exports)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
113 lines
3.6 KiB
Python
113 lines
3.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Measure rustdoc coverage for pdftract-core.
|
|
|
|
This script counts:
|
|
- Total public items (pub fn/struct/enum/trait/type/const)
|
|
- Items with /// doc comments (excluding module-level //!)
|
|
- Items with worked examples (```rust blocks)
|
|
|
|
Usage:
|
|
python3 scripts/doc_coverage.py
|
|
"""
|
|
import re
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Tuple
|
|
|
|
# Top-level (column-0) `pub` item declarations.  Function qualifiers
# (`const`, `async`, `unsafe`, `extern "ABI"`) may sit between `pub` and the
# item keyword; they are skipped with non-capturing groups so a match still
# exposes exactly two groups: (item keyword, item name).
PUBLIC_ITEM_RE = re.compile(
    r'^pub\s+'
    r'(?:(?:const|async|unsafe)\s+)*'
    r'(?:extern\s+(?:"[^"]*"\s+)?)?'
    r'(fn|struct|enum|trait|type|const|mod)\s+(\w+)'
)
# Exactly three slashes: a line starting with four or more is an ordinary
# comment in Rust, not a doc comment.
DOC_COMMENT_RE = re.compile(r'^///(?!/)')
EXAMPLE_RE = re.compile(r'```rust[^`]*```', re.MULTILINE)


def _preceding_doc_lines(lines: List[str], item_index: int) -> List[str]:
    """Return the `///` lines attached to the item at *item_index*, in file order.

    Walks upward from the item, stepping over blank lines, `//!` lines and
    single-line `#[...]` attributes, and stops at the first line of any
    other kind.  Multi-line attributes are not recognised and end the walk.
    """
    collected: List[str] = []
    cursor = item_index - 1
    while cursor >= 0:
        candidate = lines[cursor]
        if DOC_COMMENT_RE.match(candidate):
            collected.append(candidate)
        elif not (candidate.strip() == '' or candidate.startswith(('//!', '#['))):
            break
        cursor -= 1
    # Lines were gathered bottom-up; restore top-down order so that an
    # opening ```rust fence precedes its closing fence when searched.
    collected.reverse()
    return collected


def count_public_items(filepath: Path) -> Tuple[int, int, int]:
    """Count public items, documented items, and items with examples in a file.

    Only declarations that start at column 0 and match ``PUBLIC_ITEM_RE``
    are considered.  An item counts as documented when at least one ``///``
    line precedes it (blank lines, ``//!`` lines and single-line ``#[...]``
    attributes may sit in between).  It counts as having an example when
    those doc lines contain a fenced block matching ``EXAMPLE_RE``.

    Args:
        filepath: Path of the Rust source file to scan; read as UTF-8.

    Returns:
        A ``(total_items, with_doc, with_example)`` tuple.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Rust source files are UTF-8; do not depend on the locale's encoding.
    lines = filepath.read_text(encoding='utf-8').split('\n')

    total_items = 0
    with_doc = 0
    with_example = 0

    for index, line in enumerate(lines):
        if not PUBLIC_ITEM_RE.match(line):
            continue
        total_items += 1

        # Doc comments always precede the item they describe, so only look
        # backwards: `///` lines that follow belong to the next item.
        doc_lines = _preceding_doc_lines(lines, index)
        if not doc_lines:
            continue
        with_doc += 1

        if EXAMPLE_RE.search('\n'.join(doc_lines)):
            with_example += 1

    return total_items, with_doc, with_example
|
|
|
|
|
|
def _percent(part: int, whole: int) -> float:
    """Return *part* as a percentage of *whole*; 0.0 when *whole* is zero."""
    return 100 * part / whole if whole else 0.0


def _locate_core_src() -> Path:
    """Pick the pdftract-core source directory to scan.

    Prefers ``<repo>/crates/pdftract-core/src`` relative to this script,
    assuming the script lives in ``<repo>/scripts/`` as the module's usage
    line suggests.  Falls back to the original absolute path when that
    directory does not exist, so existing setups keep working.
    """
    fallback = Path('/home/coding/pdftract/crates/pdftract-core/src')
    script_path = globals().get('__file__')
    if script_path:
        candidate = (
            Path(script_path).resolve().parent.parent
            / 'crates' / 'pdftract-core' / 'src'
        )
        if candidate.is_dir():
            return candidate
    return fallback


def main():
    """Print a documentation-coverage report for pdftract-core to stdout.

    Scans every ``*.rs`` file under the core source directory (skipping
    anything under ``parser/primitives``), sums the per-file counts from
    ``count_public_items``, and prints the totals followed by the 20 files
    with the most undocumented public items.
    """
    core_src = _locate_core_src()

    total_items = 0
    total_with_doc = 0
    total_with_example = 0

    file_counts: Dict[str, Tuple[int, int, int]] = {}

    # Sorted so that ties in the ranking below come out in a stable order
    # instead of whatever order the filesystem happens to return.
    rs_files = sorted(core_src.rglob('*.rs')) if core_src.is_dir() else []
    if not rs_files:
        import sys
        print(f"warning: no Rust sources found under {core_src}", file=sys.stderr)

    for rs_file in rs_files:
        # as_posix() keeps the separator check working on Windows paths too.
        if 'parser/primitives' in rs_file.as_posix():
            continue  # Skip generated files

        items, docs, examples = count_public_items(rs_file)
        if items > 0:
            file_counts[str(rs_file.relative_to(core_src))] = (items, docs, examples)
            total_items += items
            total_with_doc += docs
            total_with_example += examples

    print("pdftract-core Documentation Coverage")
    print("=" * 60)
    print(f"Total public items: {total_items}")
    # _percent() guards the empty case, which used to raise ZeroDivisionError.
    print(f"Items with doc comments: {total_with_doc} ({_percent(total_with_doc, total_items):.1f}%)")
    print(f"Items with worked examples: {total_with_example} ({_percent(total_with_example, total_items):.1f}%)")
    print()

    # Top 20 files by number of undocumented public items
    print("Top 20 files needing documentation:")
    sorted_files = sorted(
        file_counts.items(),
        key=lambda x: (x[1][0] - x[1][1], x[1][0]),  # Sort by undocumented count, then total
        reverse=True,
    )
    for rel_path, (items, docs, examples) in sorted_files[:20]:
        coverage = _percent(docs, items)
        print(f" {coverage:5.1f}% ({items:3d} items, {docs:3d} docs, {examples:3d} examples) {rel_path}")
|
|
|
|
|
|
# Run the report only when this file is executed as a script, not when it
# is imported.  main() returns None, so the process exits with status 0.
if __name__ == '__main__':
    raise SystemExit(main())
|