pdftract/scripts/doc_coverage.py
jedarden f85e5149dd feat(pdftract-91e1i): HTTP fetch sequence implementation
Implement orchestration layer connecting HttpRangeSource to Phase 1.3
xref resolver and Phase 1.4 document model for remote PDF access:

- Document::open_remote() public API for remote PDF loading
- Progressive tail fetch (16 KB → 1 MB) for startxref location
- Xref forward-scan disabled for remote sources (via is_remote check)
- Page-by-page on-demand fetch via HttpRangeSource caching
- Resource lazy load through XrefResolver cache
- HEAD probe with 405 fallback; handles a missing Content-Length header

Acceptance criteria:
 open_remote(url) returns Document with correct page count
 HEAD failure modes (405, no Content-Length, 401) handled
 xref forward-scan disabled for remote (is_remote check)
 Page-by-page on-demand fetch (HttpRangeSource LRU cache)
 INV-8 maintained (all errors return Result)

Files modified:
- crates/pdftract-core/src/document.rs (Document::open_remote, from_source)
- crates/pdftract-core/src/remote.rs (progressive tail fetch)
- crates/pdftract-core/src/lib.rs (re-exports)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 13:17:00 -04:00

113 lines
3.6 KiB
Python

#!/usr/bin/env python3
"""
Measure rustdoc coverage for pdftract-core.
This script counts:
- Total public items declared at column 0 (pub fn/struct/enum/trait/type/const/mod)
- Items with /// doc comments (excluding module-level //!)
- Items with worked examples (```rust blocks)
Usage:
python3 scripts/doc_coverage.py
"""
import re
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple
# Matches a public item declared at column 0 and captures (kind, name).
# Function qualifiers (`const`, `async`, `unsafe`, `extern "ABI"`) may sit
# between `pub` and the item keyword; they are skipped so that e.g.
# `pub async fn fetch` is counted and `pub const fn new` is seen as a fn.
PUBLIC_ITEM_RE = re.compile(
    r'^pub (?:(?:const|async|unsafe|extern(?:\s+"[^"]*")?)\s+)*'
    r'(fn|struct|enum|trait|type|const|mod)\s+(\w+)'
)
# Matches an outer doc-comment line (`///`) at column 0.
DOC_COMMENT_RE = re.compile(r'^///')
# Matches a fenced ```rust ... ``` block inside joined doc-comment text.
EXAMPLE_RE = re.compile(r'```rust[^`]*```', re.MULTILINE)


def _collect_doc_lines(lines: List[str], item_index: int) -> List[str]:
    """Return the `///` lines documenting the item at ``item_index``, in source order."""
    collected = []
    j = item_index - 1
    while j >= 0:
        candidate = lines[j]
        if DOC_COMMENT_RE.match(candidate):
            collected.append(candidate)
        elif candidate.strip() != '' and not candidate.startswith(('//!', '#[')):
            # Anything other than a blank line, a module-level `//!` line or a
            # single-line `#[...]` attribute ends the item's leading block.
            break
        j -= 1
    # The scan walks upwards, so restore top-to-bottom order; otherwise the
    # closing fence of an example would precede its "```rust" opener.
    collected.reverse()
    return collected


def count_public_items(filepath: Path) -> Tuple[int, int, int]:
    """Count public items, documented items, and items with examples in a file.

    An item is a line matching ``PUBLIC_ITEM_RE`` (declared at column 0). It is
    documented when at least one `///` line sits directly above it, optionally
    separated by blank lines, `//!` lines or single-line `#[...]` attributes.
    Doc comments are only looked for above the item: `///` lines that follow an
    item belong to the next item and are not credited to this one.

    Args:
        filepath: Path of the Rust source file to scan (read as UTF-8).

    Returns:
        A tuple ``(total_items, with_doc, with_example)`` where ``with_example``
        counts documented items whose doc text contains a ```rust fenced block.
    """
    lines = filepath.read_text(encoding='utf-8').split('\n')
    total_items = 0
    with_doc = 0
    with_example = 0
    for index, line in enumerate(lines):
        if not PUBLIC_ITEM_RE.match(line):
            continue
        total_items += 1
        doc_lines = _collect_doc_lines(lines, index)
        if not doc_lines:
            continue
        with_doc += 1
        if EXAMPLE_RE.search('\n'.join(doc_lines)):
            with_example += 1
    return total_items, with_doc, with_example
def _percent(part: int, whole: int) -> float:
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is 0."""
    return 100 * part / whole if whole > 0 else 0.0


def main():
    """Print a documentation-coverage report for the pdftract-core sources.

    Scans every ``*.rs`` file under the crate's ``src`` directory (skipping
    paths containing ``parser/primitives``), sums the per-file counts from
    ``count_public_items`` and prints the totals followed by the 20 files with
    the most undocumented public items. Prints zero totals instead of failing
    when no public items (or no source directory) are found.
    """
    # Prefer the checkout this script lives in (<repo>/scripts/<this file>);
    # fall back to the original absolute location when that layout is absent.
    repo_src = Path(__file__).resolve().parent.parent / 'crates' / 'pdftract-core' / 'src'
    if repo_src.is_dir():
        core_src = repo_src
    else:
        core_src = Path('/home/coding/pdftract/crates/pdftract-core/src')

    total_items = 0
    total_with_doc = 0
    total_with_example = 0
    file_counts: Dict[str, Tuple[int, int, int]] = {}

    # Sorted so the report (including tie order) is stable between runs.
    rust_files = sorted(core_src.rglob('*.rs')) if core_src.is_dir() else []
    for rs_file in rust_files:
        # as_posix() keeps the check independent of the platform separator.
        if 'parser/primitives' in rs_file.as_posix():
            continue  # Skip generated files
        items, docs, examples = count_public_items(rs_file)
        if items > 0:
            file_counts[str(rs_file.relative_to(core_src))] = (items, docs, examples)
            total_items += items
            total_with_doc += docs
            total_with_example += examples

    print("pdftract-core Documentation Coverage")
    print("=" * 60)
    print(f"Total public items: {total_items}")
    print(f"Items with doc comments: {total_with_doc} ({_percent(total_with_doc, total_items):.1f}%)")
    print(f"Items with worked examples: {total_with_example} ({_percent(total_with_example, total_items):.1f}%)")
    print()

    # Top 20 files ranked by undocumented item count, then by total item count.
    print("Top 20 files needing documentation:")
    sorted_files = sorted(
        file_counts.items(),
        key=lambda entry: (entry[1][0] - entry[1][1], entry[1][0]),
        reverse=True,
    )
    for rel_path, (items, docs, examples) in sorted_files[:20]:
        coverage = _percent(docs, items)
        print(f" {coverage:5.1f}% ({items:3d} items, {docs:3d} docs, {examples:3d} examples) {rel_path}")


# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()