pdftract/scripts/doc_coverage.rs
jedarden f85e5149dd feat(pdftract-91e1i): HTTP fetch sequence implementation
Implement orchestration layer connecting HttpRangeSource to Phase 1.3
xref resolver and Phase 1.4 document model for remote PDF access:

- Document::open_remote() public API for remote PDF loading
- Progressive tail fetch (16 KB → 1 MB) for startxref location
- Xref forward-scan disabled for remote sources (via is_remote check)
- Page-by-page on-demand fetch via HttpRangeSource caching
- Resource lazy load through XrefResolver cache
- HEAD probe with 405 fallback and handling for a missing Content-Length header

Acceptance criteria:
 open_remote(url) returns Document with correct page count
 HEAD failure modes (405, no Content-Length, 401) handled
 xref forward-scan disabled for remote (is_remote check)
 Page-by-page on-demand fetch (HttpRangeSource LRU cache)
 INV-8 maintained (all errors return Result)

Files modified:
- crates/pdftract-core/src/document.rs (Document::open_remote, from_source)
- crates/pdftract-core/src/remote.rs (progressive tail fetch)
- crates/pdftract-core/src/lib.rs (re-exports)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 13:17:00 -04:00

152 lines
5.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""Count public items in pdftract-core and measure documentation coverage."""
import subprocess
import json
import re
from pathlib import Path
from typing import Dict, List, Tuple
def run_cargo_doc(
    project_root: Path = Path("/home/coding/pdftract"),
    package: str = "pdftract-core",
) -> str:
    """Run ``cargo doc`` for one package and return everything it printed.

    The command is ``cargo doc --no-deps --all-features -p <package>``.
    The exit status is not checked: a failing build still returns its
    captured output rather than raising.

    Args:
        project_root: Directory cargo is run in. The default is the checkout
            location this script was originally hard-wired to.
        package: Crate name passed to cargo's ``-p`` flag.

    Returns:
        Captured stdout followed by captured stderr, decoded as text.

    Raises:
        OSError: If the cargo process cannot be started (for example cargo
            is not installed or *project_root* does not exist).
    """
    result = subprocess.run(
        ["cargo", "doc", "--no-deps", "--all-features", "-p", package],
        cwd=project_root,
        capture_output=True,
        text=True,
    )
    return result.stdout + result.stderr
def has_example(doc: str) -> bool:
    """Check if documentation contains a Rust code example.

    A fenced code block counts as a Rust example when its info string is
    empty or consists only of rustdoc attributes (``rust``, ``no_run``,
    ``ignore``, ``should_panic``, ``compile_fail``, ``edition2021``, an
    ``E0123``-style error code, ...). Blocks tagged with any other language
    (``text``, ``json``, ``toml``, ...) do not count.

    Args:
        doc: Doc-comment text with one documentation line per line. May be
            empty.

    Returns:
        True if at least one Rust code fence is opened in *doc*.
    """
    if not doc:
        return False

    rust_attributes = {
        "rust",
        "ignore",
        "no_run",
        "should_panic",
        "compile_fail",
        "standalone_crate",
        "test_harness",
    }

    inside_fence = False
    for raw_line in doc.splitlines():
        line = raw_line.strip()
        if not line.startswith("```"):
            continue

        if inside_fence:
            # Only a fence made purely of backticks closes a block; anything
            # else starting with ``` is content of the open block.
            if not line.strip("`"):
                inside_fence = False
            continue

        inside_fence = True
        info_string = line.lstrip("`").strip()
        tags = [tag for tag in re.split(r"[,\s]+", info_string) if tag]
        # all() over an empty tag list is True: a bare fence is Rust.
        if all(
            tag in rust_attributes
            or tag.startswith("edition")
            or re.fullmatch(r"E\d{4}", tag)
            for tag in tags
        ):
            return True

    return False
def extract_docs_from_file(file_path: Path) -> List[Tuple[str, str, bool, str]]:
    """Extract documented public items from a Rust source file.

    The file is scanned line by line. Consecutive ``///`` lines are gathered
    as the pending doc comment. Ordinary ``//`` comments, ``//!`` module docs
    and ``#[...]`` attribute lines are skipped without discarding it. The
    next other line either declares a ``pub`` item, which is recorded, or
    clears the pending doc comment.

    Only items preceded by a ``///`` doc comment are reported; public items
    without one are not returned.
    NOTE(review): this means callers counting "total public items" are really
    counting documented public items - confirm that is the intended metric.

    Args:
        file_path: Path of the ``.rs`` file to scan; read as UTF-8.

    Returns:
        One ``(item_type, item_name, has_example, file_name)`` tuple per
        documented public item, in source order. ``item_type`` is the Rust
        keyword (``fn``, ``struct``, ...), ``item_name`` falls back to
        ``anon_<line index>`` when no identifier follows the keyword, and
        ``file_name`` is the bare file name without directories.
    """
    # `pub`, optional function/trait qualifiers, then the item keyword.
    # Qualifiers are consumed first so `pub const fn foo` is a fn named foo,
    # while `pub const FOO` still backtracks to a const named FOO.
    # `pub(crate)` and other restricted visibilities deliberately don't match.
    item_pattern = re.compile(
        r'pub\s+'
        r'(?:(?:default|const|async|unsafe|extern(?:\s*"[^"]*")?)\s+)*'
        r'(fn|struct|enum|union|trait|type|const|static|mod)\b'
        r'\s*(?:mut\s+)?(\w+)?'
    )
    # An item that shares its line with a leading attribute: `#[inline] pub fn f()`.
    item_after_attribute = re.compile(r'\]\s*(pub\b.*)$')

    items: List[Tuple[str, str, bool, str]] = []
    pending_doc: List[str] = []
    # Unclosed `[` count while inside an attribute spanning several lines.
    # Brackets are counted textually, so brackets inside string literals in
    # an attribute could confuse it.
    open_brackets = 0

    source_lines = file_path.read_text(encoding="utf-8").split('\n')
    for i, line in enumerate(source_lines):
        stripped = line.strip()

        if open_brackets > 0:
            open_brackets = max(
                open_brackets + stripped.count("[") - stripped.count("]"), 0
            )
            continue

        if stripped.startswith("//"):
            # `///` is an item doc line; `////` and `//` are plain comments
            # and `//!` is module-level documentation.
            if stripped.startswith("///") and not stripped.startswith("////"):
                pending_doc.append(stripped[3:].strip())
            continue

        if stripped.startswith("#["):
            # Attributes sit between the docs and the item; keep the docs.
            open_brackets = max(stripped.count("[") - stripped.count("]"), 0)
            if open_brackets:
                continue
            trailing = item_after_attribute.search(stripped)
            if trailing is None:
                continue
            stripped = trailing.group(1)

        if pending_doc:
            declared = item_pattern.match(stripped)
            if declared:
                item_type = declared.group(1)
                item_name = declared.group(2) or f"anon_{i}"
                doc_text = "\n".join(pending_doc)
                items.append(
                    (item_type, item_name, has_example(doc_text), file_path.name)
                )
            # Any other line ends the doc comment, whether or not it matched.
            pending_doc = []

    return items
def main() -> int:
    """Report documentation-example coverage for pdftract-core.

    Runs ``cargo doc`` via ``run_cargo_doc`` and echoes any warning lines.
    Then scans every ``.rs`` file under the crate's ``src`` directory
    (skipping ``lib.rs``) with ``extract_docs_from_file`` and prints overall,
    per-item-type and per-high-value-module coverage.

    Returns:
        0 if the share of items with examples meets the threshold, else 1.
    """
    threshold = 80  # required percentage of items that have an example
    src_dir = Path("/home/coding/pdftract/crates/pdftract-core/src")

    print("Checking pdftract-core documentation coverage...\n")

    # First, run cargo doc to check for warnings
    print("Running cargo doc --no-deps --all-features...")
    cargo_output = run_cargo_doc()
    has_warnings = "warning:" in cargo_output
    has_missing_docs = "missing documentation" in cargo_output
    if has_warnings:
        print("⚠️ Warnings found:")
        for line in cargo_output.split('\n'):
            if 'warning:' in line.lower():
                print(f" {line.strip()}")
    elif has_missing_docs:
        print("❌ Missing documentation warnings found")
    else:
        print("✅ No warnings - cargo doc passes!")

    print("\nScanning source files for public items with examples...")
    all_items: List[Tuple[str, str, bool, str]] = []
    for rs_file in src_dir.rglob("*.rs"):
        if rs_file.name == "lib.rs":
            continue  # Already well-documented
        # Key items by their path relative to src/ rather than the bare file
        # name, so entries like "parser/mod.rs" below can actually match and
        # the many mod.rs files stay distinguishable.
        rel_path = rs_file.relative_to(src_dir).as_posix()
        all_items.extend(
            (item_type, item_name, has_ex, rel_path)
            for item_type, item_name, has_ex, _ in extract_docs_from_file(rs_file)
        )

    # Overall figures
    total_items = len(all_items)
    items_with_examples = sum(1 for _, _, has_ex, _ in all_items if has_ex)
    coverage = (items_with_examples / total_items * 100) if total_items > 0 else 0
    print("\n📊 Documentation Coverage:")
    print(f" Total public items: {total_items}")
    print(f" With examples: {items_with_examples}")
    print(f" Coverage: {coverage:.1f}%")

    # Breakdown by item type
    by_type: Dict[str, List[Tuple[str, bool, str]]] = {}
    for item_type, item_name, has_ex, file_name in all_items:
        by_type.setdefault(item_type, []).append((item_name, has_ex, file_name))
    print("\n📋 By item type:")
    for item_type, items in sorted(by_type.items()):
        with_ex = sum(1 for _, h, _ in items if h)
        total = len(items)
        cov = with_ex / total * 100
        print(f" {item_type}: {with_ex}/{total} ({cov:.0f}%)")

    # Find high-value modules needing examples
    print("\n🔍 High-value modules needing examples:")
    high_value_modules = [
        "extract.rs", "document.rs", "parser/mod.rs", "span/mod.rs",
        "table/mod.rs", "layout/mod.rs", "output/mod.rs"
    ]
    for mod_name in high_value_modules:
        mod_flags = [h for _, _, h, f in all_items if f == mod_name]
        if not mod_flags:
            continue
        with_ex = sum(1 for h in mod_flags if h)
        total = len(mod_flags)
        cov = with_ex / total * 100
        if cov < threshold:
            print(f" {mod_name}: {with_ex}/{total} ({cov:.0f}%)")

    # Check against threshold. Integer arithmetic keeps the pass decision and
    # the "need N more" figure consistent: required is ceil(threshold% of
    # total), so a failing run never reports that 0 more items are needed
    # (unless there are no items at all).
    required = (threshold * total_items + 99) // 100
    if total_items > 0 and items_with_examples >= required:
        print(f"\n✅ PASS: {coverage:.1f}% >= {threshold}% threshold")
        return 0
    print(f"\n❌ FAIL: {coverage:.1f}% < {threshold}% threshold")
    print(f" Need {required - items_with_examples} more items with examples")
    return 1
if __name__ == "__main__":
    # Raise SystemExit directly: the bare `exit` helper is injected by the
    # `site` module for interactive use and is missing under `python -S`.
    raise SystemExit(main())