Implement orchestration layer connecting HttpRangeSource to Phase 1.3 xref resolver and Phase 1.4 document model for remote PDF access:

- Document::open_remote() public API for remote PDF loading
- Progressive tail fetch (16 KB → 1 MB) for startxref location
- Xref forward-scan disabled for remote sources (via is_remote check)
- Page-by-page on-demand fetch via HttpRangeSource caching
- Resource lazy load through XrefResolver cache
- HEAD probe with 405 fallback, no Content-Length handling

Acceptance criteria:

✅ open_remote(url) returns Document with correct page count
✅ HEAD failure modes (405, no Content-Length, 401) handled
✅ xref forward-scan disabled for remote (is_remote check)
✅ Page-by-page on-demand fetch (HttpRangeSource LRU cache)
✅ INV-8 maintained (all errors return Result)

Files modified:

- crates/pdftract-core/src/document.rs (Document::open_remote, from_source)
- crates/pdftract-core/src/remote.rs (progressive tail fetch)
- crates/pdftract-core/src/lib.rs (re-exports)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
137 lines
4.7 KiB
Python
137 lines
4.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Script to analyze rustdoc coverage in pdftract-core.
|
|
|
|
Measures:
|
|
- Total public items (pub fn, pub struct, pub enum, pub trait, pub type, pub mod)
|
|
- Public items with documentation
|
|
- Public items with worked examples (```rust or ```ignore blocks)
|
|
"""
|
|
import subprocess
|
|
import re
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass
|
|
from typing import Dict, List
|
|
|
|
@dataclass
class ModuleStats:
    """Rustdoc coverage counters for one module or for a whole crate.

    ``items`` is declared with a ``None`` placeholder and replaced by a fresh
    list in ``__post_init__``, so two instances never share the same list.
    """

    # Number of public item declarations found.
    total: int = 0
    # How many of those items have a doc comment.
    with_doc: int = 0
    # How many of those items have a code example in their docs.
    with_example: int = 0
    # One descriptive string per counted item.
    items: List[str] = None

    def __post_init__(self):
        """Give the instance its own empty ``items`` list when none was passed."""
        self.items = [] if self.items is None else self.items
|
|
|
|
def run_rg(pattern: str, path: Path, cwd: str = "/home/coding/pdftract") -> str:
    """Run ripgrep over Rust sources and return its standard output.

    Args:
        pattern: Regular expression to search for. It may start with ``-``.
        path: File or directory to search; a relative path is resolved by
            ``rg`` against *cwd*.
        cwd: Working directory for the ``rg`` process. Defaults to the
            checkout location this script was written for.

    Returns:
        Whatever ``rg`` wrote to stdout: matching lines with line numbers and
        10 lines of trailing context each. The exit status and stderr are not
        inspected.

    Raises:
        OSError: Propagated from ``subprocess.run`` when the process cannot
            be started (for example ``rg`` is not installed or *cwd* does
            not exist).
    """
    command = [
        "rg",
        "-n",
        "-A", "10",
        "--type", "rust",
        # -e marks the next argument as the pattern even when it begins with
        # "-"; "--" ends option parsing so the path is never read as a flag.
        "-e", pattern,
        "--",
        str(path),
    ]
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        cwd=cwd,
    )
    return completed.stdout
|
|
|
|
# One public item declaration of a kind this script counts.
_PUB_ITEM_RE = re.compile(r"\bpub\s+(?:fn|struct|enum|trait|type|mod)\s+\w+")

# Fence openers that mark a doc block as containing a worked example.
_EXAMPLE_FENCES = ("```rust", "```ignore")


def _is_outer_doc(text: str) -> bool:
    """Return True when the stripped line *text* is a ``///`` doc comment.

    ``////`` (four or more slashes) is an ordinary comment in Rust, and
    ``//!`` documents the enclosing module rather than the item below it,
    so neither counts here.
    """
    return text.startswith("///") and not text.startswith("////")


def _doc_block_above(lines: List[str], index: int) -> List[str]:
    """Collect the ``///`` lines attached to the item on ``lines[index]``.

    Walks upwards from the line just above the item, stepping over blank
    lines, ``#[...]`` attribute lines and plain ``//`` comments, and stops at
    the first line of real code so a neighbouring item's docs are never
    picked up. The lines are returned stripped, nearest-first.
    """
    doc_lines: List[str] = []
    for pos in range(index - 1, -1, -1):
        text = lines[pos].strip()
        if _is_outer_doc(text):
            doc_lines.append(text)
        elif text and not text.startswith(("//", "#")):
            # Code belonging to something else: the item's own block ends here.
            break
    return doc_lines


def analyze_module(module_path: Path) -> ModuleStats:
    """Analyze a single Rust source file for rustdoc coverage.

    Each ``pub fn`` / ``pub struct`` / ``pub enum`` / ``pub trait`` /
    ``pub type`` / ``pub mod`` declaration on a non-comment line counts as one
    public item. An item is documented when a ``///`` comment sits in the
    block directly above it (attributes, blank lines and plain comments in
    between are allowed). It has an example when one of those doc lines
    contains a ```` ```rust ```` or ```` ```ignore ```` fence; unlabelled
    fences are not counted.

    Args:
        module_path: Path of the ``.rs`` file to read.

    Returns:
        A ``ModuleStats`` whose ``total``, ``with_doc`` and ``with_example``
        are the counts described above, and whose ``items`` holds one
        ``"<stripped source line>:<1-based line number>"`` entry per item.
    """
    stats = ModuleStats()

    # Rust source is UTF-8 by definition; do not depend on the locale.
    lines = module_path.read_text(encoding="utf-8").split("\n")

    for index, line in enumerate(lines):
        stripped = line.strip()

        # A declaration quoted inside a comment or doc example is not an item.
        if stripped.startswith("//"):
            continue

        found = len(_PUB_ITEM_RE.findall(line))
        if not found:
            continue

        doc_lines = _doc_block_above(lines, index)
        has_doc = bool(doc_lines)
        has_example = any(
            fence in doc for doc in doc_lines for fence in _EXAMPLE_FENCES
        )

        # Several declarations on one line share the same doc block.
        stats.total += found
        stats.items.extend([f"{stripped}:{index + 1}"] * found)
        if has_doc:
            stats.with_doc += found
        if has_example:
            stats.with_example += found

    return stats
|
|
|
|
def _percent(part: int, whole: int) -> float:
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is 0."""
    return 100 * part / whole if whole else 0.0


def main(
    src_dir: Path = Path("/home/coding/pdftract/crates/pdftract-core/src"),
) -> None:
    """Print a rustdoc coverage report for pdftract-core.

    Walks every ``.rs`` file under *src_dir*, skipping ``main.rs``,
    ``lib.rs`` and anything whose path below *src_dir* contains ``tests``,
    then prints overall totals followed by the 15 modules with the most
    public items.

    Args:
        src_dir: Root of the crate's source tree. Defaults to the checkout
            location this script was written for.
    """
    print("Analyzing rustdoc coverage for pdftract-core")
    print("=" * 60)

    total_stats = ModuleStats()
    module_stats: Dict[str, ModuleStats] = {}

    for rs_file in sorted(src_dir.rglob("*.rs")):
        rel_path = rs_file.relative_to(src_dir)

        # Test only the part below src_dir, so a checkout that happens to
        # live under a directory containing "tests" is not skipped wholesale.
        if rs_file.name == "main.rs" or "tests" in rel_path.as_posix():
            continue
        if rel_path.as_posix() == "lib.rs":
            continue

        # Build "a::b::c" from path components instead of replacing "/",
        # which also works where the separator is not "/".
        module_name = "::".join(rel_path.with_suffix("").parts)
        stats = analyze_module(rs_file)

        if stats.total > 0:
            module_stats[module_name] = stats
            total_stats.total += stats.total
            total_stats.with_doc += stats.with_doc
            total_stats.with_example += stats.with_example

    # _percent guards the empty case, which used to raise ZeroDivisionError
    # when no public items were found.
    overall_doc = _percent(total_stats.with_doc, total_stats.total)
    overall_example = _percent(total_stats.with_example, total_stats.total)

    print("\nOverall Coverage:")
    print(f"  Total public items: {total_stats.total}")
    print(f"  With documentation: {total_stats.with_doc} ({overall_doc:.1f}%)")
    print(f"  With examples: {total_stats.with_example} ({overall_example:.1f}%)")
    print()

    print("Top modules by public items:")
    largest_first = sorted(
        module_stats.items(), key=lambda entry: entry[1].total, reverse=True
    )
    for name, stats in largest_first[:15]:
        doc_pct = _percent(stats.with_doc, stats.total)
        ex_pct = _percent(stats.with_example, stats.total)
        print(f"  {name:50s} items:{stats.total:3d} docs:{doc_pct:5.1f}% examples:{ex_pct:5.1f}%")
|
|
|
|
# Produce the report only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == "__main__":
    main()
|