# pdftract/scripts/rustdoc_coverage.py

#!/usr/bin/env python3
"""
Script to analyze rustdoc coverage in pdftract-core.

Measures:
- Total public items (pub fn, pub struct, pub enum, pub trait, pub type, pub mod)
- Public items with documentation
- Public items with worked examples (```rust blocks)
"""
import re
import subprocess
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List

@dataclass
class ModuleStats:
    """Rustdoc coverage counters for one source file, or an aggregate of files."""

    # Number of public item declarations found.
    total: int = 0
    # How many of those items have documentation.
    with_doc: int = 0
    # How many of those items have a worked example in their documentation.
    with_example: int = 0
    # One "<declaration line>:<line number>" entry per counted item.
    items: List[str] = field(default_factory=list)

    def __post_init__(self):
        # Earlier versions defaulted `items` to None; keep accepting an
        # explicit None so existing callers still end up with a fresh list.
        if self.items is None:
            self.items = []

def run_rg(pattern: str, path: Path, cwd: str = "/home/coding/pdftract") -> str:
    """Run ripgrep over Rust files and return whatever it wrote to stdout.

    Args:
        pattern: Regular expression handed to ripgrep.
        path: File or directory to search.
        cwd: Working directory for the ripgrep process; relative ``path``
            values are resolved against it.

    Returns:
        ripgrep's standard output, with line numbers and 10 lines of trailing
        context per match. The exit status is not inspected, so the result is
        an empty string both when nothing matched and when ripgrep failed.
    """
    result = subprocess.run(
        [
            "rg",
            "-n",
            "-A", "10",
            "--type", "rust",
            # Pass the pattern through -e and end option parsing with `--` so
            # a pattern or path starting with "-" is never read as a flag.
            "-e", pattern,
            "--",
            str(path),
        ],
        capture_output=True,
        text=True,
        cwd=cwd,
    )
    return result.stdout

# Declaration of a public Rust item: `pub`, any function qualifiers
# (`const`, `async`, `unsafe`, `extern "ABI"`), then the item keyword and its
# name. Restricted visibilities such as `pub(crate)` deliberately do not match.
_PUB_ITEM_RE = re.compile(
    r"\bpub\s+"
    r"(?:(?:const|async|unsafe)\s+|extern\s+(?:\"[^\"]*\"\s+)?)*"
    r"(?:fn|struct|enum|trait|type|mod)\s+\w+"
)

# Code-fence openers that mark a doc comment as containing a worked example.
_EXAMPLE_FENCES = ("```rust", "```ignore")


def _attached_doc_lines(lines: List[str], item_index: int) -> List[str]:
    """Return the ``///`` lines attached to the item declared at *item_index*.

    Walks upward from the line just above the declaration. Blank lines,
    single-line attributes (``#[...]``) and ordinary ``//`` comments may sit
    between a doc comment and its item, so they are skipped; the first line of
    real code ends the walk. Inner doc comments (``//!``) and ``////`` lines
    are not outer documentation and are treated as ordinary comments.

    Limitation: an attribute that spans several lines ends the walk early.
    """
    doc_lines: List[str] = []
    cursor = item_index - 1
    while cursor >= 0:
        text = lines[cursor].strip()
        if text.startswith("///") and not text.startswith("////"):
            doc_lines.append(text)
        elif text and not text.startswith(("#", "//")):
            # Real code: anything above belongs to an earlier item.
            break
        cursor -= 1
    doc_lines.reverse()
    return doc_lines


def analyze_module(module_path: Path) -> ModuleStats:
    """Analyze a single Rust source file for rustdoc coverage.

    Every line that declares a public ``fn``, ``struct``, ``enum``, ``trait``,
    ``type`` or ``mod`` counts as one item. An item is documented when ``///``
    lines are attached directly above it, and has an example when one of those
    lines opens a ```` ```rust ```` or ```` ```ignore ```` fence.

    This is a line-based heuristic, not a Rust parser: block comments and
    string literals are not understood.

    Args:
        module_path: Path of the ``.rs`` file to scan.

    Returns:
        A ModuleStats with the item, documentation and example counts, plus a
        "<declaration line>:<line number>" entry for every counted item.
    """
    stats = ModuleStats()
    # Rust source is always UTF-8; do not depend on the locale's encoding.
    lines = module_path.read_text(encoding="utf-8").split("\n")

    for index, line in enumerate(lines):
        # Drop any `//` comment first so declarations quoted in comments or in
        # doc-comment examples are not counted as real items.
        code = line.split("//", 1)[0]
        if not _PUB_ITEM_RE.search(code):
            continue

        stats.total += 1
        stats.items.append(f"{line.strip()}:{index + 1}")

        doc_lines = _attached_doc_lines(lines, index)
        if doc_lines:
            stats.with_doc += 1
        if any(fence in doc for doc in doc_lines for fence in _EXAMPLE_FENCES):
            stats.with_example += 1

    return stats

def _percent(part: int, whole: int) -> float:
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is zero."""
    return 100 * part / whole if whole > 0 else 0.0


def main():
    """Scan pdftract-core's sources and print a rustdoc coverage report.

    Every ``.rs`` file under the source directory is analyzed except
    ``lib.rs``, ``main.rs`` and anything whose path inside the source
    directory contains "tests". The report shows overall totals followed by
    the 15 modules with the most public items.
    """
    src_dir = Path("/home/coding/pdftract/crates/pdftract-core/src")

    print("Analyzing rustdoc coverage for pdftract-core")
    print("=" * 60)

    total_stats = ModuleStats()
    module_stats: Dict[str, ModuleStats] = {}

    # Analyze each module
    for rs_file in sorted(src_dir.rglob("*.rs")):
        rel_path = rs_file.relative_to(src_dir)
        rel_name = rel_path.as_posix()

        # Skip main.rs and test files. Only the path inside src_dir is
        # checked, so a checkout that lives under a directory whose name
        # happens to contain "tests" is not skipped wholesale.
        if "tests" in rel_name or rs_file.name == "main.rs":
            continue
        if rel_name == "lib.rs":
            continue

        # src/a/b.rs -> "a::b"
        module_name = "::".join(rel_path.with_suffix("").parts)
        stats = analyze_module(rs_file)

        if stats.total > 0:
            module_stats[module_name] = stats
            total_stats.total += stats.total
            total_stats.with_doc += stats.with_doc
            total_stats.with_example += stats.with_example

    # Print report. _percent() keeps the summary from dividing by zero when no
    # public items were found (for example when src_dir does not exist).
    overall_doc = _percent(total_stats.with_doc, total_stats.total)
    overall_example = _percent(total_stats.with_example, total_stats.total)
    print("\nOverall Coverage:")
    print(f"  Total public items: {total_stats.total}")
    print(f"  With documentation: {total_stats.with_doc} ({overall_doc:.1f}%)")
    print(f"  With examples: {total_stats.with_example} ({overall_example:.1f}%)")
    print()

    print("Top modules by public items:")
    sorted_modules = sorted(module_stats.items(), key=lambda x: x[1].total, reverse=True)[:15]
    for name, stats in sorted_modules:
        doc_pct = _percent(stats.with_doc, stats.total)
        ex_pct = _percent(stats.with_example, stats.total)
        print(f"  {name:50s} items:{stats.total:3d} docs:{doc_pct:5.1f}% examples:{ex_pct:5.1f}%")


if __name__ == "__main__":
    main()