pdftract/scripts/doc_coverage.py

#!/usr/bin/env python3
"""
Measure rustdoc coverage for pdftract-core.

This script counts:
- Total public items (top-level pub fn/struct/enum/trait/type/const/mod)
- Items with /// doc comments (excluding module-level //!)
- Items with worked examples (```rust blocks)

Usage:
    python3 scripts/doc_coverage.py
"""
import re
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple

# Matches a `pub` item declared at column 0 and captures its kind and name.
# Indented items (e.g. methods inside `impl` blocks) and restricted visibility
# such as `pub(crate)` are intentionally not matched by this pattern.
PUBLIC_ITEM_RE = re.compile(r'^pub (fn|struct|enum|trait|type|const|mod)\s+(\w+)')
# Matches the start of a `///` line; kept as part of the module's public names.
DOC_COMMENT_RE = re.compile(r'^///')
# Matches a fenced code block explicitly tagged `rust` inside joined doc text.
EXAMPLE_RE = re.compile(r'```rust[^`]*```', re.MULTILINE)


def _is_outer_doc_line(line: str) -> bool:
    """Return True for a column-0 `///` doc line (but not a `////` divider)."""
    # In Rust only *exactly* three slashes start a doc comment; four or more
    # slashes are an ordinary comment.
    return line.startswith('///') and not line.startswith('////')


def _preceding_doc_lines(lines: List[str], item_index: int) -> List[str]:
    """Return the `///` lines attached to the item at *item_index*.

    Walks upwards from the item, skipping blank lines, module-level `//!`
    lines and single-line `#[...]` attributes, and stops at the first line
    that is anything else. The result is returned in source (top-down) order
    so that fenced examples can be matched as they appear in the file.
    """
    collected: List[str] = []
    j = item_index - 1
    while j >= 0:
        candidate = lines[j]
        if _is_outer_doc_line(candidate):
            collected.append(candidate)
        elif not (candidate.strip() == '' or candidate.startswith(('//!', '#['))):
            break
        j -= 1
    # Lines were gathered bottom-up; restore the order they have in the file.
    collected.reverse()
    return collected


def count_public_items(filepath: Path) -> Tuple[int, int, int]:
    """Count public items, documented items, and items with examples in a file.

    Args:
        filepath: Path to a Rust source file; it is read as UTF-8.

    Returns:
        A ``(total_items, with_doc, with_example)`` tuple where ``total_items``
        is the number of lines matching ``PUBLIC_ITEM_RE``, ``with_doc`` is how
        many of those are preceded by at least one ``///`` line, and
        ``with_example`` is how many of the documented ones contain a block
        matching ``EXAMPLE_RE``.

    Only ``///`` line comments are recognised as documentation; ``/** */``
    blocks and ``#[doc = "..."]`` attributes are not counted.
    """
    lines = filepath.read_text(encoding='utf-8').splitlines()

    total_items = 0
    with_doc = 0
    with_example = 0

    for index, line in enumerate(lines):
        if not PUBLIC_ITEM_RE.match(line):
            continue
        total_items += 1

        # Outer doc comments always precede their item, so only look upwards;
        # looking below the item would pick up the docs of the *next* item.
        doc_lines = _preceding_doc_lines(lines, index)
        if not doc_lines:
            continue

        with_doc += 1
        if EXAMPLE_RE.search('\n'.join(doc_lines)):
            with_example += 1

    return total_items, with_doc, with_example


def main():
    core_src = Path('/home/coding/pdftract/crates/pdftract-core/src')

    total_items = 0
    total_with_doc = 0
    total_with_example = 0

    file_counts: Dict[str, Tuple[int, int, int]] = {}

    for rs_file in core_src.rglob('*.rs'):
        if 'parser/primitives' in str(rs_file):
            continue  # Skip generated files

        items, docs, examples = count_public_items(rs_file)
        if items > 0:
            file_counts[str(rs_file.relative_to(core_src))] = (items, docs, examples)
            total_items += items
            total_with_doc += docs
            total_with_example += examples

    print(f"pdftract-core Documentation Coverage")
    print(f"=" * 60)
    print(f"Total public items: {total_items}")
    print(f"Items with doc comments: {total_with_doc} ({100 * total_with_doc / total_items:.1f}%)")
    print(f"Items with worked examples: {total_with_example} ({100 * total_with_example / total_items:.1f}%)")
    print()

    # Top 20 files by public item count
    print("Top 20 files needing documentation:")
    sorted_files = sorted(
        file_counts.items(),
        key=lambda x: (x[1][0] - x[1][1], x[1][0]),  # Sort by undocumented count, then total
        reverse=True
    )
    for rel_path, (items, docs, examples) in sorted_files[:20]:
        coverage = 100 * docs / items if items > 0 else 0
        print(f"  {coverage:5.1f}% ({items:3d} items, {docs:3d} docs, {examples:3d} examples) {rel_path}")


if __name__ == '__main__':
    main()