pdftract/scripts/doc_coverage.py

#!/usr/bin/env python3
"""Measure rustdoc coverage for pdftract-core public API."""

import os
import re
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple

# Names that are never reported as items: Rust keywords plus common
# std/prelude identifiers.  A `pub` declaration whose captured name is in this
# set is skipped (e.g. `pub use crate::foo::Bar;` captures `crate`).
RUST_KEYWORDS = {
    'where', 'let', 'mut', 'if', 'else', 'for', 'while', 'loop', 'match',
    'return', 'break', 'continue', 'impl', 'struct', 'enum', 'trait',
    'type', 'fn', 'const', 'static', 'mod', 'use', 'crate', 'super',
    'self', 'Self', 'extern', 'unsafe', 'async', 'await', 'move',
    'ref', 'True', 'False', 'Some', 'None', 'Ok', 'Err', 'Vec',
    'String', 'Box', 'Result', 'Option', 'u8', 'u16', 'u32', 'u64',
    'i8', 'i16', 'i32', 'i64', 'f32', 'f64', 'bool', 'usize', 'isize'
}

# `pub <qualifiers> <kind> <name>`.  The optional qualifiers let
# `pub async fn`, `pub unsafe fn`, `pub const fn`, `pub extern "C" fn` and
# `pub unsafe trait` resolve to their real kind and name; `pub const NAME`
# still matches as kind `const` through backtracking.
_PUB_ITEM_RE = re.compile(
    r'pub\s+(?:(?:const|async|unsafe|extern(?:\s+"[^"]*")?)\s+)*'
    r'(fn|struct|enum|trait|type|const|mod|use)\s+(\w+)'
)

# Fallback for `pub use` paths that do not start with a plain identifier
# (e.g. `pub use ::foo::Bar;`).
_PUB_USE_RE = re.compile(r'pub use\s+(.+?);')

# Fence info-string tokens that still leave a doc code block as Rust code.
_RUST_FENCE_TOKEN_RE = re.compile(
    r'^(?:rust|no_run|should_panic|compile_fail|standalone_crate|test_harness'
    r'|ignore(?:-\S+)?|edition\d{4}|E\d{4})$'
)


def _doc_has_rust_example(doc_lines: List[str]) -> bool:
    """Return True if the `///` lines contain at least one Rust code block.

    rustdoc treats a fenced block as Rust when its info string is empty or
    consists only of Rust attributes (`rust`, `no_run`, `ignore`, ...); a
    block tagged with anything else (e.g. `text`) is not an example.
    """
    inside_block = False
    for raw in doc_lines:
        text = raw[3:].strip()  # drop the leading '///'
        if not text.startswith('```'):
            continue
        if inside_block:
            # This fence closes the block opened earlier.
            inside_block = False
            continue
        inside_block = True
        info = text.lstrip('`').strip()
        tokens = [tok for tok in re.split(r'[,\s]+', info) if tok]
        if all(_RUST_FENCE_TOKEN_RE.match(tok) for tok in tokens):
            return True
    return False


def extract_items_from_file(filepath: Path) -> List[Tuple[str, str, int, bool]]:
    """Extract public items from a Rust source file.

    The file is scanned line by line.  Consecutive `///` lines are collected
    and attached to the next line that starts with `pub ` and declares a
    recognised item; attribute lines (`#[...]`) and blank lines in between
    do not break that association.

    Args:
        filepath: Path of the Rust source file (read as UTF-8).

    Returns:
        List of (name, kind, line_number, has_example) tuples, in file order.
        `line_number` is 1-based; `has_example` is True when the attached doc
        comment contains a Rust code block.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    items = []
    pending_doc: List[str] = []  # `///` lines waiting for the next item

    for line_no, line in enumerate(content.split('\n'), 1):
        stripped = line.strip()

        # Blank lines neither extend nor discard the pending doc comment.
        if not stripped:
            continue

        # Exactly three slashes is a doc comment; `////...` is a plain comment.
        if stripped.startswith('///') and not stripped.startswith('////'):
            pending_doc.append(stripped)
            continue

        # Any other comment (`//`, `//!`, `////`) detaches the pending doc.
        if stripped.startswith('//'):
            pending_doc = []
            continue

        # Attribute lines (cfg, derive, etc.) sit between doc and item.
        if stripped.startswith('#['):
            continue

        if not stripped.startswith('pub '):
            # Anything else ends the doc comment, except lines starting with
            # '#' or 'use', which are passed over without resetting.
            if not stripped.startswith(('#', 'use')):
                pending_doc = []
            continue

        # A `pub` line always consumes the pending doc, whether or not it is
        # a recognised item, so docs on e.g. struct fields cannot leak onto
        # a later declaration.
        doc_lines, pending_doc = pending_doc, []

        item_match = _PUB_ITEM_RE.search(stripped)
        if item_match:
            kind = item_match.group(1)
            item_name = item_match.group(2)
        else:
            use_match = _PUB_USE_RE.search(stripped)
            if not use_match:
                continue
            item_name = use_match.group(1).split('::')[-1].rstrip(';')
            kind = 'use'

        # Skip names that are keywords/prelude identifiers (see RUST_KEYWORDS).
        if item_name in RUST_KEYWORDS:
            continue

        items.append((item_name, kind, line_no, _doc_has_rust_example(doc_lines)))

    return items


def scan_directory(src_dir: Path) -> Dict[str, List[Tuple[str, str, int, bool]]]:
    """Scan all Rust files below a directory and collect their public items.

    Files are skipped when their name contains `tests.rs` or `test_`, or when
    any component of their path *relative to src_dir* starts with `test` or
    equals `benches`.  Only the relative path is inspected so that the
    location of the checkout itself (e.g. under `/tmp/test_run/`) cannot
    cause every file to be skipped.

    Args:
        src_dir: Root directory to search recursively for `*.rs` files.

    Returns:
        Mapping from module path (file path relative to `src_dir`, without
        the `.rs` suffix) to the items returned by `extract_items_from_file`.
        Files that yield no items are omitted.
    """
    all_items = {}

    # Sorted so the scan order (and dict insertion order) is deterministic.
    for rust_file in sorted(src_dir.rglob('*.rs')):
        # Skip test files
        if 'tests.rs' in rust_file.name or 'test_' in rust_file.name:
            continue

        relative = rust_file.relative_to(src_dir)

        # Skip test/bench modules, judged by path components inside src_dir only
        if any(part.startswith('test') or part == 'benches' for part in relative.parts):
            continue

        module_path = str(relative.with_suffix(''))

        items = extract_items_from_file(rust_file)
        if items:
            all_items[module_path] = items

    return all_items


def print_report(all_items: Dict[str, List[Tuple[str, str, int, bool]]]) -> None:
    """Print the example-coverage report to stdout.

    For each module (in sorted order) prints the number of items that have an
    example and up to ten names of fn/struct/enum/trait/type items that lack
    one, followed by overall totals, a per-kind breakdown and a PASS/FAIL
    verdict against the 80% threshold.

    Args:
        all_items: Mapping from module path to a list of
            (name, kind, line_number, has_example) tuples.
    """
    total = 0
    with_examples = 0
    by_kind = defaultdict(lambda: [0, 0])  # kind -> [total, with_examples]

    print("=" * 80)
    print("RUSTDOC COVERAGE REPORT")
    print("=" * 80)

    for module_path in sorted(all_items):
        items = all_items[module_path]
        if not items:
            continue

        module_total = len(items)
        module_with = sum(1 for _, _, _, has_ex in items if has_ex)
        module_pct = (module_with / module_total * 100) if module_total else 0

        print(f"\n{module_path}:")
        print(f"  {module_with}/{module_total} items with examples ({module_pct:.1f}%)")

        # List missing examples (only for the kinds where examples are expected)
        missing = [
            name
            for name, kind, _, has_ex in items
            if not has_ex and kind in ('fn', 'struct', 'enum', 'trait', 'type')
        ]
        if missing:
            print(f"  Missing examples: {', '.join(missing[:10])}", end='')
            if len(missing) > 10:
                print(f" ... and {len(missing) - 10} more")
            else:
                print()

        total += module_total
        with_examples += module_with

        for _, kind, _, has_ex in items:
            by_kind[kind][0] += 1
            if has_ex:
                by_kind[kind][1] += 1

    overall_pct = (with_examples / total * 100) if total else 0
    print("\n" + "=" * 80)
    print(f"OVERALL: {with_examples}/{total} items with examples ({overall_pct:.1f}%)")
    print("=" * 80)

    print("\nBy kind:")
    for kind in sorted(by_kind):
        t, w = by_kind[kind]
        pct = (w / t * 100) if t else 0
        print(f"  {kind:10s}: {w:4d}/{t:4d} ({pct:5.1f}%)")

    # Threshold check
    print("\n" + "=" * 80)
    if overall_pct >= 80:
        print("PASS: Meets 80% threshold")
    else:
        # Integer ceiling of 80% of total: truncating the float difference
        # would under-report the gap (e.g. 8/11 needs 1 more, not 0).
        needed = (4 * total + 4) // 5 - with_examples
        print(f"FAIL: Below 80% threshold (need {needed} more examples)")
    print("=" * 80)


if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the default source
    # directory, so the script can run against any checkout location.
    default_src = '/home/coding/pdftract/crates/pdftract-core/src'
    src_dir = Path(sys.argv[1] if len(sys.argv) > 1 else default_src)

    # Fail loudly instead of printing an empty report for a missing directory.
    if not src_dir.is_dir():
        sys.exit(f"error: source directory not found: {src_dir}")

    all_items = scan_directory(src_dir)
    print_report(all_items)