pdftract/crates/pdftract-core/scripts/measure-doc-coverage.py

#!/usr/bin/env python3
"""
Measure rustdoc coverage for pdftract-core.

Counts:
- Total public items (pub fn/struct/enum/trait/type/const/mod)
- Items with doc comments (/// or //!)
- Items with worked examples (```rust code blocks)

Usage: python3 scripts/measure-doc-coverage.py
"""

import os
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Simple Rust parser for extracting public items
def extract_public_items(file_path: Path) -> List[Tuple[str, str, str, List[str]]]:
    """
    Extract public items from a Rust source file.

    Returns: List of (item_type, name, doc_comment, location)
    """
    items = []
    content = file_path.read_text()
    lines = content.split('\n')

    # Track preceding doc comments
    doc_comment = []

    for i, line in enumerate(lines, 1):
        stripped = line.strip()

        # Collect doc comments
        if stripped.startswith('///') or stripped.startswith('//!'):
            doc_comment.append(stripped)
            continue
        elif doc_comment and (stripped.startswith('//') or stripped == ''):
            # Allow blank lines and regular comments within doc blocks
            continue
        elif not stripped or stripped.startswith('//') or stripped.startswith('#'):
            # Reset if we hit a blank line without a pub item
            if not stripped.startswith('#'):
                doc_comment = []
            continue

        # Check for public items
        if stripped.startswith('pub '):
            # Parse the item
            item_type = None
            name = None

            if 'pub fn ' in stripped:
                item_type = 'fn'
                match = re.search(r'pub\s+fn\s+(\w+)', stripped)
                if match:
                    name = match.group(1)
            elif 'pub struct ' in stripped:
                item_type = 'struct'
                match = re.search(r'pub\s+struct\s+(\w+)', stripped)
                if match:
                    name = match.group(1)
            elif 'pub enum ' in stripped:
                item_type = 'enum'
                match = re.search(r'pub\s+enum\s+(\w+)', stripped)
                if match:
                    name = match.group(1)
            elif 'pub trait ' in stripped:
                item_type = 'trait'
                match = re.search(r'pub\s+trait\s+(\w+)', stripped)
                if match:
                    name = match.group(1)
            elif 'pub type ' in stripped:
                item_type = 'type'
                match = re.search(r'pub\s+type\s+(\w+)', stripped)
                if match:
                    name = match.group(1)
            elif 'pub const ' in stripped:
                item_type = 'const'
                match = re.search(r'pub\s+const\s+(\w+)', stripped)
                if match:
                    name = match.group(1)
            elif 'pub mod ' in stripped:
                item_type = 'mod'
                match = re.search(r'pub\s+mod\s+(\w+)', stripped)
                if match:
                    name = match.group(1)
            elif 'pub use ' in stripped:
                # Skip re-exports for now (they inherit docs from the original)
                doc_comment = []
                continue

            if name:
                items.append((
                    item_type,
                    name,
                    '\n'.join(doc_comment),
                    f"{file_path.relative_to('/home/coding/pdftract/crates/pdftract-core/src')}:{i}"
                ))

        doc_comment = []

    return items


def has_worked_example(doc: str) -> bool:
    """Check if doc comment contains a worked example (```rust block)."""
    if not doc:
        return False
    return '```rust' in doc or '```rust,no_run' in doc or '```rust,ignore' in doc


def measure_coverage(src_dir: Path) -> Dict:
    """Measure documentation coverage across all source files."""
    results = {
        'total_items': 0,
        'with_docs': 0,
        'with_examples': 0,
        'by_type': {},
        'items_missing_examples': [],
    }

    for rs_file in src_dir.rglob('*.rs'):
        # Skip tests directory
        if 'tests' in str(rs_file):
            continue

        items = extract_public_items(rs_file)

        for item_type, name, doc, location in items:
            results['total_items'] += 1

            if item_type not in results['by_type']:
                results['by_type'][item_type] = {
                    'total': 0,
                    'with_docs': 0,
                    'with_examples': 0,
                }

            results['by_type'][item_type]['total'] += 1

            if doc:
                results['with_docs'] += 1
                results['by_type'][item_type]['with_docs'] += 1

            if has_worked_example(doc):
                results['with_examples'] += 1
                results['by_type'][item_type]['with_examples'] += 1
            else:
                results['items_missing_examples'].append((item_type, name, location))

    return results


def main():
    src_dir = Path('/home/coding/pdftract/crates/pdftract-core/src')
    results = measure_coverage(src_dir)

    total = results['total_items']
    with_docs = results['with_docs']
    with_examples = results['with_examples']

    doc_coverage = (with_docs / total * 100) if total > 0 else 0
    example_coverage = (with_examples / total * 100) if total > 0 else 0

    print(f"=== Rustdoc Coverage Report for pdftract-core ===\n")
    print(f"Total public items: {total}")
    print(f"With documentation: {with_docs} ({doc_coverage:.1f}%)")
    print(f"With worked examples: {with_examples} ({example_coverage:.1f}%)")
    print()

    print("By item type:")
    for item_type, stats in sorted(results['by_type'].items()):
        t_total = stats['total']
        t_docs = stats['with_docs']
        t_examples = stats['with_examples']
        t_doc_cov = (t_docs / t_total * 100) if t_total > 0 else 0
        t_ex_cov = (t_examples / t_total * 100) if t_total > 0 else 0
        print(f"  {item_type:8s}: {t_examples:3d}/{t_total:3d} with examples ({t_ex_cov:.0f}%)")

    print()

    if example_coverage < 80.0:
        print(f"⚠️  Target: 80% coverage. Current: {example_coverage:.1f}%")
        print(f"    Need {int(total * 0.8 - with_examples)} more examples.\n")

        # Show first 20 items missing examples
        missing = results['items_missing_examples'][:20]
        print(f"First 20 items missing examples (showing {len(missing)} of {len(results['items_missing_examples'])}):")
        for item_type, name, location in missing:
            print(f"  - {item_type:8s} {name:30s} ({location})")

        if len(results['items_missing_examples']) > 20:
            print(f"  ... and {len(results['items_missing_examples']) - 20} more")
    else:
        print(f"✅ Target met: {example_coverage:.1f}% >= 80%")


# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()