pdftract/scripts/measure-doc-coverage.py

#!/usr/bin/env python3
"""
Measure rustdoc worked-example coverage for pdftract-core public API.

This script scans source files and counts:
1. Total public items (pub fn, pub struct, pub enum, pub trait, pub type, pub const, pub mod)
2. Public items with at least one ```rust example block in their doc comment

The coverage percentage is (items_with_examples / total_public_items) * 100.
Target: 80%+ coverage.
"""

import os
import re
from pathlib import Path
from dataclasses import dataclass
from typing import List, Set, Tuple

@dataclass
class DocCoverage:
    """Counters describing documentation coverage of a set of public items."""
    # Number of public items that were discovered.
    total_items: int = 0
    # Items for which a non-empty doc comment was found.
    items_with_docs: int = 0
    # Items whose doc comment contains an example block.
    items_with_examples: int = 0
    # Per-item-type counters; ``None`` is swapped for a fresh dict after init.
    items_by_type: dict = None

    def __post_init__(self):
        # ``dataclass`` rejects a literal ``{}`` default, so ``None`` acts as
        # the placeholder and every instance gets its own dictionary here.
        self.items_by_type = (
            {} if self.items_by_type is None else self.items_by_type
        )

    def coverage_pct(self) -> float:
        """Return the share of items with examples as a percentage.

        An instance without any items reports ``0.0`` instead of dividing
        by zero.
        """
        return (
            self.items_with_examples / self.total_items * 100
            if self.total_items != 0
            else 0.0
        )


# Ordered (item_type, pattern) table; the first pattern that matches a line
# wins.  ``fn`` must come before ``const`` so that ``pub const fn name`` is
# classified as a function instead of a constant called ``fn``.  Every
# pattern requires whitespace right after ``pub``, so restricted visibility
# such as ``pub(crate)`` is never counted as public API.
_PUBLIC_ITEM_PATTERNS = (
    ('fn', re.compile(
        r'\bpub\s+(?:const\s+)?(?:async\s+)?(?:unsafe\s+)?'
        r'(?:extern\s+(?:"[^"]*"\s+)?)?fn\s+(\w+)')),
    ('struct', re.compile(r'\bpub\s+struct\s+(\w+)')),
    ('enum', re.compile(r'\bpub\s+enum\s+(\w+)')),
    ('trait', re.compile(r'\bpub\s+(?:unsafe\s+)?trait\s+(\w+)')),
    ('type', re.compile(r'\bpub\s+type\s+(\w+)')),
    ('const', re.compile(r'\bpub\s+const\s+(\w+)')),
    ('mod', re.compile(r'\bpub\s+mod\s+(\w+)')),
)


def extract_public_items(content: str) -> List[Tuple[str, str, int]]:
    """
    Extract public items from Rust source content.

    The scan is line based.  A line contributes at most one item: the first
    entry of ``_PUBLIC_ITEM_PATTERNS`` that matches it.  Function qualifiers
    (``const``, ``async``, ``unsafe``, ``extern "ABI"``) and ``unsafe trait``
    are recognised.  ``pub use`` re-exports, struct fields and items with
    restricted visibility (``pub(crate)`` etc.) match no pattern and are
    therefore not reported.

    Lines starting with ``//`` and the interior of ``/* ... */`` block
    comments are ignored.  Only block comments that *start* a line are
    tracked; a comment opened after code on the same line is not detected.

    Args:
        content: Full text of a Rust source file.

    Returns:
        List of ``(item_type, name, line_number)`` tuples in file order,
        with 1-based line numbers.
    """
    items = []
    in_block_comment = False

    for line_number, line in enumerate(content.split('\n'), start=1):
        stripped = line.strip()

        if in_block_comment:
            # Still inside a multi-line comment: the line that carries the
            # terminator is skipped as a whole, like the opening line.
            if '*/' in stripped:
                in_block_comment = False
            continue

        if not stripped or stripped.startswith('//'):
            continue

        if stripped.startswith('/*'):
            # Look for the terminator after the opener so that "/*/" is not
            # mistaken for a comment that is already closed.
            in_block_comment = '*/' not in stripped[2:]
            continue

        for item_type, pattern in _PUBLIC_ITEM_PATTERNS:
            match = pattern.search(stripped)
            if match:
                items.append((item_type, match.group(1), line_number))
                break

    return items


def find_doc_comment_for_item(lines: List[str], item_line: int) -> str:
    """
    Find the doc comment for an item at the given line.

    Walks upwards from the line just above the item and gathers every
    ``///`` (and ``//!``) line until the first line of real code.  Leading
    indentation is ignored, so items nested in ``impl`` or ``mod`` blocks
    are handled like top-level ones.  Blank lines, ordinary ``//`` comments
    (including ``////``), lone ``*`` lines and single-line ``#[...]``
    attributes between the doc comment and the item are stepped over.

    Limitations: ``/** ... */`` block doc comments and attributes spanning
    several lines are not recognised.

    Args:
        lines: The source file split into lines.
        item_line: 1-based line number of the item declaration.

    Returns:
        The doc comment text with the three-character comment marker
        removed from each line, joined with newlines; ``''`` if none found.
    """
    # Collected bottom-up and reversed once at the end, which avoids the
    # quadratic cost of inserting at the front of a list.
    collected = []

    # ``item_line - 2`` is the 0-based index of the line above the item;
    # clamp it so an out-of-range line number cannot raise IndexError.
    index = min(item_line - 2, len(lines) - 1)

    while index >= 0:
        stripped = lines[index].strip()
        index -= 1

        if stripped.startswith('////'):
            # Four or more slashes form an ordinary comment, not a doc comment.
            continue
        # NOTE(review): rustdoc attaches ``//!`` to the *enclosing* item, not
        # to the item that follows; these lines are still collected to keep
        # the previously reported numbers - confirm this is intended.
        if stripped.startswith(('///', '//!')):
            collected.append(stripped[3:])
            continue
        if not stripped or stripped.startswith('//') or stripped == '*':
            continue
        if stripped.startswith('#['):
            # Attributes such as ``#[derive(...)]`` usually sit between the
            # doc comment and the item they both belong to.
            continue
        # Anything else is code: the doc comment block ends here.
        break

    collected.reverse()
    return '\n'.join(collected)


def has_rust_example(doc_comment: str) -> bool:
    """Report whether the doc comment text contains a ```rust fence marker."""
    # Plain substring search: fences with extra attributes such as
    # "```rust,no_run" match as well, an unlabelled "```" fence does not.
    return doc_comment.find('```rust') != -1


def measure_file_coverage(filepath: Path) -> DocCoverage:
    """Measure documentation coverage for a single Rust source file.

    Args:
        filepath: Path of the ``.rs`` file to analyse.

    Returns:
        A ``DocCoverage`` holding the number of public items found, how many
        of them have a doc comment, how many have an example, and the same
        example counts broken down by item type.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Rust source is UTF-8; decode explicitly instead of relying on the
    # locale's preferred encoding, which differs between machines.
    content = filepath.read_text(encoding='utf-8')
    lines = content.split('\n')
    items = extract_public_items(content)

    coverage = DocCoverage()
    coverage.total_items = len(items)

    for item_type, _item_name, item_line in items:
        # Every item counts towards its type's total, documented or not.
        type_counts = coverage.items_by_type.setdefault(
            item_type, {'total': 0, 'with_examples': 0}
        )
        type_counts['total'] += 1

        doc_comment = find_doc_comment_for_item(lines, item_line)
        if not doc_comment:
            continue

        coverage.items_with_docs += 1
        if has_rust_example(doc_comment):
            coverage.items_with_examples += 1
            type_counts['with_examples'] += 1

    return coverage


def main(argv=None):
    """Main entry point.

    Scans every ``.rs`` file below the source directory, prints per-file and
    aggregated example coverage, and compares the total against the 80%
    target.

    Args:
        argv: Optional list of command-line arguments (without the program
            name).  When omitted, ``sys.argv[1:]`` is used.  The first
            argument, if present, overrides the default source directory.

    Returns:
        ``0`` when total coverage meets the target; ``1`` when it does not
        or when the source directory does not exist.
    """
    # Imported locally because the file-level imports do not provide ``sys``.
    import sys

    cli_args = sys.argv[1:] if argv is None else list(argv)
    if cli_args:
        src_dir = Path(cli_args[0])
    else:
        src_dir = Path('/home/coding/pdftract/crates/pdftract-core/src')

    if not src_dir.exists():
        print(f"Error: Source directory not found: {src_dir}")
        return 1

    # Find all .rs files; sorted so the report order is deterministic.
    rs_files = sorted(src_dir.rglob('*.rs'))

    total_coverage = DocCoverage()

    print(f"Scanning {len(rs_files)} Rust source files in {src_dir}...")
    print()

    for filepath in rs_files:
        relative_path = filepath.relative_to(src_dir)
        coverage = measure_file_coverage(filepath)

        # Files without public items are neither reported nor aggregated.
        if coverage.total_items == 0:
            continue

        print(f"{relative_path}: {coverage.items_with_examples}/{coverage.total_items} items with examples ({coverage.coverage_pct():.1f}%)")
        total_coverage.total_items += coverage.total_items
        total_coverage.items_with_docs += coverage.items_with_docs
        total_coverage.items_with_examples += coverage.items_with_examples

        # Merge type counts
        for item_type, counts in coverage.items_by_type.items():
            merged = total_coverage.items_by_type.setdefault(
                item_type, {'total': 0, 'with_examples': 0}
            )
            merged['total'] += counts['total']
            merged['with_examples'] += counts['with_examples']

    # Guard the division: with no public items at all (empty directory, or
    # only private code) the unguarded expression raised ZeroDivisionError.
    total_items = total_coverage.total_items
    if total_items > 0:
        docs_pct = total_coverage.items_with_docs / total_items * 100
    else:
        docs_pct = 0.0

    print()
    print("=" * 60)
    print("TOTAL COVERAGE")
    print("=" * 60)
    print(f"Public items with doc comments: {total_coverage.items_with_docs}/{total_items} ({docs_pct:.1f}%)")
    print(f"Public items with examples: {total_coverage.items_with_examples}/{total_items} ({total_coverage.coverage_pct():.1f}%)")
    print()

    print("Breakdown by item type:")
    for item_type in sorted(total_coverage.items_by_type):
        counts = total_coverage.items_by_type[item_type]
        pct = (counts['with_examples'] / counts['total'] * 100) if counts['total'] > 0 else 0
        print(f"  {item_type:8s}: {counts['with_examples']:4d}/{counts['total']:4d} ({pct:5.1f}%)")

    print()
    target_pct = 80.0
    if total_coverage.coverage_pct() >= target_pct:
        print(f"✓ PASS: {total_coverage.coverage_pct():.1f}% >= {target_pct}% target")
        return 0

    print(f"✗ FAIL: {total_coverage.coverage_pct():.1f}% < {target_pct}% target (need {target_pct - total_coverage.coverage_pct():.1f}% more)")
    return 1


if __name__ == '__main__':
    # The bare ``exit`` builtin is installed by the ``site`` module for
    # interactive sessions and is absent under ``python -S``; raising
    # SystemExit propagates main()'s status code without depending on it.
    raise SystemExit(main())