# pdftract/crates/pdftract-core/scripts/doc_coverage.py

#!/usr/bin/env python3
"""Analyze rustdoc coverage for pdftract-core."""

import os
import re
from pathlib import Path
from collections import defaultdict

# Patterns for public API items.  Each pattern is matched against the start of
# a source line, so only declarations that begin in column 0 are recognised.
# Order matters: the first pattern that matches classifies the line, which is
# why 'function' is listed before 'const' (both can start with `pub const`).
PUB_PATTERNS = {
    # Accept any run of fn qualifiers so `pub const fn`, `pub unsafe fn` and
    # `pub extern "C" fn` are counted as functions.
    'function': re.compile(
        r'^pub\s+(?:(?:const|async|unsafe|extern(?:\s*"[^"]*")?)\s+)*fn\s+(\w+)'
    ),
    'struct': re.compile(r'^pub\s+struct\s+(\w+)'),
    'enum': re.compile(r'^pub\s+enum\s+(\w+)'),
    'trait': re.compile(r'^pub\s+(?:unsafe\s+)?trait\s+(\w+)'),
    'type': re.compile(r'^pub\s+type\s+(\w+)'),
    'module': re.compile(r'^pub\s+mod\s+(\w+)'),
    # Skip an optional `mut` (for `pub static mut X`) and never treat the
    # `fn` keyword of `pub const fn` as a constant's name.
    'const': re.compile(r'^pub\s+(?:const|static)\s+(?:mut\s+)?(?!fn\b)(\w+)'),
}

# Legacy pattern for an explicitly tagged ```rust block.  Kept so that code
# importing this name keeps working; example detection itself is done by
# _has_rust_example(), which also understands unannotated fences.
DOC_WITH_EXAMPLE = re.compile(r'```rust[^`]*```', re.DOTALL)

# Separators between the tags of a fence info string (e.g. "rust,no_run").
_FENCE_TAG_SPLIT = re.compile(r'[\s,]+')

# Fence tags that rustdoc accepts without treating the block as non-Rust.
_RUSTDOC_TAG = re.compile(
    r'rust|should_panic|no_run|ignore(?:-\S+)?|compile_fail|test_harness|'
    r'standalone_crate|edition\d{4}|E\d{4}'
)


def _is_rust_fence(info: str) -> bool:
    """Return True if a fence info string denotes a Rust code block.

    An empty info string counts as Rust, as does any string that contains the
    `rust` tag or consists solely of rustdoc's own test attributes.  Any other
    tag (``text``, ``toml``, ``sh``, ...) marks the block as non-Rust.
    """
    tags = [tag for tag in _FENCE_TAG_SPLIT.split(info) if tag]
    return 'rust' in tags or all(_RUSTDOC_TAG.fullmatch(tag) for tag in tags)


def _has_rust_example(doc_lines: list) -> bool:
    """Return True if the doc text contains a closed Rust code fence.

    `doc_lines` are doc-comment lines with the leading `///` already removed,
    in source order.
    """
    in_block = False
    block_is_rust = False
    for raw in doc_lines:
        text = raw.strip()
        if not text.startswith('```'):
            continue
        if in_block:
            # This fence closes the current block.
            if block_is_rust:
                return True
            in_block = False
        else:
            in_block = True
            block_is_rust = _is_rust_fence(text.lstrip('`'))
    return False


def _outer_doc_lines(lines: list, item_index: int) -> list:
    """Collect the `///` doc lines attached to the item at `item_index`.

    Walks upwards from the item, skipping blank lines and single-line
    attributes (`#[...]`), and stops at the first line that is anything else.
    `//!` lines stop the walk: they document the enclosing module, not the
    item that follows them.  Multi-line attributes are not recognised and end
    the walk as well.
    """
    collected = []
    j = item_index - 1
    while j >= 0:
        text = lines[j].strip()
        if text.startswith('///'):
            collected.append(text[3:])
        elif text and not text.startswith('#['):
            break
        j -= 1
    collected.reverse()
    return collected


def count_items_and_examples(content: str) -> dict:
    """Count public items in Rust source text and those with doc examples.

    Args:
        content: The full text of one Rust source file.

    Returns:
        A dict keyed by item type (the keys of PUB_PATTERNS that occurred),
        each mapping to ``{'total': int, 'with_examples': int}``.  Item types
        with no occurrences are absent; empty input yields ``{}``.
    """
    counts = defaultdict(lambda: {'total': 0, 'with_examples': 0})
    lines = content.splitlines()

    for index, line in enumerate(lines):
        for item_type, pattern in PUB_PATTERNS.items():
            if not pattern.match(line):
                continue
            counts[item_type]['total'] += 1
            if _has_rust_example(_outer_doc_lines(lines, index)):
                counts[item_type]['with_examples'] += 1
            # A line declares at most one item; first matching pattern wins.
            break

    return dict(counts)

def main():
    """Print an example-coverage report for the crate and exit accordingly.

    Scans every ``*.rs`` file under ``crates/pdftract-core/src`` (relative to
    the current working directory), aggregates the per-file counts returned by
    count_items_and_examples(), and prints totals, a per-item-type breakdown
    and a summary of files lacking module-level (``//!``) docs.

    Exits with status 0 when at least 80% of public items have an example,
    and with status 1 otherwise or when the source directory is missing.
    """
    # Local import keeps this function self-contained; sys.exit is used
    # instead of the site-provided exit(), which is not always available.
    import sys

    src_dir = Path('crates/pdftract-core/src')
    if not src_dir.is_dir():
        # Without this check a wrong working directory would be reported as
        # "Coverage 0.0%", which hides the real problem.
        print(f"ERROR: source directory not found: {src_dir}", file=sys.stderr)
        sys.exit(1)

    total_counts = defaultdict(lambda: {'total': 0, 'with_examples': 0})
    module_docs = []

    # Sorted so the report is identical from run to run.
    for rs_file in sorted(src_dir.rglob('*.rs')):
        # Rust source files are UTF-8; do not depend on the platform default.
        content = rs_file.read_text(encoding='utf-8')
        counts = count_items_and_examples(content)

        for item_type, counts_data in counts.items():
            for key in ('total', 'with_examples'):
                total_counts[item_type][key] += counts_data[key]

        # Track files that declare submodules or are module roots.
        if 'pub mod' in content or rs_file.name in ('mod.rs', 'lib.rs'):
            # Heuristic: module docs are expected within the first 500 chars.
            has_module_doc = '//!' in content[:500]
            module_name = rs_file.relative_to(src_dir)
            module_docs.append((str(module_name), has_module_doc))

    # Print results
    print("=" * 60)
    print("PDFTRACT-CORE RUSTDOC COVERAGE REPORT")
    print("=" * 60)
    print()

    total_items = sum(data['total'] for data in total_counts.values())
    total_with_examples = sum(data['with_examples'] for data in total_counts.values())
    coverage = (total_with_examples / total_items * 100) if total_items > 0 else 0

    print(f"Total public items: {total_items}")
    print(f"With examples: {total_with_examples}")
    print(f"Coverage: {coverage:.1f}%")
    print()

    print("By item type:")
    for item_type in ['function', 'struct', 'enum', 'trait', 'type', 'module', 'const']:
        if item_type in total_counts:
            data = total_counts[item_type]
            pct = (data['with_examples'] / data['total'] * 100) if data['total'] > 0 else 0
            print(f"  {item_type:10s}: {data['with_examples']:3d}/{data['total']:3d} ({pct:5.1f}%)")

    print()
    print("Modules with/without module-level docs (//!):")
    modules_without_doc = [name for name, has_doc in module_docs if not has_doc]
    print(f"  Modules checked: {len(module_docs)}")
    print(f"  Without module docs: {len(modules_without_doc)}")

    # Always show a sample of offenders; previously the list was suppressed
    # entirely once more than 20 modules were missing docs.
    if modules_without_doc:
        print("  Examples needing module docs:")
        for name in modules_without_doc[:10]:
            print(f"    - {name}")
        remaining = len(modules_without_doc) - 10
        if remaining > 0:
            print(f"    ... and {remaining} more")

    print()
    print("=" * 60)

    # Exit with error if coverage < 80%
    if coverage < 80:
        print(f"ERROR: Coverage {coverage:.1f}% is below 80% threshold")
        sys.exit(1)
    else:
        print(f"SUCCESS: Coverage {coverage:.1f}% meets 80% threshold")
        sys.exit(0)

if __name__ == '__main__':
    main()