# pdftract/scripts/doc_example_coverage.py

#!/usr/bin/env python3
"""
Measure rustdoc example coverage for pdftract-core.

Counts public items and determines how many have at least one worked example.
"""

import os
import re
import subprocess
import json
from pathlib import Path
from collections import defaultdict

# Patterns to detect public items. Each pattern captures the item's name in
# group 1. `pub(crate)` / `pub(super)` items are deliberately not matched:
# every pattern requires whitespace directly after `pub`.
PUBLIC_PATTERNS = {
    # Accept the qualifiers that may sit between `pub` and `fn`
    # (`const`, `async`, `unsafe`, `extern "ABI"`), so that e.g.
    # `pub const fn new()` is reported as the function `new`.
    'fn': re.compile(
        r'\bpub\s+(?:(?:const|async|unsafe)\s+)*'
        r'(?:extern\s+(?:"[^"]*"\s+)?)?fn\s+(\w+)'
    ),
    'struct': re.compile(r'\bpub\s+struct\s+(\w+)'),
    'enum': re.compile(r'\bpub\s+enum\s+(\w+)'),
    'trait': re.compile(r'\bpub\s+trait\s+(\w+)'),
    'mod': re.compile(r'\bpub\s+mod\s+(\w+)'),
    'type': re.compile(r'\bpub\s+type\s+(\w+)'),
    # Skip an optional `mut` after `static`, and refuse to treat the function
    # qualifiers of `pub const fn ...` as the name of a constant.
    'const': re.compile(
        r'\bpub\s+(?:const|static)\s+(?:mut\s+)?'
        r'(?!(?:fn|unsafe|async|extern|mut)\b)(\w+)'
    ),
    # Inherent impl blocks (`impl Foo {`, `impl<T> Foo<T> {`). Trait impls
    # (`impl Trait for Foo {`) do not match because of the `for` clause.
    # Anchored to the start of a line so that a return-position
    # `-> impl Iterator<...> {` is not mistaken for an impl block.
    'impl': re.compile(
        r'^[ \t]*(?:unsafe\s+)?impl(?:\s*<[^>]*>)?\s+(\w+)\s*(?:<[^>]*>)?\s*\{',
        re.MULTILINE,
    ),
}

# Pattern to detect doc code blocks: a fence explicitly tagged ```rust,
# followed by backtick-free text, followed by a closing fence.
EXAMPLE_PATTERN = re.compile(r'```rust[^`]*```', re.MULTILINE)
# Matches a single `///` or `//!` doc-comment line up to the end of the line.
DOC_COMMENT_PATTERN = re.compile(r'///[^\n]*|//![^\n]*')

# Declaration keyword(s) that introduce each scanned item type. Item types
# not listed here (e.g. 'impl') fall back to "any public declaration".
_DECL_KEYWORDS = {
    'fn': 'fn',
    'struct': 'struct',
    'enum': 'enum',
    'trait': 'trait',
    'mod': 'mod',
    'type': 'type',
    'const': 'const|static',
}
_ANY_DECL_KEYWORD = 'fn|struct|enum|trait|mod|type|const|static'

# Fence attributes that rustdoc still treats as Rust code.
_RUST_FENCE_ATTRS = frozenset({
    'rust', 'ignore', 'should_panic', 'no_run', 'compile_fail',
    'standalone_crate', 'test_harness',
})

# How many lines above a declaration are inspected for doc comments.
_MAX_DOC_LOOKBACK = 50


def _is_rust_fence(info):
    """Return True if a fence info string denotes a Rust code block."""
    tokens = [tok for tok in re.split(r'[,\s]+', info.strip()) if tok]
    if 'rust' in tokens:
        return True
    # An empty info string (bare ```) is Rust by rustdoc's default; so is one
    # made up solely of doctest attributes such as `no_run` or `edition2021`.
    return all(
        tok in _RUST_FENCE_ATTRS or tok.startswith(('edition', 'ignore-'))
        for tok in tokens
    )


def _docs_contain_rust_example(doc_lines):
    """Return True if the doc-comment lines hold a closed Rust code fence."""
    in_block = False
    block_is_rust = False
    for raw in doc_lines:
        text = raw[3:].strip()  # drop the leading `///` or `//!`
        if not text.startswith('```'):
            continue
        if in_block:
            if block_is_rust:
                return True
            in_block = False
        else:
            in_block = True
            block_is_rust = _is_rust_fence(text.lstrip('`'))
    return False


def _find_declaration_line(lines, item_name, item_type):
    """Return the index of the first line declaring the item, or None."""
    keyword = _DECL_KEYWORDS.get(item_type, _ANY_DECL_KEYWORD)
    declaration = re.compile(
        r'\bpub\s+'
        r'(?:(?:const|async|unsafe|extern(?:\s+"[^"]*")?)\s+)*'
        rf'(?:{keyword})\s+(?:mut\s+)?'
        rf'{re.escape(item_name)}\b'
    )
    for index, line in enumerate(lines):
        if declaration.search(line):
            return index
    return None


def has_item_with_examples(content, item_name, item_type):
    """Check if a public item has at least one worked example.

    The first `pub` declaration of ``item_name`` is located (matched as a
    whole identifier, so ``new`` does not resolve to ``new_thing``), then the
    doc comments directly above it are inspected for a fenced Rust code block.

    Args:
        content: Full text of the Rust source file.
        item_name: Identifier of the item to look up.
        item_type: Item kind ('fn', 'struct', 'enum', 'trait', 'mod', 'type',
            'const'); restricts which declaration keyword is accepted. Any
            other value (e.g. 'impl') accepts every public declaration kind.

    Returns:
        True if the item is found and its doc comments contain a closed Rust
        code fence (bare fences count, since rustdoc treats them as Rust);
        False otherwise, including when the item cannot be located.
    """
    lines = content.split('\n')
    item_line = _find_declaration_line(lines, item_name, item_type)
    if item_line is None:
        return False

    # Walk upwards from the declaration. The stop index is one *below* the
    # last line to inspect so that line 0 of the file is not skipped.
    stop = max(0, item_line - _MAX_DOC_LOOKBACK) - 1
    collected = []
    for i in range(item_line - 1, stop, -1):
        line = lines[i].strip()
        if line.startswith(('///', '//!')):
            collected.append(line)
        elif line and not line.startswith('//') and not line.startswith('#['):
            # Stop at the first line that is neither blank, a comment, nor
            # an attribute: it belongs to the preceding item.
            break

    collected.reverse()  # restore top-to-bottom order
    return _docs_contain_rust_example(collected)

def find_public_items_in_file(filepath):
    """Find all public items in a Rust source file.

    Args:
        filepath: Path (``pathlib.Path`` or ``str``) of the Rust file to scan.

    Returns:
        A ``(items, content)`` tuple. ``items`` is a list of
        ``(item_type, item_name, offset)`` tuples, grouped by the order of
        ``PUBLIC_PATTERNS`` and then by position in the file, where ``offset``
        is the character offset of the match. ``content`` is the file's text.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Rust source is UTF-8; decode explicitly rather than relying on the
    # platform's locale encoding.
    content = Path(filepath).read_text(encoding='utf-8')

    items = []
    for item_type, pattern in PUBLIC_PATTERNS.items():
        for match in pattern.finditer(content):
            item_name = match.group(1)
            # Names with a leading underscore are skipped.
            if item_name.startswith('_'):
                continue
            items.append((item_type, item_name, match.start()))

    return items, content

def scan_crate(src_path):
    """Scan the crate for public items and example coverage.

    Args:
        src_path: Directory that is searched recursively for ``*.rs`` files.

    Returns:
        A dict with the keys:
          * ``total_items``: number of public items found,
          * ``items_with_examples``: how many of them have an example,
          * ``by_type``: per item type ``{'total', 'with_examples'}`` counts,
          * ``files``: per file ``{'total', 'with_examples', 'items'}``, keyed
            by the file path relative to two directories above ``src_path``.
        Files without any public item are omitted from ``files``.
    """
    src_path = Path(src_path)
    results = {
        'total_items': 0,
        'items_with_examples': 0,
        'by_type': defaultdict(lambda: {'total': 0, 'with_examples': 0}),
        'files': {}
    }

    # Sorted so that the traversal order (and any error output) is stable.
    for rs_file in sorted(src_path.rglob('*.rs')):
        # Skip build scripts and anything inside a `tests` directory. Compare
        # path components below src_path instead of substrings of the full
        # path, so `rebuild.rs`, `contests/` or a parent directory that
        # happens to be called `tests` are not excluded by accident.
        parent_dirs = rs_file.relative_to(src_path).parts[:-1]
        if rs_file.name == 'build.rs' or 'tests' in parent_dirs:
            continue

        try:
            items, content = find_public_items_in_file(rs_file)
        except (OSError, UnicodeDecodeError) as e:
            # Best effort: report the unreadable file and keep scanning.
            print(f"Error processing {rs_file}: {e}", flush=True)
            continue

        if not items:
            continue

        # 'total' starts at zero because it is incremented once per item
        # below; seeding it with len(items) would count every item twice.
        file_results = {
            'total': 0,
            'with_examples': 0,
            'items': []
        }

        for item_type, item_name, _ in items:
            results['total_items'] += 1
            results['by_type'][item_type]['total'] += 1
            file_results['total'] += 1

            has_examples = has_item_with_examples(content, item_name, item_type)

            file_results['items'].append({
                'name': item_name,
                'type': item_type,
                'has_examples': has_examples
            })

            if has_examples:
                results['items_with_examples'] += 1
                results['by_type'][item_type]['with_examples'] += 1
                file_results['with_examples'] += 1

        results['files'][str(rs_file.relative_to(src_path.parent.parent))] = file_results

    return results

def main(argv=None):
    """Print the coverage report, write the JSON summary, return exit status.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``. With no arguments
            the historical source path, JSON path and target are used.

    Returns:
        0 if the overall coverage reaches the target, 1 otherwise.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description='Measure rustdoc example coverage for pdftract-core.'
    )
    parser.add_argument(
        'src', nargs='?', type=Path,
        default=Path('/home/coding/pdftract/crates/pdftract-core/src'),
        help='crate source directory to scan',
    )
    parser.add_argument(
        '--json-out', type=Path,
        default=Path('/tmp/doc_example_coverage.json'),
        help='where to write the machine-readable summary',
    )
    parser.add_argument(
        '--target', type=float, default=80.0,
        help='required coverage percentage',
    )
    args = parser.parse_args(argv)
    target = args.target

    if not args.src.is_dir():
        # An absent directory would otherwise look like "0 items, 0 percent"
        # with no hint about the cause. The report is still produced.
        print(f"Warning: source directory not found: {args.src}", file=sys.stderr)

    results = scan_crate(args.src)

    coverage = (results['items_with_examples'] / results['total_items'] * 100) if results['total_items'] > 0 else 0
    passed = coverage >= target

    print("=" * 60)
    print("Rustdoc Example Coverage Report for pdftract-core")
    print("=" * 60)
    print(f"\nTotal public items: {results['total_items']}")
    print(f"Items with examples: {results['items_with_examples']}")
    print(f"Coverage: {coverage:.1f}%")
    print(f"\nTarget: {target:g}%")
    print(f"Status: {'✓ PASS' if passed else '✗ FAIL'}")

    print("\n" + "=" * 60)
    print("Coverage by Type")
    print("=" * 60)
    for item_type, counts in sorted(results['by_type'].items()):
        total = counts['total']
        with_ex = counts['with_examples']
        cov = (with_ex / total * 100) if total > 0 else 0
        print(f"{item_type:12} {with_ex:4}/{total:4} ({cov:5.1f}%) {'✓' if cov >= target else '✗'}")

    # Show files that need work
    print("\n" + "=" * 60)
    print("Files Needing Examples (showing items without examples)")
    print("=" * 60)

    for file_path, file_results in sorted(results['files'].items()):
        file_cov = (file_results['with_examples'] / file_results['total'] * 100) if file_results['total'] > 0 else 0
        missing = [item for item in file_results['items'] if not item['has_examples']]
        if missing and file_cov < target:
            print(f"\n{file_path} ({file_cov:.0f}% coverage)")
            for item in sorted(missing, key=lambda x: (x['type'], x['name'])):
                print(f"  - {item['type']:8} {item['name']}")

    print("\n" + "=" * 60)

    # Output JSON for scripts
    output_json = {
        'coverage': coverage,
        'total_items': results['total_items'],
        'items_with_examples': results['items_with_examples'],
        'pass': passed
    }

    args.json_out.write_text(json.dumps(output_json, indent=2), encoding='utf-8')

    return 0 if passed else 1

if __name__ == '__main__':
    # Raise SystemExit directly: the bare `exit()` helper is injected by the
    # `site` module for interactive use and is absent under `python -S`.
    raise SystemExit(main())