# pdftract/audit_docs.py

#!/usr/bin/env python3
"""
Audit script to find public items in pdftract-core that are missing documentation.
"""
import re
import subprocess
from pathlib import Path
from collections import defaultdict

# (regex, item type) pairs used to spot public Rust items on a single line.
# Group 1 of every regex is the item's name. Only bare `pub` is matched;
# restricted visibility such as `pub(crate)` is intentionally not counted.
PUBLIC_PATTERNS = [
    # Functions may carry qualifiers between `pub` and `fn`
    # (e.g. `pub async fn`, `pub const unsafe fn`, `pub unsafe extern "C" fn`).
    (r'pub (?:(?:const|async|unsafe|extern(?: "[^"]*")?) )*fn (\w+)', 'function'),
    (r'pub struct (\w+)', 'struct'),
    (r'pub enum (\w+)', 'enum'),
    (r'pub (?:unsafe )?trait (\w+)', 'trait'),
    (r'pub type (\w+)', 'type'),
    # `pub const fn ...` is a function, not a constant: reject a qualifier or
    # `fn` keyword right after `const` so the keyword is never taken as a name.
    (r'pub const (?!(?:fn|async|unsafe|extern)\b)(\w+)', 'const'),
    (r'pub mod (\w+)', 'module'),
    # Statics keep the historical 'other' label; skip an optional `mut` so the
    # captured name is the static itself.
    (r'pub static (?:mut )?(\w+)', 'other'),
]

def has_doc_comment(lines, line_idx):
    """Return True if the item at ``lines[line_idx]`` has outer documentation.

    Walks upward from the line just above the item, skipping blank lines,
    ordinary ``//`` comments and attribute lines (anything starting with
    ``#``), including attributes that rustfmt spread over several lines.

    Counted as documentation:
      * ``///`` line doc comments (but not ``////``, which Rust treats as a
        plain comment);
      * ``#[doc = "..."]`` attributes.

    ``//!`` inner doc comments are treated as ordinary comments because they
    document the enclosing module, not the item that follows them. Block doc
    comments (``/** ... */``) are not recognised.

    Args:
        lines: The file's contents as a list of lines.
        line_idx: Zero-based index of the line that declares the item.

    Returns:
        True when documentation is found before any other code line.
    """
    i = line_idx - 1
    while i >= 0:
        text = lines[i].strip()
        i -= 1

        if text.startswith('///') and not text.startswith('////'):
            return True
        if re.match(r'#\[\s*doc\s*=', text):
            return True
        if not text or text.startswith('//') or text.startswith('#'):
            continue

        if text.endswith(')]'):
            # Probable tail of a multi-line attribute such as
            #   #[derive(
            #       Debug, Clone,
            #   )]
            # Rewind to its opening `#[` line; the outer loop then inspects
            # that line like any other attribute.
            while i >= 0:
                opener = lines[i].strip()
                if opener.startswith('#['):
                    break
                if not opener or opener.endswith((';', '{', '}')):
                    # Ran into real code: this was not an attribute tail.
                    return False
                i -= 1
            continue

        # Any other code line ends the search.
        return False
    return False

def audit_file(filepath, src_root=None):
    """Audit a single Rust file for public items and their documentation.

    Each line is matched against every entry of ``PUBLIC_PATTERNS``; every
    match yields one record. Lines that are themselves comments (``//``,
    ``///``, ``//!``) are skipped so commented-out code and doc examples are
    not mistaken for real declarations.

    Args:
        filepath: ``pathlib.Path`` of the ``.rs`` file to scan (read as UTF-8).
        src_root: Optional directory that reported paths are made relative
            to. Defaults to the pdftract-core source tree. When ``filepath``
            is not inside this directory the path is reported unchanged
            instead of raising ``ValueError``.

    Returns:
        A list of dicts with keys ``name``, ``type``, ``has_docs``, ``line``
        (1-based) and ``file``.
    """
    if src_root is None:
        src_root = '/home/coding/pdftract/crates/pdftract-core/src'
    # Resolve the display path once per file rather than once per item.
    try:
        display_path = str(filepath.relative_to(Path(src_root)))
    except ValueError:
        display_path = str(filepath)

    items = []
    lines = filepath.read_text(encoding='utf-8').split('\n')

    for line_idx, line in enumerate(lines):
        if line.lstrip().startswith('//'):
            continue
        for pattern, item_type in PUBLIC_PATTERNS:
            match = re.search(pattern, line)
            if match:
                items.append({
                    'name': match.group(1),
                    'type': item_type,
                    'has_docs': has_doc_comment(lines, line_idx),
                    'line': line_idx + 1,
                    'file': display_path,
                })
    return items

def main():
    """Audit the pdftract-core sources and print a documentation report.

    Scans every ``*.rs`` file under the crate's ``src`` directory, prints
    overall and per-type coverage plus a sample of undocumented items, and
    reports whether coverage meets the 80% threshold.

    Returns:
        0 when coverage is at least 80%, otherwise 1. A run that finds no
        public items is reported as 0.0% coverage and returns 1.
    """
    src_dir = Path('/home/coding/pdftract/crates/pdftract-core/src')
    threshold = 80
    # Maximum number of undocumented items listed for each item type.
    missing_limit = 10

    def percent(part, whole):
        # An empty audit counts as 0% rather than dividing by zero.
        return 100 * part / whole if whole else 0.0

    all_items = []
    for rs_file in sorted(src_dir.rglob('*.rs')):
        all_items.extend(audit_file(rs_file))

    # Group by type and coverage
    by_type = defaultdict(lambda: {'total': 0, 'with_docs': 0, 'missing': []})
    for item in all_items:
        bucket = by_type[item['type']]
        bucket['total'] += 1
        if item['has_docs']:
            bucket['with_docs'] += 1
        else:
            bucket['missing'].append(item)

    # Print summary
    print("=" * 60)
    print("PDFTRACT-CORE DOCUMENTATION AUDIT")
    print("=" * 60)
    print()

    total_items = len(all_items)
    total_with_docs = sum(1 for i in all_items if i['has_docs'])
    total_missing = total_items - total_with_docs

    if total_items == 0:
        print(f"WARNING: no public items found under {src_dir}")

    print(f"TOTAL PUBLIC ITEMS: {total_items}")
    print(f"WITH DOCUMENTATION: {total_with_docs} ({percent(total_with_docs, total_items):.1f}%)")
    print(f"MISSING DOCUMENTATION: {total_missing} ({percent(total_missing, total_items):.1f}%)")
    print()

    print("BY TYPE:")
    print("-" * 40)
    for item_type, data in sorted(by_type.items()):
        coverage = percent(data['with_docs'], data['total'])
        print(f"{item_type:12}: {data['with_docs']:4}/{data['total']:<4} ({coverage:5.1f}%)")
    print()

    # Print top missing items
    if any(data['missing'] for data in by_type.values()):
        # The heading is derived from the limit so it cannot drift from the
        # number of items actually listed.
        print(f"TOP ITEMS MISSING DOCS (first {missing_limit} by type):")
        print("-" * 40)
        for item_type in sorted(by_type):
            for item in by_type[item_type]['missing'][:missing_limit]:
                print(f"  [{item_type}] {item['name']} at {item['file']}:{item['line']}")

    print()
    print("=" * 60)

    # Return exit code based on the coverage threshold
    coverage = percent(total_with_docs, total_items)
    if coverage >= threshold:
        print(f"✓ PASS: {coverage:.1f}% coverage meets {threshold}% threshold")
        return 0
    print(f"✗ FAIL: {coverage:.1f}% coverage below {threshold}% threshold")
    return 1

if __name__ == '__main__':
    # Use SystemExit directly: the bare `exit` helper is injected by the
    # `site` module for interactive sessions and is absent under `python -S`.
    raise SystemExit(main())