# pdftract/tools/count_docs.py

#!/usr/bin/env python3
"""Count rustdoc coverage for pdftract-core."""

import os
import re
from pathlib import Path
from collections import defaultdict

# Root of the Rust sources to scan. The path is relative, so it is resolved
# against the current working directory of the process.
CORE_DIR = Path("crates/pdftract-core/src")

# Patterns for public items.
#
# Each pattern is applied with ``.match`` to a whitespace-stripped source line
# and captures the item name in group 1. Only plain ``pub`` visibility is
# recognised; restricted forms such as ``pub(crate)`` deliberately do not match.
# Insertion order matters to callers that stop at the first matching pattern.
PUB_PATTERNS = {
    # Any combination of the `const` / `async` / `unsafe` qualifiers and an
    # optional `extern "ABI"` may precede `fn`.
    "fn": re.compile(
        r'^pub\s+(?:(?:const|async|unsafe)\s+)*(?:extern\s+(?:"[^"]*"\s+)?)?fn\s+(\w+)'
    ),
    "struct": re.compile(r'^pub\s+struct\s+(\w+)'),
    "enum": re.compile(r'^pub\s+enum\s+(\w+)'),
    "trait": re.compile(r'^pub\s+(?:unsafe\s+)?trait\s+(\w+)'),
    "type": re.compile(r'^pub\s+type\s+(\w+)'),
    "mod": re.compile(r'^pub\s+mod\s+(\w+)'),
    # The lookahead keeps `pub const fn foo` (and the other function
    # qualifiers) from being recorded as a constant named "fn".
    "const": re.compile(r'^pub\s+const\s+(?!(?:fn|async|unsafe|extern)\b)(\w+)'),
    # `mut` is a qualifier here, not the name of the static.
    "static": re.compile(r'^pub\s+static\s+(?:mut\s+)?(\w+)'),
}

# Pattern for code blocks in doc comments: a fence opened with ```rust up to
# the next ``` fence. DOTALL lets the body span several lines, and the lazy
# quantifier lets the example itself contain single backticks. Fences without
# an explicit `rust` tag are not counted.
EXAMPLE_PATTERN = re.compile(r'```rust.*?```', re.DOTALL)
# Matches a `///` or `//!` doc comment from the marker to the end of the line.
DOC_COMMENT_PATTERN = re.compile(r'///.*|//!.*')

def _has_doc_example(lines, item_idx):
    """Return True if the doc comment attached to ``lines[item_idx]`` has an example.

    Walks upwards from the line just above the item, collecting contiguous
    ``///`` / ``//!`` lines. Single-line attributes (``#[derive(...)]``,
    ``#[inline]``, ...) are stepped over because they normally sit between the
    doc comment and the item; attributes spanning several lines are not
    recognised and end the scan.
    """
    doc_lines = []
    j = item_idx - 1
    while j >= 0:
        stripped = lines[j].strip()
        if stripped.startswith(("///", "//!")):
            doc_lines.append(lines[j])
        elif not stripped.startswith("#["):
            # Anything that is neither a doc comment nor an attribute ends
            # the documentation block.
            break
        j -= 1

    # Lines were gathered bottom-up; restore source order before matching.
    doc_lines.reverse()
    return EXAMPLE_PATTERN.search("".join(doc_lines)) is not None


def count_public_items_and_examples(file_path: Path) -> list:
    """Count public items and examples in a single file.

    Args:
        file_path: Path of the Rust source file to scan.

    Returns:
        One dict per public item, in source order, with the keys ``"type"``
        (a key of ``PUB_PATTERNS``), ``"name"``, ``"line"`` (1-based line
        number of the declaration) and ``"has_example"`` (True when the doc
        comment directly above the item matches ``EXAMPLE_PATTERN``).
    """
    # Rust source files are UTF-8; do not depend on the platform locale.
    with open(file_path, encoding="utf-8") as f:
        lines = f.readlines()

    pub_items = []
    for idx, line in enumerate(lines):
        stripped = line.strip()
        # The first pattern that matches decides the item type.
        for item_type, pattern in PUB_PATTERNS.items():
            match = pattern.match(stripped)
            if match:
                pub_items.append({
                    "type": item_type,
                    "name": match.group(1),
                    "line": idx + 1,
                    "has_example": _has_doc_example(lines, idx),
                })
                break

    return pub_items


def main():
    """Print the worked-example coverage report for every file under CORE_DIR.

    Scans all ``*.rs`` files recursively, skipping every file named
    ``lib.rs``, then prints the overall totals followed by a per-item-type
    breakdown sorted by type name.
    """
    collected = []
    for source in CORE_DIR.rglob("*.rs"):
        # lib.rs is excluded from the count.
        if source.name != "lib.rs":
            collected.extend(count_public_items_and_examples(source))

    total = len(collected)
    with_examples = len([entry for entry in collected if entry["has_example"]])
    if total > 0:
        coverage = with_examples / total * 100
    else:
        coverage = 0

    print(f"Total public items: {total}")
    print(f"With worked examples: {with_examples}")
    print(f"Coverage: {coverage:.1f}%")

    # Group the items by their type for the breakdown.
    grouped = {}
    for entry in collected:
        grouped.setdefault(entry["type"], []).append(entry)

    print("\nBy type:")
    for kind in sorted(grouped):
        members = grouped[kind]
        count = len(members)
        with_ex = len([m for m in members if m["has_example"]])
        print(f"  {kind}: {with_ex}/{count} ({with_ex/count*100:.1f}%)")


# Entry-point guard: run the report only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()