# pdftract/crates/pdftract-core/scripts/measure-public-api-coverage.py

#!/usr/bin/env python3
"""
Measure rustdoc coverage for the actual public API (re-exported items only).

This focuses on items users can access via pdftract_core::, not internal pub items.
"""
import re
import subprocess
from pathlib import Path
from typing import Dict, List, Set

def get_public_api_items(lib_rs: "Path | None" = None) -> Set[str]:
    """
    Collect the public API items re-exported from the crate root.

    The crate's ``lib.rs`` is scanned textually for ``pub use`` re-exports
    and ``pub mod`` declarations. Nothing is compiled or executed: no
    ``cargo`` process is spawned, so calling this never builds docs or
    opens a browser.

    Args:
        lib_rs: Path of the crate root file to scan. Defaults to
            ``../src/lib.rs`` relative to this script's directory.

    Returns:
        A set containing ``"module::item"`` for every re-exported item and
        the bare module name for every ``pub mod`` declaration. ``module``
        is the path segment directly before the item, so
        ``crate::extract::Foo`` yields ``"extract::Foo"``. Items whose name
        starts with ``_`` and glob re-exports (``module::*``) are skipped;
        ``Foo as Bar`` is recorded under its source name ``Foo``.

    Raises:
        OSError: If the crate root file cannot be read.
    """
    if lib_rs is None:
        lib_rs = Path(__file__).parent.parent / 'src' / 'lib.rs'

    # Rust sources are UTF-8; do not depend on the locale's default encoding.
    source = Path(lib_rs).read_text(encoding='utf-8')
    # Drop // comments (doc comments included) so statements that only appear
    # in commentary, or notes inside a brace list, are not parsed as items.
    code = re.sub(r'//[^\n]*', '', source)

    items = set()

    # Scan the whole file rather than line by line: rustfmt commonly splits
    # `pub use module::{...};` over several lines.
    for use_match in re.finditer(r'pub\s+use\s+([^;]+);', code):
        use_stmt = use_match.group(1)

        brace_match = re.search(r'(\w+)\s*::\s*\{([^}]+)\}', use_stmt)
        if brace_match:
            module = brace_match.group(1)
            names = brace_match.group(2).split(',')
        else:
            # Anchor at the end so `crate::extract::Foo` resolves to
            # module `extract`, item `Foo` (not `crate::extract`).
            single_match = re.search(
                r'(\w+)\s*::\s*(\w+)(?:\s+as\s+\w+)?\s*$', use_stmt
            )
            if not single_match:
                continue
            module = single_match.group(1)
            names = [single_match.group(2)]

        for name in names:
            # `Foo as Bar` -> keep the source name the docs are attached to.
            name = re.split(r'\s+as\s+', name.strip())[0]
            if name and not name.startswith('_'):
                items.add(f"{module}::{name}")

    # Module declarations (pub mod foo;)
    items.update(m.group(1) for m in re.finditer(r'pub\s+mod\s+(\w+)', code))

    return items

def _doc_comment_above(content: str, pos: int) -> str:
    """Return the doc comment attached to the declaration starting at *pos*.

    Walks upward from the line containing *pos*, skipping single-line
    attributes (``#[...]``) and collecting the contiguous ``///`` lines, or
    one ``/** ... */`` block, that sit directly above the declaration.
    Returns ``''`` when there is none. Multi-line attributes are not
    recognised and end the walk.
    """
    line_start = content.rfind('\n', 0, pos) + 1
    above = content[:line_start].splitlines()
    doc_lines = []
    idx = len(above) - 1
    while idx >= 0:
        text = above[idx].strip()
        if text.startswith('///') and not text.startswith('////'):
            doc_lines.append(text)
        elif text.endswith('*/') and not doc_lines:
            # Block comment: gather upward to its opening line and keep it
            # only if it is a doc block (`/**`), not a plain `/* ... */`.
            block = []
            while idx >= 0:
                block.append(above[idx].strip())
                if '/*' in above[idx]:
                    break
                idx -= 1
            if block[-1].startswith('/**'):
                doc_lines = block
            break
        elif not (text.startswith('#[') and text.endswith(']')):
            # Blank line or unrelated code: the doc comment (if any) ended.
            break
        idx -= 1
    return '\n'.join(reversed(doc_lines))


def check_item_has_example(item_path: str, src_dir: Path) -> bool:
    """Check if an item has a worked example in its own documentation.

    Args:
        item_path: ``"module::item"`` style path, e.g.
            ``"extract::extract_pdf"``. The first segment selects the module
            file and the last segment is the item name.
        src_dir: The crate's ``src`` directory.

    Returns:
        True when the doc comment directly above the item's ``pub``
        declaration contains a `````rust`` or `````no_run`` fence. False when
        the path has fewer than two segments, the module file or the
        declaration cannot be found, or the item's docs have no such fence.
    """
    parts = item_path.split('::')
    if len(parts) < 2:
        return False

    module_name = parts[0]
    item_name = parts[-1]

    # Resolve "module" to src/module.rs, else src/module/mod.rs (or lib.rs).
    module_file = src_dir / f"{module_name}.rs"
    if not module_file.exists():
        mod_dir = src_dir / module_name
        for candidate in (mod_dir / 'mod.rs', mod_dir / 'lib.rs'):
            if candidate.exists():
                module_file = candidate
                break

    if not module_file.exists():
        return False

    # Rust sources are UTF-8; do not depend on the locale's default encoding.
    content = module_file.read_text(encoding='utf-8')

    # Anchored at line start so the name is not matched inside comment text;
    # optional qualifiers cover `pub const fn`, `pub async fn`, `pub unsafe fn`.
    pattern = (
        r'^[ \t]*pub\s+(?:(?:async|const|unsafe)\s+)*'
        r'(?:fn|struct|enum|trait|type|const|static)\s+'
        rf'{re.escape(item_name)}\b'
    )
    match = re.search(pattern, content, re.MULTILINE)
    if not match:
        return False

    # Only the item's own doc comment counts; an example that belongs to an
    # earlier item in the same file must not be credited to this one.
    doc_content = _doc_comment_above(content, match.start())
    return '```rust' in doc_content or '```no_run' in doc_content

def _collect_reexports(lib_source: str) -> List[tuple]:
    """Return ``(module, item)`` pairs for the crate root's public surface.

    ``pub use`` re-exports come first, in source order, followed by one
    ``(module, '<module>')`` pair per ``pub mod`` declaration. ``module`` is
    the path segment directly before the item, so ``crate::extract::Foo``
    yields ``('extract', 'Foo')``. Names starting with ``_`` and glob
    re-exports are skipped; ``Foo as Bar`` is recorded under ``Foo``.
    """
    # Drop // comments (doc comments included) so statements that only appear
    # in commentary, or notes inside a brace list, are not parsed as items.
    code = re.sub(r'//[^\n]*', '', lib_source)

    found = []
    # Scan the whole text rather than line by line: rustfmt commonly splits
    # `pub use module::{...};` over several lines.
    for use_match in re.finditer(r'pub\s+use\s+([^;]+);', code):
        use_stmt = use_match.group(1)
        brace_match = re.search(r'(\w+)\s*::\s*\{([^}]+)\}', use_stmt)
        if brace_match:
            module = brace_match.group(1)
            names = brace_match.group(2).split(',')
        else:
            # Anchor at the end so leading `crate::` / `self::` segments are
            # not mistaken for the module name.
            item_match = re.search(
                r'(\w+)\s*::\s*(\w+)(?:\s+as\s+\w+)?\s*$', use_stmt
            )
            if not item_match:
                continue
            module = item_match.group(1)
            names = [item_match.group(2)]

        for name in names:
            # Strip a real `as` alias. A plain substring test for "as" would
            # also discard names such as `Rasterizer` or `Class`.
            name = re.split(r'\s+as\s+', name.strip())[0]
            if name and not name.startswith('_'):
                found.append((module, name))

    found.extend(
        (mod_match.group(1), '<module>')
        for mod_match in re.finditer(r'pub\s+mod\s+(\w+)', code)
    )
    return found


def _module_doc_status(module: str, src_dir: Path) -> tuple:
    """Return ``(has_example, has_doc)`` for a module's ``//!`` documentation.

    The module is looked up as ``src/<module>.rs``, then
    ``src/<module>/mod.rs`` or ``src/<module>/lib.rs``. A module whose file
    cannot be found reports ``(False, False)``.
    """
    module_file = src_dir / f"{module}.rs"
    if not module_file.exists():
        mod_dir = src_dir / module
        for candidate in (mod_dir / 'mod.rs', mod_dir / 'lib.rs'):
            if candidate.exists():
                module_file = candidate
                break
    if not module_file.exists():
        return False, False

    # Look at the inner doc comment lines themselves instead of a fixed-size
    # prefix of the file, so long module docs and files that start with a
    # licence header are judged on their actual documentation.
    module_doc = '\n'.join(
        line.strip()
        for line in module_file.read_text(encoding='utf-8').splitlines()
        if line.lstrip().startswith('//!')
    )
    has_example = '```rust' in module_doc or '```no_run' in module_doc
    return has_example, bool(module_doc)


def main():
    """Print a rustdoc coverage report for the crate's re-exported API.

    Reads ``../src/lib.rs`` relative to this script, checks every re-exported
    item and public module for documentation and for a worked example, and
    prints the totals plus up to 20 items that lack an example.

    Returns:
        0 when at least 80% of the public items have a worked example,
        otherwise 1 (suitable as a process exit status).
    """
    script_dir = Path(__file__).parent
    src_dir = script_dir.parent / 'src'

    # Get public API items from lib.rs re-exports
    lib_rs = src_dir / 'lib.rs'
    public_items = _collect_reexports(lib_rs.read_text(encoding='utf-8'))

    print(f"Found {len(public_items)} public API items (re-exports)")

    # Check which ones have examples
    with_examples = 0
    with_docs = 0
    items_without = []

    for module, item in public_items:
        if item == '<module>':
            has_ex, has_doc = _module_doc_status(module, src_dir)
        else:
            has_ex, has_doc = check_item_for_docs(module, item, src_dir)

        if has_doc:
            with_docs += 1
        if has_ex:
            with_examples += 1
        else:
            # Every counted item without an example is listed, including
            # modules whose source file could not be located.
            items_without.append((module, item, has_doc))

    total = len(public_items)
    coverage = (with_examples / total * 100) if total > 0 else 0
    doc_coverage = (with_docs / total * 100) if total > 0 else 0

    print(f"\n{'='*50}")
    print("Public API Rustdoc Coverage")
    print(f"{'='*50}")
    print(f"Total public API items: {total}")
    print(f"With documentation: {with_docs} ({doc_coverage:.1f}%)")
    print(f"With worked examples: {with_examples} ({coverage:.1f}%)")
    print("\nTarget: 80% example coverage")
    print(f"Status: {'✓ PASS' if coverage >= 80 else '✗ FAIL'}")

    if items_without:
        print(f"\n--- Items lacking examples ({len(items_without)}) ---")
        for module, item, has_doc in items_without[:20]:
            doc_marker = '📄' if has_doc else '❌'
            print(f"  {doc_marker} {module}::{item}")
        if len(items_without) > 20:
            print(f"  ... and {len(items_without) - 20} more")

    return 0 if coverage >= 80 else 1

def _attached_doc_comment(content: str, pos: int) -> str:
    """Return the doc comment attached to the declaration starting at *pos*.

    Walks upward from the line containing *pos*, skipping single-line
    attributes (``#[...]``) and collecting the contiguous ``///`` lines, or
    one ``/** ... */`` block, that sit directly above the declaration.
    Returns ``''`` when there is none. Multi-line attributes are not
    recognised and end the walk.
    """
    line_start = content.rfind('\n', 0, pos) + 1
    above = content[:line_start].splitlines()
    doc_lines = []
    idx = len(above) - 1
    while idx >= 0:
        text = above[idx].strip()
        if text.startswith('///') and not text.startswith('////'):
            doc_lines.append(text)
        elif text.endswith('*/') and not doc_lines:
            # Block comment: gather upward to its opening line and keep it
            # only if it is a doc block (`/**`), not a plain `/* ... */`.
            block = []
            while idx >= 0:
                block.append(above[idx].strip())
                if '/*' in above[idx]:
                    break
                idx -= 1
            if block[-1].startswith('/**'):
                doc_lines = block
            break
        elif not (text.startswith('#[') and text.endswith(']')):
            # Blank line or unrelated code: the doc comment (if any) ended.
            break
        idx -= 1
    return '\n'.join(reversed(doc_lines))


def check_item_for_docs(module: str, item: str, src_dir: Path) -> tuple:
    """Check if an item has documentation and/or examples.

    Args:
        module: Module name; resolved to ``src/<module>.rs``, then
            ``src/<module>/mod.rs`` or ``src/<module>/lib.rs``.
        item: Name of the item to look for inside that module file.
        src_dir: The crate's ``src`` directory.

    Returns:
        ``(has_example, has_doc)``. ``has_doc`` is True when a doc comment
        sits directly above the item's declaration; ``has_example`` is True
        when that comment contains a `````rust`` or `````no_run`` fence. Both
        are False when the module file or the item cannot be found.
    """
    # Find the module file
    module_file = src_dir / f"{module}.rs"
    if not module_file.exists():
        mod_dir = src_dir / module
        for candidate in (mod_dir / 'mod.rs', mod_dir / 'lib.rs'):
            if candidate.exists():
                module_file = candidate
                break

    if not module_file.exists():
        return False, False

    # Rust sources are UTF-8; do not depend on the locale's default encoding.
    content = module_file.read_text(encoding='utf-8')

    name = re.escape(item)
    patterns = (
        # Public declaration at the start of a line; the optional qualifiers
        # cover `pub const fn`, `pub async fn` and `pub unsafe fn`.
        r'^[ \t]*pub\s+(?:(?:async|const|unsafe)\s+)*'
        r'(?:fn|struct|enum|trait|type|const|static)\s+'
        rf'{name}\b',
        # Fallback: an impl block for the item that exposes a pub fn.
        rf'impl\s+(?:<[^>]*>\s+)?{name}\s*\{{[^}}]*\bpub\s+fn\s+(\w+)',
    )

    for pattern in patterns:
        match = re.search(pattern, content, re.MULTILINE)
        if match:
            # Only the comment attached to this declaration counts; docs or
            # examples belonging to earlier items in the file must not be
            # credited to this one.
            doc_content = _attached_doc_comment(content, match.start())
            has_doc = bool(doc_content)
            has_example = '```rust' in doc_content or '```no_run' in doc_content
            return has_example, has_doc

    return False, False

if __name__ == '__main__':
    # Use main()'s return value as the process exit status. SystemExit is
    # raised directly because the bare `exit` helper is injected by the
    # `site` module for interactive sessions and is missing under `python -S`.
    raise SystemExit(main())