pdftract/crates/pdftract-core/scripts/measure-public-api-coverage.py
2026-05-29 08:25:23 -04:00

243 lines
8.4 KiB
Python

#!/usr/bin/env python3
"""
Measure rustdoc coverage for the actual public API (re-exported items only).
This focuses on items users can access via pdftract_core::, not internal pub items.
"""
import re
import subprocess
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
def get_public_api_items(lib_rs: Optional[Path] = None) -> Set[str]:
    """
    Collect the crate's public API names from the re-exports in ``lib.rs``.

    The file is scanned textually; cargo/rustdoc is not invoked. Three
    shapes of declaration are recognised:

    * ``pub use path::to::Item;``     -> ``"path::to::Item"``
    * ``pub use path::{A, B, ...};``  -> ``"path::A"``, ``"path::B"``, ...
      (the group may span several lines)
    * ``pub mod name``                -> ``"name"``

    ``crate``/``self`` path segments are dropped. Entries whose final name
    starts with ``_``, glob imports, and aliased group entries
    (``X as Y``) are skipped; ``pub use a::X as Y;`` is recorded under its
    source path ``a::X``. Nested brace groups are not understood and are
    ignored.

    Args:
        lib_rs: Crate root file to scan. Defaults to ``../src/lib.rs``
            relative to the directory containing this script.

    Returns:
        The set of discovered item paths and module names.
    """
    if lib_rs is None:
        lib_rs = Path(__file__).parent.parent / 'src' / 'lib.rs'

    # Remove `//` comments first so commented-out declarations and doc text
    # that merely mentions `pub use` / `pub mod` are not counted.
    code = re.sub(r'//.*', '', lib_rs.read_text(encoding='utf-8'))

    items: Set[str] = set()

    # `[^;]+` spans newlines, so rustfmt-wrapped groups are seen as one statement.
    for use_stmt in re.finditer(r'\bpub\s+use\s+([^;]+);', code):
        path = use_stmt.group(1).strip()
        group = re.match(r'((?:\w+\s*::\s*)+)\{([^{}]*)\}', path)
        if group:
            prefix = re.findall(r'\w+', group.group(1))
            entries = [
                entry for entry in group.group(2).split(',')
                if not re.search(r'\bas\b', entry)
            ]
        else:
            prefix = []
            # Keep the source path of `a::X as Y`; the alias itself is dropped.
            entries = [re.sub(r'\s+as\s+\w+$', '', path)]

        for entry in entries:
            segments = prefix + [seg.strip() for seg in entry.split('::')]
            segments = [seg for seg in segments if seg not in ('crate', 'self')]
            # Need at least `module::item`, and a non-private final name.
            if len(segments) < 2 or segments[-1].startswith('_'):
                continue
            # Rejects globs (`*`), empty entries and leftovers of nested groups.
            if all(re.fullmatch(r'\w+', seg) for seg in segments):
                items.add('::'.join(segments))

    for mod_decl in re.finditer(r'\bpub\s+mod\s+(\w+)', code):
        items.add(mod_decl.group(1))

    return items
def _attached_doc_comment(source: str, decl_pos: int) -> str:
    """Return the doc comment written directly above the item at ``decl_pos``.

    Walks upwards from the line holding the declaration, collecting ``///``
    lines and ``/** ... */`` blocks while stepping over blank lines and
    attributes (``#[...]``, including ones wrapped over several lines).
    The walk stops at the first line of any other kind, so documentation
    belonging to earlier items in the file is never included. This is a
    textual approximation of how doc comments attach to items.
    """
    line_start = source.rfind('\n', 0, decl_pos) + 1
    lines = [line.strip() for line in source[:line_start].splitlines()]
    doc_lines = []
    idx = len(lines) - 1
    while idx >= 0:
        line = lines[idx]
        if line.startswith('///'):
            doc_lines.append(line)
        elif line.endswith('*/'):
            # Rewind to the comment opener; only `/**` blocks are documentation.
            end = idx
            while idx >= 0 and not lines[idx].startswith('/*'):
                idx -= 1
            if idx >= 0 and lines[idx].startswith('/**'):
                doc_lines.extend(reversed(lines[idx:end + 1]))
        elif line.endswith(']') and not line.startswith('#['):
            # Closing line of an attribute wrapped over several lines.
            while idx >= 0 and not lines[idx].startswith('#['):
                idx -= 1
        elif line and not line.startswith('#['):
            break
        idx -= 1
    return '\n'.join(reversed(doc_lines))


def check_item_has_example(item_path: str, src_dir: Path) -> bool:
    """Check if an item has a worked example in its own documentation.

    The first path segment names the module file (``src/<module>.rs``, or
    ``mod.rs``/``lib.rs`` inside ``src/<module>/``) and the last segment
    names the item, e.g. ``"extract::extract_pdf"`` -> ``src/extract.rs``.

    Args:
        item_path: ``::``-separated path with at least two segments.
        src_dir: The crate's ``src`` directory.

    Returns:
        True when the doc comment attached to the item's ``pub``
        declaration contains a ```` ```rust ```` or ```` ```no_run ```` fence.
        False when the path has fewer than two segments, the module file or
        the declaration cannot be found, or no such fence is present.
    """
    parts = item_path.split('::')
    if len(parts) < 2:
        return False
    module_name, item_name = parts[0], parts[-1]

    candidates = (
        src_dir / f"{module_name}.rs",
        src_dir / module_name / 'mod.rs',
        src_dir / module_name / 'lib.rs',
    )
    module_file = next((path for path in candidates if path.exists()), None)
    if module_file is None:
        return False

    content = module_file.read_text(encoding='utf-8')
    declaration = re.search(
        rf'pub\s+(?:fn|struct|enum|trait|type|const)\s+{re.escape(item_name)}\b',
        content,
    )
    if not declaration:
        return False

    # Only the comment attached to this item counts; scanning everything
    # before the declaration would credit it with other items' examples.
    doc = _attached_doc_comment(content, declaration.start())
    return '```rust' in doc or '```no_run' in doc
def _collect_reexports(lib_source: str) -> List[Tuple[str, str]]:
    """Return ``(module, item)`` pairs for the ``pub use``/``pub mod`` lines of lib.rs.

    ``module`` is the first path segment (after dropping ``crate``/``self``)
    and ``item`` the last one. ``pub mod name`` yields ``(name, '<module>')``.
    Re-exports are listed first, then modules, each in source order.
    """
    # Remove `//` comments so commented-out or merely mentioned declarations
    # are not counted.
    code = re.sub(r'//.*', '', lib_source)
    found: List[Tuple[str, str]] = []

    # `[^;]+` spans newlines, so groups wrapped over several lines are parsed.
    for use_stmt in re.finditer(r'\bpub\s+use\s+([^;]+);', code):
        path = use_stmt.group(1).strip()
        group = re.match(r'((?:\w+\s*::\s*)+)\{([^{}]*)\}', path)
        if group:
            prefix = re.findall(r'\w+', group.group(1))
            # Aliased group entries stay excluded, but the test is now for
            # the `as` keyword rather than the substring "as" (which also
            # dropped names such as `Canvas` or `Base`).
            entries = [
                entry for entry in group.group(2).split(',')
                if not re.search(r'\bas\b', entry)
            ]
        else:
            prefix = []
            # `a::X as Y` is looked up under its source name `X`.
            entries = [re.sub(r'\s+as\s+\w+$', '', path)]

        for entry in entries:
            segments = prefix + [seg.strip() for seg in entry.split('::')]
            segments = [seg for seg in segments if seg not in ('crate', 'self')]
            if len(segments) < 2 or segments[-1].startswith('_'):
                continue
            # Rejects globs (`*`), empty entries and leftovers of nested groups.
            if all(re.fullmatch(r'\w+', seg) for seg in segments):
                found.append((segments[0], segments[-1]))

    for mod_decl in re.finditer(r'\bpub\s+mod\s+(\w+)', code):
        found.append((mod_decl.group(1), '<module>'))
    return found


def _find_module_file(src_dir: Path, module: str) -> Optional[Path]:
    """Return the first existing file among ``<module>.rs``, ``<module>/mod.rs`` and ``<module>/lib.rs``."""
    candidates = (
        src_dir / f"{module}.rs",
        src_dir / module / 'mod.rs',
        src_dir / module / 'lib.rs',
    )
    return next((path for path in candidates if path.exists()), None)


def _module_doc_status(module_file: Path) -> Tuple[bool, bool]:
    """Return ``(has_example, has_doc)`` for the ``//!`` header of a module file.

    The header is the run of ``//!`` lines (blank lines allowed in between)
    at the very top of the file; it ends at the first line of any other kind.
    """
    header = []
    for raw in module_file.read_text(encoding='utf-8').splitlines():
        line = raw.strip()
        if line.startswith('//!'):
            header.append(line)
        elif line:
            break
    text = '\n'.join(header)
    return ('```rust' in text or '```no_run' in text), bool(header)


def main():
    """Report documentation and example coverage of the re-exported API.

    Parses ``../src/lib.rs`` (relative to this script's directory), checks
    every re-exported item and public module for documentation and for a
    worked example, and prints a summary followed by up to 20 entries that
    lack an example. Modules whose source file cannot be found are reported
    as lacking both.

    Returns:
        0 when example coverage is at least 80%, otherwise 1.
    """
    src_dir = Path(__file__).parent.parent / 'src'
    lib_source = (src_dir / 'lib.rs').read_text(encoding='utf-8')

    public_items = _collect_reexports(lib_source)
    print(f"Found {len(public_items)} public API items (re-exports)")

    with_examples = 0
    with_docs = 0
    items_without = []
    for module, item in public_items:
        if item == '<module>':
            module_file = _find_module_file(src_dir, module)
            if module_file is None:
                # Still part of the total, so list it instead of silently
                # dropping it from the report.
                has_example, has_doc = False, False
            else:
                has_example, has_doc = _module_doc_status(module_file)
        else:
            has_example, has_doc = check_item_for_docs(module, item, src_dir)

        if has_doc:
            with_docs += 1
        if has_example:
            with_examples += 1
        else:
            items_without.append((module, item, has_doc))

    total = len(public_items)
    coverage = (with_examples / total * 100) if total > 0 else 0
    doc_coverage = (with_docs / total * 100) if total > 0 else 0

    print(f"\n{'='*50}")
    print("Public API Rustdoc Coverage")
    print(f"{'='*50}")
    print(f"Total public API items: {total}")
    print(f"With documentation: {with_docs} ({doc_coverage:.1f}%)")
    print(f"With worked examples: {with_examples} ({coverage:.1f}%)")
    print("\nTarget: 80% example coverage")
    print(f"Status: {'✓ PASS' if coverage >= 80 else '✗ FAIL'}")

    if items_without:
        print(f"\n--- Items lacking examples ({len(items_without)}) ---")
        for module, item, has_doc in items_without[:20]:
            doc_marker = '📄' if has_doc else ''
            print(f" {doc_marker} {module}::{item}")
        if len(items_without) > 20:
            print(f" ... and {len(items_without) - 20} more")

    return 0 if coverage >= 80 else 1
def _leading_doc_comment(source: str, decl_pos: int) -> str:
    """Return the doc comment written directly above the item at ``decl_pos``.

    Walks upwards from the line holding the declaration, collecting ``///``
    lines and ``/** ... */`` blocks while stepping over blank lines and
    attributes (``#[...]``, including ones wrapped over several lines).
    The walk stops at the first line of any other kind, so documentation
    belonging to earlier items in the file is never included. This is a
    textual approximation of how doc comments attach to items.
    """
    line_start = source.rfind('\n', 0, decl_pos) + 1
    lines = [line.strip() for line in source[:line_start].splitlines()]
    doc_lines = []
    idx = len(lines) - 1
    while idx >= 0:
        line = lines[idx]
        if line.startswith('///'):
            doc_lines.append(line)
        elif line.endswith('*/'):
            # Rewind to the comment opener; only `/**` blocks are documentation.
            end = idx
            while idx >= 0 and not lines[idx].startswith('/*'):
                idx -= 1
            if idx >= 0 and lines[idx].startswith('/**'):
                doc_lines.extend(reversed(lines[idx:end + 1]))
        elif line.endswith(']') and not line.startswith('#['):
            # Closing line of an attribute wrapped over several lines.
            while idx >= 0 and not lines[idx].startswith('#['):
                idx -= 1
        elif line and not line.startswith('#['):
            break
        idx -= 1
    return '\n'.join(reversed(doc_lines))


def check_item_for_docs(module: str, item: str, src_dir: Path) -> Tuple[bool, bool]:
    """Check if an item has documentation and/or examples.

    The item is searched for in ``src/<module>.rs``, falling back to
    ``mod.rs`` or ``lib.rs`` inside ``src/<module>/``. Declaration patterns
    are tried in a fixed order (fn, struct, enum, trait, type, const/static,
    then an ``impl`` block of that name containing a ``pub fn``); the first
    pattern that matches anywhere in the file decides the result.

    Args:
        module: Module name, used to locate the source file.
        item: Item name to look for.
        src_dir: The crate's ``src`` directory.

    Returns:
        ``(has_example, has_doc)``. ``has_doc`` is True when a doc comment is
        attached to the declaration; ``has_example`` is True when that
        comment contains a ```` ```rust ```` or ```` ```no_run ```` fence.
        ``(False, False)`` when the file or the declaration is not found.
    """
    candidates = (
        src_dir / f"{module}.rs",
        src_dir / module / 'mod.rs',
        src_dir / module / 'lib.rs',
    )
    module_file = next((path for path in candidates if path.exists()), None)
    if module_file is None:
        return False, False

    content = module_file.read_text(encoding='utf-8')
    name = re.escape(item)
    patterns = [
        # `pub fn`, plus qualified forms such as `pub async fn` / `pub const fn`.
        rf'pub\s+(?:(?:const|async|unsafe)\s+)*fn\s+{name}\b',
        rf'pub\s+struct\s+{name}\b',
        rf'pub\s+enum\s+{name}\b',
        rf'pub\s+trait\s+{name}\b',
        rf'pub\s+type\s+{name}\b',
        rf'pub\s+(?:const|static)\s+{name}\b',
        rf'impl\s+(?:<[^>]*>\s+)?{name}\s*\{{[^}}]*\bpub\s+fn\s+(\w+)',
    ]

    for pattern in patterns:
        match = re.search(pattern, content)
        if match:
            # Only the comment attached to this declaration counts; scanning
            # the whole file prefix would credit the item with the docs and
            # examples of everything defined above it.
            doc = _leading_doc_comment(content, match.start())
            has_doc = bool(doc)
            has_example = '```rust' in doc or '```no_run' in doc
            return has_example, has_doc

    return False, False
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status. The bare
    # `exit` name is an interactive helper installed by the `site` module and
    # is not available when Python runs with -S, so raise SystemExit directly.
    raise SystemExit(main())