pdftract/scripts/measure-public-api-coverage.py
2026-05-29 08:25:23 -04:00

158 lines
6 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Measure rustdoc coverage for pdftract-core public API.
Counts public items and tracks which have doc comments with examples.
"""
import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Set
@dataclass
class DocStats:
    """Aggregate documentation-coverage counters for a set of scanned items.

    Attributes:
        total_items: Number of items that were found.
        documented_items: Number of those items that have a doc comment.
        with_examples: Number of those items whose doc comment contains an
            example.
        items_with_examples: Human-readable labels of the items counted in
            ``with_examples``.
    """

    total_items: int = 0
    documented_items: int = 0
    with_examples: int = 0
    # A default factory gives every instance its own list and keeps the
    # annotation truthful (the previous default was ``None``).
    items_with_examples: List[str] = field(default_factory=list)

    def __post_init__(self):
        # Backward compatibility: callers that pass ``None`` explicitly still
        # end up with an empty list, exactly as before.
        if self.items_with_examples is None:
            self.items_with_examples = []
# (item_type, pattern) pairs, tried in order against the START of each
# stripped source line; the first pattern that matches wins.  Anchoring at the
# line start keeps `impl Trait` in argument position and text inside strings
# from being counted, and `pub(crate)` / `pub(super)` never match because the
# patterns require whitespace right after `pub`.
_RUST_ITEM_PATTERNS = (
    # Accept the usual qualifiers so `pub const fn` / `pub unsafe fn` are
    # recorded as functions (and not as a constant named "fn").
    ('pub fn', re.compile(
        r'pub\s+(?:const\s+)?(?:async\s+)?(?:unsafe\s+)?'
        r'(?:extern\s+(?:"[^"]*"\s+)?)?fn\s+(\w+)')),
    ('pub struct', re.compile(r'pub\s+struct\s+(\w+)')),
    ('pub enum', re.compile(r'pub\s+enum\s+(\w+)')),
    ('pub trait', re.compile(r'pub\s+(?:unsafe\s+)?trait\s+(\w+)')),
    ('pub const', re.compile(r'pub\s+const\s+(\w+)')),
    ('pub type', re.compile(r'pub\s+type\s+(\w+)')),
    ('pub mod', re.compile(r'pub\s+mod\s+(\w+)')),
    # impl blocks, with optional generic parameters: `impl<T> Foo<T>`.
    ('impl', re.compile(r'(?:unsafe\s+)?impl\b(?:\s*<.*?>)?\s+(\w+)')),
)


def extract_rust_items(content: str, filename: str) -> List[tuple]:
    """Extract public items (and impl blocks) from Rust source text.

    This is a line-based heuristic, not a Rust parser.  It assumes the
    conventional layout in which doc comments, attributes and the item
    declaration each start on their own line.

    Documentation is attributed as follows:

    * ``///`` lines and ``/** ... */`` blocks (outer doc comments) are
      collected and attached to the next item that is found.
    * Blank lines, ordinary ``//`` comments, ``/* ... */`` comments and
      ``#[...]`` attributes (including multi-line ones) may sit between the
      doc comment and its item without discarding the collected docs.
    * ``//!`` and ``/*! ... */`` (inner doc comments) describe the enclosing
      module or crate, so they are never attached to the following item.
    * Any other line of code discards the collected docs.

    Args:
        content: Full text of one Rust source file.
        filename: Name stored in each result tuple; it is not opened.

    Returns:
        A list of ``(item_type, name, line_number, has_doc, has_example,
        filename)`` tuples in source order.  ``line_number`` is 1-based,
        ``has_doc`` is true when at least one doc line precedes the item and
        ``has_example`` is true when any of those doc lines contains a
        Markdown code fence.
    """
    items = []
    doc_lines = []     # outer doc-comment lines waiting for their item
    open_block = None  # 'doc' inside /** ... */, 'comment' inside other /* ... */
    attr_depth = 0     # > 0 while inside a multi-line #[...] attribute

    for line_number, raw_line in enumerate(content.split('\n'), start=1):
        line = raw_line.strip()

        # Continuation (or closing line) of a block comment.
        if open_block is not None:
            if open_block == 'doc':
                doc_lines.append(line)
            if '*/' in line:
                open_block = None
            continue

        # Continuation of an attribute whose brackets are still open.
        if attr_depth > 0:
            attr_depth = max(0, attr_depth + line.count('[') - line.count(']'))
            continue

        if not line:
            continue

        # `///` is an outer doc comment; `////` is an ordinary comment.
        if line.startswith('///') and not line.startswith('////'):
            doc_lines.append(line)
            continue

        if line.startswith('/*'):
            # Only `/**` opens an outer doc block; `/***` and the empty
            # comment `/**/` are ordinary comments, `/*!` is an inner doc.
            is_outer_doc = (line.startswith('/**')
                            and not line.startswith(('/***', '/**/')))
            if is_outer_doc:
                doc_lines.append(line)
            # The closing `*/` cannot overlap the two-character opener.
            if line.find('*/', 2) == -1:
                open_block = 'doc' if is_outer_doc else 'comment'
            continue

        # Ordinary comments and `//!` inner docs: skipped, never scanned for
        # items, and they do not discard pending docs.
        if line.startswith('//'):
            continue

        # Attributes sit between the docs and the item they decorate.
        if line.startswith(('#[', '#![')):
            attr_depth = max(0, line.count('[') - line.count(']'))
            continue

        # A real line of code: it either is the documented item or it ends
        # the association between the pending docs and anything after it.
        pending_docs, doc_lines = doc_lines, []
        for item_type, pattern in _RUST_ITEM_PATTERNS:
            found = pattern.match(line)
            if found:
                items.append((
                    item_type,
                    found.group(1),
                    line_number,
                    bool(pending_docs),
                    any('```' in doc for doc in pending_docs),
                    filename,
                ))
                break

    return items
def scan_directory(src_dir: Path) -> tuple:
    """Scan the Rust files under *src_dir* and summarize their doc coverage.

    Every ``*.rs`` file below *src_dir* is read as UTF-8 (undecodable bytes
    are dropped) and passed to ``extract_rust_items``.  Files are visited in
    sorted path order so the result is deterministic.

    A file is skipped when one of its path components *below* ``src_dir`` is
    named ``tests`` or ``examples`` (a directory, or the file stem as in
    ``tests.rs``).  The components of ``src_dir`` itself are not considered,
    and names that merely contain those words are not skipped.

    Args:
        src_dir: Root directory to search recursively.

    Returns:
        A ``(stats, all_items)`` pair: ``stats`` is a ``DocStats`` summary and
        ``all_items`` is the combined list of item tuples from every scanned
        file.
    """
    skipped_names = {'tests', 'examples'}

    all_items = []
    for rs_file in sorted(src_dir.rglob('*.rs')):
        # Compare whole path components relative to the scan root rather
        # than searching for substrings in the full path.
        relative_parts = rs_file.relative_to(src_dir).with_suffix('').parts
        if skipped_names.intersection(relative_parts):
            continue
        content = rs_file.read_text(encoding='utf-8', errors='ignore')
        all_items.extend(extract_rust_items(content, str(rs_file)))

    # Tuple layout: (item_type, name, line, has_doc, has_example, filename).
    items_with_examples = [item for item in all_items if item[4]]
    stats = DocStats(
        total_items=len(all_items),
        documented_items=sum(1 for item in all_items if item[3]),
        with_examples=len(items_with_examples),
        items_with_examples=[
            f"{item[0]} {item[1]} ({item[5]}:{item[2]})"
            for item in items_with_examples
        ],
    )
    return stats, all_items
def main():
    """Print a documentation-coverage report for pdftract-core.

    Scans ``crates/pdftract-core/src`` (relative to the current working
    directory) and prints, in order: overall totals, progress toward the
    examples target, a per-item-type breakdown, and up to 50 entries each of
    the undocumented items and of the documented items that lack an example.

    Raises:
        SystemExit: If the source directory does not exist, so that a wrong
            working directory is reported instead of producing an empty
            "0 items" report.
    """
    from collections import defaultdict

    target_percent = 80  # share of items that should carry an example
    list_limit = 50      # maximum entries printed per listing

    src_dir = Path('crates/pdftract-core/src')
    if not src_dir.is_dir():
        raise SystemExit(
            f"error: source directory not found: {src_dir} "
            "(run this script from the directory that contains 'crates/')"
        )

    def percent(part, whole):
        # max(1, ...) keeps an empty scan from dividing by zero.
        return part / max(1, whole) * 100

    def print_listing(title, entries):
        # Print at most `list_limit` entries ordered by (filename, line).
        if not entries:
            return
        print(f"\n=== {title} ({len(entries)}) ===")
        for entry in sorted(entries, key=lambda x: (x[5], x[2]))[:list_limit]:
            print(f" {entry[0]} {entry[1]} at {entry[5]}:{entry[2]}")
        if len(entries) > list_limit:
            print(f" ... and {len(entries) - list_limit} more")

    print("Scanning pdftract-core for public API items...")
    stats, all_items = scan_directory(src_dir)

    examples_percent = percent(stats.with_examples, stats.total_items)
    # Integer ceiling: the number of items that must have examples to reach
    # the target.  Rounding to nearest could report a gap of 0 while the
    # coverage is still below the target.
    needed = (stats.total_items * target_percent + 99) // 100
    gap = max(0, needed - stats.with_examples)

    print("\n=== Documentation Coverage Report ===")
    print(f"Total public items: {stats.total_items}")
    print(f"Documented items: {stats.documented_items} "
          f"({percent(stats.documented_items, stats.total_items):.1f}%)")
    print(f"With examples: {stats.with_examples} ({examples_percent:.1f}%)")
    print(f"\nTarget: {target_percent}% coverage")
    print(f"Current: {examples_percent:.1f}%")
    print(f"Gap: {gap} items need examples")

    # Tuple layout: (item_type, name, line, has_doc, has_example, filename).
    by_type = defaultdict(list)
    for item in all_items:
        by_type[item[0]].append(item)

    print("\n=== Breakdown by type ===")
    for item_type, typed_items in sorted(by_type.items()):
        total = len(typed_items)
        with_ex = sum(1 for entry in typed_items if entry[4])
        print(f"{item_type}: {with_ex}/{total} ({percent(with_ex, total):.0f}%)")

    print_listing(
        "Undocumented items",
        [item for item in all_items if not item[3]],
    )
    print_listing(
        "Documented but without examples",
        [item for item in all_items if item[3] and not item[4]],
    )
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()