pdftract/crates/pdftract-core/doc_coverage.py
jedarden d0f52751ce fix(pdftract-39gey): fix indent trigger to not split drop-cap paragraphs
The indent trigger was using .abs() which fired on both increased indent
(non-indented → indented) AND decreased indent (indented → non-indented).
This caused drop-cap style paragraphs (indented first line, flush-left
continuation) to incorrectly split into two blocks.

Per plan Phase 4.4 heuristic #2, indent change should only trigger when the
current line is MORE indented (to the right, larger x0) than the block
average - i.e., a new paragraph starting after non-indented text. It should
NOT trigger for decreased indent (first line indented, rest flush-left).

Fix: Remove .abs() and only check if line_x0 - block_avg_x0 > threshold.

Tests:
- test_indented_first_line_new_block: PASS (non-indented → indented splits)
- test_indented_first_line_of_paragraph_not_split: PASS (drop cap stays together)
- All 179 line module tests: PASS
2026-06-07 13:43:19 -04:00

244 lines
8.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Measure rustdoc coverage for pdftract-core.
This script scans all .rs files and counts:
- Public items (pub fn/struct/enum/trait/type/mod/const/static/use, plus inherent impl blocks)
- Items with documentation (///, //!, /** or /*! comments)
- Items with worked examples (```rust blocks in doc comments)
"""
import os
import re
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List
@dataclass
class FileStats:
    """Coverage tallies collected for one scanned source file.

    The three counters are derived from ``items``: how many entries there
    are, how many are flagged as documented, and how many are flagged as
    carrying an example.
    """

    # Path of the file, as reported in the summary output.
    path: str
    # Number of items found in the file.
    pub_items: int
    # Number of those items that have documentation.
    with_doc: int
    # Number of those items whose documentation contains an example.
    with_example: int
    # Per-item records (one dict per detected item).
    items: List[Dict]
# Qualifiers that may sit between `pub` and `fn` (e.g. `pub const unsafe fn`).
_FN_QUALIFIERS = r'(?:(?:const|async|unsafe|extern(?:\s+"[^"]*")?)\s+)*'

# (compiled regex, kind) pairs tried in order; the first match on a line wins.
# The `fn` pattern must come before the `const` pattern so that
# `pub const fn name` is classified as a function, not a constant named "fn".
_ITEM_PATTERNS = [
    (re.compile(r'\bpub\s+' + _FN_QUALIFIERS + r'fn\s+(\w+)'), 'fn'),
    (re.compile(r'\bpub\s+struct\s+(\w+)'), 'struct'),
    (re.compile(r'\bpub\s+enum\s+(\w+)'), 'enum'),
    (re.compile(r'\bpub\s+(?:unsafe\s+)?trait\s+(\w+)'), 'trait'),
    (re.compile(r'\bpub\s+type\s+(\w+)'), 'type'),
    (re.compile(r'\bpub\s+mod\s+(\w+)'), 'mod'),
    (re.compile(r'\bpub\s+(?:const|static)\s+(?:mut\s+)?(\w+)'), 'const'),
    # Captures the whole use-path; the exported name is derived afterwards.
    (re.compile(r'\bpub\s+use\s+([^;]+)'), 'use'),
    # Anchored at line start so `-> impl Trait {` in a signature is ignored.
    (re.compile(r'^\s*impl\s+(\w+)\s*\{'), 'impl'),
]


def _use_item_name(path_text):
    """Return the name a `pub use` path exports: its alias, else its last word."""
    path_text = path_text.split('//')[0].strip()  # drop a trailing line comment
    alias = re.search(r'\bas\s+(\w+)\s*$', path_text)
    if alias:
        return alias.group(1)
    words = re.findall(r'\w+', path_text)
    return words[-1] if words else ''


def _block_doc_info(lines, end_index):
    """Inspect the block comment that ends (or opens) on ``lines[end_index]``.

    Walks backward to the nearest line containing ``/*`` and returns
    ``(is_doc, has_example)``: ``is_doc`` is True when that line starts with
    ``/**`` or ``/*!``; ``has_example`` is True when any line of the comment
    contains a code fence.
    """
    start = end_index
    while start >= 0 and '/*' not in lines[start]:
        start -= 1
    if start < 0:
        return False, False
    # A comment that opens after code on the same line is not an item doc.
    if not lines[start].strip().startswith(('/**', '/*!')):
        return False, False
    has_example = any('```' in lines[k] for k in range(start, end_index + 1))
    return True, has_example


def _collect_docs(lines, item_index):
    """Look upward from an item for its doc comments.

    Returns ``(has_doc, has_example, doc_lines)`` where ``doc_lines`` holds the
    text of the ``///`` / ``//!`` lines found, in source order.
    """
    has_doc = False
    has_example = False
    collected = []  # gathered nearest-first, reversed before returning
    j = item_index - 1
    while j >= 0:
        prev = lines[j].strip()
        if prev.startswith(('///', '//!')):
            has_doc = True
            collected.append(prev[3:])
            if '```' in prev:
                has_example = True
        elif not prev or prev.startswith('//') or prev.startswith(('#[', '#![')):
            # Blank lines, plain comments and single-line attributes such as
            # `#[derive(Debug)]` may sit between the docs and the item.
            pass
        elif prev.endswith('*/') or prev.startswith(('/**', '/*!')):
            block_doc, block_example = _block_doc_info(lines, j)
            has_doc = has_doc or block_doc
            has_example = has_example or block_example
            break
        else:
            # Any other code line ends the search.
            break
        j -= 1
    collected.reverse()
    return has_doc, has_example, collected


def extract_public_items(content: str, filepath: str) -> List[Dict]:
    """Extract public items from Rust source code.

    The scan is line-based and regex-driven (it is not a Rust parser): each
    non-blank line that does not start with ``//`` is tested against the item
    patterns, and at most one item is recorded per line.

    Args:
        content: Full text of a Rust source file.
        filepath: Path of the file; accepted for interface compatibility and
            not used by the extraction itself.

    Returns:
        A list of dicts with keys ``kind``, ``name``, ``line`` (1-based),
        ``has_doc``, ``has_example`` and ``doc_lines``.

    Known limitation: attributes spanning several lines between a doc comment
    and its item still hide the documentation from the look-back.
    """
    lines = content.split('\n')
    items = []
    for index, line in enumerate(lines):
        stripped = line.strip()
        # Comment-only and empty lines cannot declare an item.
        if not stripped or stripped.startswith('//'):
            continue
        for pattern, kind in _ITEM_PATTERNS:
            match = pattern.search(line)
            if not match:
                continue
            if kind == 'use':
                name = _use_item_name(match.group(1))
            else:
                name = match.group(1)
            if not name:
                continue
            has_doc, has_example, doc_lines = _collect_docs(lines, index)
            items.append({
                'kind': kind,
                'name': name,
                'line': index + 1,
                'has_doc': has_doc,
                'has_example': has_example,
                'doc_lines': doc_lines,
            })
            break
    return items
def scan_directory(src_dir: Path) -> Dict[str, FileStats]:
    """Scan all .rs files under ``src_dir`` and tally their items.

    Args:
        src_dir: Root directory that is searched recursively for ``*.rs`` files.

    Returns:
        A dict mapping each file's path (relative to ``src_dir.parent``) to
        its ``FileStats``. Files in which no items are found are omitted, as
        are files that cannot be read (a warning is printed for those).
    """
    stats = {}
    # Sorted so the report order does not depend on filesystem enumeration.
    for rs_file in sorted(src_dir.rglob('*.rs')):
        # Skip tests and benchmarks directories. Only directories *below*
        # src_dir are considered, so a checkout that itself lives under a
        # directory named "tests" is still scanned.
        inner_parts = rs_file.relative_to(src_dir).parts
        if 'tests' in inner_parts or 'benches' in inner_parts:
            continue
        try:
            # errors='ignore' drops undecodable bytes, so only OS-level
            # failures (permissions, vanished file, ...) can occur here.
            with open(rs_file, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except OSError as e:
            print(f"Warning: Could not read {rs_file}: {e}")
            continue
        relative_path = str(rs_file.relative_to(src_dir.parent))
        items = extract_public_items(content, str(rs_file))
        if not items:
            continue
        stats[relative_path] = FileStats(
            path=relative_path,
            pub_items=len(items),
            with_doc=sum(1 for it in items if it['has_doc']),
            with_example=sum(1 for it in items if it['has_example']),
            items=items,
        )
    return stats
def print_summary(stats: Dict[str, FileStats]):
    """Write the coverage report for ``stats`` to standard output.

    The report consists of overall totals, the ten files with the most items
    lacking examples, up to ten files with no documented items, the first
    twenty undocumented items, and the first thirty fn/struct/enum/trait
    items that have no example. The last three sections are printed only
    when they are non-empty.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70
    entries = list(stats.items())

    item_total = sum(fs.pub_items for _, fs in entries)
    doc_total = sum(fs.with_doc for _, fs in entries)
    example_total = sum(fs.with_example for _, fs in entries)

    def percent(part):
        # Guard against an empty scan: report 0 rather than dividing by zero.
        return (part / item_total * 100) if item_total > 0 else 0

    def missing_examples(entry):
        _, fs = entry
        return (fs.pub_items - fs.with_example) if fs.pub_items > 0 else 0

    def show_items(pairs, limit):
        for rank, (path, item) in enumerate(pairs[:limit], start=1):
            print(f"{rank:2d}. {path:45s} {item['kind']:8s} {item['name']}")

    print(heavy_rule)
    print("RUSTDOC COVERAGE SUMMARY")
    print(heavy_rule)
    print(f"\nTotal public items: {item_total}")
    print(f"With documentation: {doc_total} ({percent(doc_total):.1f}%)")
    print(f"With examples: {example_total} ({percent(example_total):.1f}%)")
    print()

    # Files ordered by how many of their items still lack an example.
    print("Files with lowest example coverage (top 10):")
    print(light_rule)
    ranked = sorted(entries, key=missing_examples, reverse=True)
    for rank, (path, fs) in enumerate(ranked[:10], start=1):
        if fs.pub_items <= 0:
            continue
        ratio = fs.with_example / fs.pub_items * 100
        print(f"{rank:2d}. {path:50s} {fs.with_example:3d}/{fs.pub_items:3d} ({ratio:5.1f}%)")
    print()

    # Files in which not a single item is documented.
    undocumented_files = [
        (path, fs) for path, fs in entries
        if fs.with_doc == 0 and fs.pub_items > 0
    ]
    if undocumented_files:
        print("Files with NO documentation:")
        print(light_rule)
        for path, fs in undocumented_files[:10]:
            print(f" {path}: {fs.pub_items} undocumented items")
        print()

    # Individual items that have no documentation.
    undocumented = [
        (path, item)
        for path, fs in entries
        for item in fs.items
        if not item['has_doc']
    ]
    if undocumented:
        print(f"Undocumented items (showing first 20 of {len(undocumented)}):")
        print(light_rule)
        show_items(undocumented, 20)
        print()

    # Items of the kinds that are expected to carry an example but do not.
    example_kinds = ('fn', 'struct', 'enum', 'trait')
    lacking_example = [
        (path, item)
        for path, fs in entries
        for item in fs.items
        if not item['has_example'] and item['kind'] in example_kinds
    ]
    if lacking_example:
        print(f"Items without examples (showing first 30 of {len(lacking_example)}):")
        print(light_rule)
        show_items(lacking_example, 30)
        print()
def main():
    """Scan the ``src`` directory next to this script and report coverage.

    Returns:
        0 when example coverage is at least 80%, otherwise 1. Also returns 1
        when the ``src`` directory does not exist.
    """
    source_root = Path(__file__).parent / 'src'
    if not source_root.exists():
        print(f"Error: Source directory not found: {source_root}")
        return 1

    print(f"Scanning {source_root}...")
    report = scan_directory(source_root)
    print_summary(report)

    # Return non-zero if example coverage < 80%
    item_total = 0
    example_total = 0
    for file_stats in report.values():
        item_total += file_stats.pub_items
        example_total += file_stats.with_example
    coverage = (example_total / item_total * 100) if item_total > 0 else 0

    print("=" * 70)
    if coverage < 80:
        print(f"✗ FAIL: Example coverage {coverage:.1f}% < 80%")
        return 1
    print(f"✓ PASS: Example coverage {coverage:.1f}% >= 80%")
    return 0
# Script entry point: propagate main()'s return value as the process exit
# status. SystemExit is raised directly because the bare `exit()` helper is
# injected by the `site` module for interactive use and is not guaranteed to
# exist (e.g. under `python -S`).
if __name__ == '__main__':
    raise SystemExit(main())