pdftract/scripts/doc_example_coverage.py
jedarden d0f52751ce fix(pdftract-39gey): fix indent trigger to not split drop-cap paragraphs
The indent trigger was using .abs() which fired on both increased indent
(non-indented → indented) AND decreased indent (indented → non-indented).
This caused drop-cap style paragraphs (indented first line, flush-left
continuation) to incorrectly split into two blocks.

Per plan Phase 4.4 heuristic #2, indent change should only trigger when the
current line is MORE indented (to the right, larger x0) than the block
average - i.e., a new paragraph starting after non-indented text. It should
NOT trigger for decreased indent (first line indented, rest flush-left).

Fix: Remove .abs() and only check if line_x0 - block_avg_x0 > threshold.

Tests:
- test_indented_first_line_new_block: PASS (non-indented → indented splits)
- test_indented_first_line_of_paragraph_not_split: PASS (drop cap stays together)
- All 179 line module tests: PASS
2026-06-07 13:43:19 -04:00

183 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""
Measure rustdoc example coverage for pdftract-core.
Counts public items and determines how many have at least one worked example.
"""
import os
import re
import subprocess
import json
from pathlib import Path
from collections import defaultdict
# Patterns to detect public items.  Every regex captures the item name in
# group 1.  Restricted visibility such as `pub(crate)` never matches because
# `pub` must be followed by whitespace.
PUBLIC_PATTERNS = {
    # Function qualifiers may sit between `pub` and `fn`
    # (`pub const fn`, `pub async fn`, `pub unsafe extern "C" fn`, ...).
    'fn': re.compile(
        r'\bpub\s+(?:(?:const|async|unsafe|extern(?:\s+"[^"]*")?)\s+)*fn\s+(\w+)'
    ),
    'struct': re.compile(r'pub\s+struct\s+(\w+)'),
    'enum': re.compile(r'pub\s+enum\s+(\w+)'),
    'trait': re.compile(r'pub\s+trait\s+(\w+)'),
    'mod': re.compile(r'pub\s+mod\s+(\w+)'),
    'type': re.compile(r'pub\s+type\s+(\w+)'),
    # The lookahead stops `pub const fn name` from being recorded as a
    # constant called `fn`; `mut` is skipped so `pub static mut X` yields `X`.
    'const': re.compile(
        r'\bpub\s+(?:const|static)\s+(?:mut\s+)?(?!(?:fn|unsafe|async|extern)\b)(\w+)'
    ),
    # Inherent impl blocks (`impl Type {`, `impl<T> Type<T> {`).  Anchored to
    # the start of a line so `-> impl Trait {` in a signature is not mistaken
    # for an impl block.  `impl Trait for Type` is not matched.
    'impl': re.compile(
        r'^[ \t]*impl\b\s*(?:<[^>]*>)?\s*(\w+)\s*(?:<[^>]*>)?\s*\{',
        re.MULTILINE,
    ),
}
# Pattern to detect doc code blocks: a fence opened with ```rust (anything
# may follow on the opening line, e.g. `,no_run`) and closed by the next
# fence.  The body may not contain a backtick.
# NOTE(review): untagged fences are not counted, although rustdoc also treats
# them as Rust code - confirm this is the intended definition of "example".
EXAMPLE_PATTERN = re.compile(r'```rust[^`]*```', re.MULTILINE)
DOC_COMMENT_PATTERN = re.compile(r'///[^\n]*|//![^\n]*')


def has_item_with_examples(content, item_name, item_type):
    """Check if a public item has at least one worked example.

    The first line of `content` that declares a public item named exactly
    `item_name` is located.  The `///` and `//!` lines directly above it are
    then gathered and tested against EXAMPLE_PATTERN.

    Args:
        content: Full text of a Rust source file.
        item_name: Identifier of the item to look up.
        item_type: Category key from PUBLIC_PATTERNS.  Not consulted; kept so
            existing callers do not need to change.

    Returns:
        True when the collected doc comment contains a fenced ```rust block,
        False when it does not or when no declaration line is found.
    """
    lines = content.split('\n')

    # The name must directly follow the declaring keyword as a whole word, so
    # `new` does not resolve to `new_with_config`, and a type that is merely
    # mentioned in another item's signature is not mistaken for its own
    # declaration.  Qualified functions (`pub async fn`) are accepted too.
    declaration = re.compile(
        r'\bpub\s+(?:(?:const|async|unsafe|extern(?:\s+"[^"]*")?)\s+)*'
        r'(?:fn|struct|enum|trait|mod|type|const|static)\s+(?:mut\s+)?'
        + re.escape(item_name) + r'\b'
    )
    item_line = next(
        (index for index, line in enumerate(lines) if declaration.search(line)),
        None,
    )
    if item_line is None:
        return False

    # Walk upwards over at most 49 lines.  The stop bound is -1 rather than 0
    # so that the very first line of the file is inspected as well.
    doc_lines = []
    for index in range(item_line - 1, max(-1, item_line - 50), -1):
        line = lines[index].strip()
        if line.startswith(('///', '//!')):
            doc_lines.append(line)
        elif line and not line.startswith(('//', '#[')):
            # Stop at the first line that is not blank, a comment or an
            # attribute: the doc comment cannot extend past it.
            break
    doc_lines.reverse()

    return bool(EXAMPLE_PATTERN.search('\n'.join(doc_lines)))
def find_public_items_in_file(filepath):
    """Find all public items in a Rust source file.

    Args:
        filepath: Path object or path string of the file to scan.

    Returns:
        A tuple ``(items, content)``.  ``items`` is a list of
        ``(item_type, item_name, offset)`` triples, grouped in the iteration
        order of PUBLIC_PATTERNS, where ``offset`` is the character position
        of the match in ``content``.  ``content`` is the decoded file text.

    Raises:
        OSError: The file cannot be read.
        UnicodeDecodeError: The file is not valid UTF-8.
    """
    # Rust source files are UTF-8; decoding with the platform default would
    # fail or mangle non-ASCII text on systems with another locale encoding.
    content = Path(filepath).read_text(encoding='utf-8')

    items = []
    for item_type, pattern in PUBLIC_PATTERNS.items():
        for match in pattern.finditer(content):
            item_name = match.group(1)
            # Names with a leading underscore (including the `_` placeholder)
            # are not reported.
            if item_name.startswith('_'):
                continue
            items.append((item_type, item_name, match.start()))
    return items, content
def scan_crate(src_path):
    """Scan the crate for public items and example coverage.

    Args:
        src_path: Directory that is searched recursively for ``*.rs`` files.

    Returns:
        A dict with these keys:

        * ``total_items`` / ``items_with_examples``: counts over all files.
        * ``by_type``: defaultdict mapping an item type to
          ``{'total': int, 'with_examples': int}``.
        * ``files``: dict keyed by the file path relative to the directory two
          levels above ``src_path``; each value holds ``total``,
          ``with_examples`` and an ``items`` list of
          ``{'name', 'type', 'has_examples'}`` dicts.  Files without any
          matched item are omitted.

    A file that raises while being processed is reported on stdout and
    skipped; the scan continues with the next file.
    """
    src_path = Path(src_path)
    results = {
        'total_items': 0,
        'items_with_examples': 0,
        'by_type': defaultdict(lambda: {'total': 0, 'with_examples': 0}),
        'files': {}
    }

    for rs_file in src_path.rglob('*.rs'):
        # Skip build.rs and tests.  The decision is made on the path relative
        # to the scanned directory, so the location of the checkout itself
        # (for example somewhere below a `.../tests/` directory) cannot
        # exclude every file.
        parent_dirs = rs_file.relative_to(src_path).parts[:-1]
        in_tests_dir = any(part.endswith('tests') for part in parent_dirs)
        if rs_file.name == 'build.rs' or in_tests_dir:
            continue

        try:
            items, content = find_public_items_in_file(rs_file)
            if not items:
                continue

            # `total` is fixed here and must not be incremented again per
            # item, otherwise every file reports twice its real item count.
            file_results = {
                'total': len(items),
                'with_examples': 0,
                'items': []
            }
            for item_type, item_name, _ in items:
                results['total_items'] += 1
                results['by_type'][item_type]['total'] += 1

                has_examples = has_item_with_examples(content, item_name, item_type)
                file_results['items'].append({
                    'name': item_name,
                    'type': item_type,
                    'has_examples': has_examples
                })
                if has_examples:
                    results['items_with_examples'] += 1
                    results['by_type'][item_type]['with_examples'] += 1
                    file_results['with_examples'] += 1

            results['files'][str(rs_file.relative_to(src_path.parent.parent))] = file_results
        except Exception as e:
            # Best effort: one unreadable or odd file must not abort the report.
            print(f"Error processing {rs_file}: {e}", flush=True)
    return results
def main():
    """Print the coverage report and write a JSON summary.

    Scans the pdftract-core source directory, prints overall, per-type and
    per-file coverage to stdout, and writes the headline numbers to
    ``/tmp/doc_example_coverage.json``.

    Returns:
        0 when overall coverage reaches the threshold, otherwise 1, so the
        value can be used directly as a process exit status.
    """
    threshold = 80  # required coverage in percent

    # NOTE(review): absolute path ties the script to one checkout location.
    pdftract_core = Path('/home/coding/pdftract/crates/pdftract-core/src')
    results = scan_crate(pdftract_core)

    total_items = results['total_items']
    covered_items = results['items_with_examples']
    coverage = (covered_items / total_items * 100) if total_items > 0 else 0
    passed = coverage >= threshold

    print("=" * 60)
    print("Rustdoc Example Coverage Report for pdftract-core")
    print("=" * 60)
    print(f"\nTotal public items: {total_items}")
    print(f"Items with examples: {covered_items}")
    print(f"Coverage: {coverage:.1f}%")
    print(f"\nTarget: {threshold}%")
    print(f"Status: {'✓ PASS' if passed else '✗ FAIL'}")

    print("\n" + "=" * 60)
    print("Coverage by Type")
    print("=" * 60)
    for item_type, counts in sorted(results['by_type'].items()):
        total = counts['total']
        with_ex = counts['with_examples']
        cov = (with_ex / total * 100) if total > 0 else 0
        # Same marks as the overall status line; the previous expression had
        # two empty branches and therefore never printed a marker.
        marker = '✓' if cov >= threshold else '✗'
        print(f"{item_type:12} {with_ex:4}/{total:4} ({cov:5.1f}%) {marker}")

    # Show files that need work
    print("\n" + "=" * 60)
    print("Files Needing Examples (showing items without examples)")
    print("=" * 60)
    for file_path, file_results in sorted(results['files'].items()):
        file_total = file_results['total']
        file_cov = (file_results['with_examples'] / file_total * 100) if file_total > 0 else 0
        missing = [item for item in file_results['items'] if not item['has_examples']]
        if missing and file_cov < threshold:
            print(f"\n{file_path} ({file_cov:.0f}% coverage)")
            for item in sorted(missing, key=lambda x: (x['type'], x['name'])):
                print(f" - {item['type']:8} {item['name']}")
    print("\n" + "=" * 60)

    # Output JSON for scripts
    output_json = {
        'coverage': coverage,
        'total_items': total_items,
        'items_with_examples': covered_items,
        'pass': passed
    }
    json_path = Path('/tmp/doc_example_coverage.json')
    json_path.write_text(json.dumps(output_json, indent=2))
    return 0 if passed else 1
if __name__ == '__main__':
    # `exit` is an interactive helper installed by the `site` module and is
    # missing under `python -S`; raising SystemExit always works and passes
    # main()'s return value on as the process exit status.
    raise SystemExit(main())