The extract_markdown stub was calling extract_text instead of extract_text_fn, causing a compilation error. This fixes the function name to match the exported function from extract_text.rs. This completes the extract_text PyO3 entry point implementation, which was already present in extract_text.rs and lib.rs. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
110 lines
3.7 KiB
Python
110 lines
3.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Audit script to find public items in pdftract-core that are missing documentation.
|
|
"""
|
|
import re
|
|
import subprocess
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
|
|
# Shared prefix for every pattern below.  It anchors the match at the start
# of the line (after indentation and any same-line attributes) so that text
# such as `// pub fn old()` inside a comment is not counted as an item, and it
# requires whitespace right after `pub`, which keeps restricted visibility
# like `pub(crate)` out of the audit.
_PUB_PREFIX = r'^\s*(?:#\[[^\]]*\]\s*)*pub\s+'

# (regex, item kind) pairs.  Group 1 of each regex is the item's name.
PUBLIC_PATTERNS = [
    # Function qualifiers (`const`, `async`, `unsafe`, `extern "ABI"`) sit
    # between `pub` and `fn`; skip them so the captured name is the function's.
    (
        _PUB_PREFIX
        + r'(?:(?:const|async|unsafe|extern(?:\s+"[^"]*")?)\s+)*fn\s+(\w+)',
        'function',
    ),
    (_PUB_PREFIX + r'struct\s+(\w+)', 'struct'),
    (_PUB_PREFIX + r'enum\s+(\w+)', 'enum'),
    (_PUB_PREFIX + r'(?:unsafe\s+)?trait\s+(\w+)', 'trait'),
    (_PUB_PREFIX + r'type\s+(\w+)', 'type'),
    # `pub const fn ...` is a function, not a constant named `fn`.
    (
        _PUB_PREFIX + r'const\s+(?!(?:fn|async|unsafe|extern)\b)(\w+)',
        'const',
    ),
    (_PUB_PREFIX + r'mod\s+(\w+)', 'module'),
    # Statics keep the historical 'other' label; `mut` is not the name.
    (_PUB_PREFIX + r'static\s+(?:mut\s+)?(\w+)', 'other'),
]
|
|
|
|
def has_doc_comment(lines: list, line_idx: int) -> bool:
    """Report whether the Rust item on ``lines[line_idx]`` has outer docs.

    Scans upward from the line above the item, stepping over blank lines,
    ordinary comments and attributes, and stops at the first line of code.

    Counted as documentation for the item:
      * ``///`` line doc comments (``////`` and longer are ordinary comments),
      * ``/** ... */`` block doc comments,
      * ``#[doc = "..."]`` attributes.

    Inner docs (``//!`` and ``/*! ... */``) describe the enclosing module or
    crate rather than the item that follows, so they are skipped like any
    other comment instead of being counted.

    Args:
        lines: The source file split into lines.
        line_idx: Zero-based index of the line that declares the item.

    Returns:
        True if documentation is attached to the item, False otherwise.
    """
    idx = line_idx - 1
    while idx >= 0:
        text = lines[idx].strip()

        if text.startswith('///') and not text.startswith('////'):
            return True
        # Attribute form of a doc comment; tolerate spacing around `=`.
        if text.replace(' ', '').startswith('#[doc='):
            return True

        # Blank lines, `//` comments and single-line attributes may sit
        # between the docs and the item.
        if not text or text.startswith('//') or text.startswith('#'):
            idx -= 1
            continue

        if text.endswith('*/'):
            # End of a block comment: walk up to the line that opens it.
            start = idx
            while start >= 0 and '/*' not in lines[start]:
                start -= 1
            if start < 0:
                return False
            opener = lines[start].strip()
            if not opener.startswith('/*'):
                # Code with a trailing comment; the scan has reached code.
                return False
            if opener.startswith('/**') and not opener.startswith(('/***', '/**/')):
                return True
            idx = start - 1
            continue

        if text.endswith(']'):
            # Possibly the last line of an attribute wrapped over several
            # lines, e.g. a long `#[derive(...)]`.  Balance the brackets
            # upward to find where it starts.
            depth = 0
            start = idx
            while start >= 0:
                depth += lines[start].count(']') - lines[start].count('[')
                if depth <= 0:
                    break
                start -= 1
            if start < 0 or not lines[start].strip().startswith('#'):
                return False
            idx = start - 1
            continue

        # Anything else is code, so no docs are attached to the item.
        return False
    return False
|
|
|
|
def audit_file(filepath, src_root=None):
    """Audit a single Rust file for public items and their documentation.

    Every line is tested against each entry of ``PUBLIC_PATTERNS``; each hit
    produces one record, with ``has_doc_comment`` deciding whether the item
    is documented.

    Args:
        filepath: ``pathlib.Path`` of the Rust source file to scan.
        src_root: Directory that reported file names are made relative to.
            Defaults to the pdftract-core ``src`` directory.

    Returns:
        A list of dicts with the keys ``name``, ``type``, ``has_docs``,
        ``line`` (1-based) and ``file``.
    """
    if src_root is None:
        src_root = '/home/coding/pdftract/crates/pdftract-core/src'

    # The display path is the same for every item in the file, so compute it
    # once.  A file outside `src_root` is reported by its full path instead
    # of aborting the audit with a ValueError.
    try:
        display_path = str(filepath.relative_to(src_root))
    except ValueError:
        display_path = str(filepath)

    lines = filepath.read_text(encoding='utf-8').split('\n')

    items = []
    for line_idx, line in enumerate(lines):
        for pattern, item_type in PUBLIC_PATTERNS:
            match = re.search(pattern, line)
            if match is None:
                continue
            items.append({
                'name': match.group(1),
                'type': item_type,
                'has_docs': has_doc_comment(lines, line_idx),
                'line': line_idx + 1,
                'file': display_path,
            })
    return items
|
|
|
|
def main():
|
|
src_dir = Path('/home/coding/pdftract/crates/pdftract-core/src')
|
|
|
|
all_items = []
|
|
for rs_file in sorted(src_dir.rglob('*.rs')):
|
|
all_items.extend(audit_file(rs_file))
|
|
|
|
# Group by type and coverage
|
|
by_type = defaultdict(lambda: {'total': 0, 'with_docs': 0, 'missing': []})
|
|
for item in all_items:
|
|
by_type[item['type']]['total'] += 1
|
|
if item['has_docs']:
|
|
by_type[item['type']]['with_docs'] += 1
|
|
else:
|
|
by_type[item['type']]['missing'].append(item)
|
|
|
|
# Print summary
|
|
print("=" * 60)
|
|
print("PDFTRACT-CORE DOCUMENTATION AUDIT")
|
|
print("=" * 60)
|
|
print()
|
|
|
|
total_items = len(all_items)
|
|
total_with_docs = sum(1 for i in all_items if i['has_docs'])
|
|
|
|
print(f"TOTAL PUBLIC ITEMS: {total_items}")
|
|
print(f"WITH DOCUMENTATION: {total_with_docs} ({100 * total_with_docs / total_items:.1f}%)")
|
|
print(f"MISSING DOCUMENTATION: {total_items - total_with_docs} ({100 * (total_items - total_with_docs) / total_items:.1f}%)")
|
|
print()
|
|
|
|
print("BY TYPE:")
|
|
print("-" * 40)
|
|
for item_type, data in sorted(by_type.items()):
|
|
coverage = 100 * data['with_docs'] / data['total'] if data['total'] > 0 else 0
|
|
print(f"{item_type:12}: {data['with_docs']:4}/{data['total']:<4} ({coverage:5.1f}%)")
|
|
print()
|
|
|
|
# Print top missing items
|
|
if any(by_type[t]['missing'] for t in by_type):
|
|
print("TOP ITEMS MISSING DOCS (first 20 by type):")
|
|
print("-" * 40)
|
|
for item_type in sorted(by_type.keys()):
|
|
missing = by_type[item_type]['missing'][:10]
|
|
for item in missing:
|
|
print(f" [{item_type}] {item['name']} at {item['file']}:{item['line']}")
|
|
|
|
print()
|
|
print("=" * 60)
|
|
|
|
# Return exit code based on 80% threshold
|
|
coverage = 100 * total_with_docs / total_items if total_items > 0 else 0
|
|
if coverage >= 80:
|
|
print(f"✓ PASS: {coverage:.1f}% coverage meets 80% threshold")
|
|
return 0
|
|
else:
|
|
print(f"✗ FAIL: {coverage:.1f}% coverage below 80% threshold")
|
|
return 1
|
|
|
|
if __name__ == '__main__':
    # Raise SystemExit directly: the bare `exit` helper is added by the `site`
    # module for interactive sessions and is missing under `python -S`.
    raise SystemExit(main())
|