pdftract/audit_docs.py
jedarden 225f96c241 fix(pyo3): correct extract_text_fn call in extract_markdown stub
The extract_markdown stub was calling extract_text instead of
extract_text_fn, causing a compilation error. This fixes the
function name to match the exported function from extract_text.rs.

This completes the extract_text PyO3 entry point implementation,
which was already present in extract_text.rs and lib.rs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 20:28:25 -04:00

110 lines
3.7 KiB
Python

#!/usr/bin/env python3
"""
Audit script to find public items in pdftract-core that are missing documentation.
"""
import re
import subprocess
from pathlib import Path
from collections import defaultdict
# (regex, item type) pairs used to spot public Rust items on a single source
# line.  Every regex captures the item's identifier in group 1.
PUBLIC_PATTERNS = [
    # Functions, including qualified forms such as `pub async fn`,
    # `pub const fn`, `pub unsafe fn` and `pub extern "C" fn`.  The qualifiers
    # are skipped so the captured name is the function's identifier rather
    # than the keyword `fn`.
    (r'pub (?:(?:const|async|unsafe|extern(?:\s+"[^"]*")?)\s+)*fn\s+(\w+)', 'function'),
    (r'pub struct (\w+)', 'struct'),
    (r'pub enum (\w+)', 'enum'),
    (r'pub trait (\w+)', 'trait'),
    (r'pub type (\w+)', 'type'),
    # Constants only: `pub const fn ...` declares a function and is matched by
    # the function pattern above, so it is excluded here.
    (r'pub const (?!(?:fn|async|unsafe|extern)\b)(\w+)', 'const'),
    (r'pub mod (\w+)', 'module'),
    # Statics; an optional `mut` is skipped so `pub static mut FOO` is
    # reported as `FOO`.
    (r'pub static (?:mut\s+)?(\w+)', 'other'),
]
def has_doc_comment(lines, line_idx):
    """Report whether the item declared at ``lines[line_idx]`` is documented.

    Scans upward from the line directly above the item.  Blank lines, ordinary
    ``//`` comments and ``#...`` attribute lines are skipped; the scan stops at
    the first line of any other kind.

    Args:
        lines: The lines of the Rust source file.
        line_idx: Zero-based index of the line that declares the item.

    Returns:
        True if a ``///`` doc comment or a ``#[doc = ...]`` attribute is found
        before the scan stops, otherwise False.
    """
    for idx in range(line_idx - 1, -1, -1):
        text = lines[idx].strip()
        if not text:
            continue
        # Exactly three slashes marks an outer doc comment.  Four or more is a
        # plain comment in Rust, and `//!` documents the *enclosing* item, so
        # neither counts as documentation for the item below.
        if text.startswith('///') and not text.startswith('////'):
            return True
        # `#[doc = "..."]` is the attribute form of a doc comment.
        if text.replace(' ', '').startswith('#[doc='):
            return True
        if not text.startswith('//') and not text.startswith('#'):
            # Reached unrelated code: nothing above it belongs to this item.
            break
    return False
def audit_file(filepath, root=None):
    """Collect the public items declared in one Rust source file.

    Each line is tested against every entry of ``PUBLIC_PATTERNS``; a match
    produces one record.  Lines that are themselves comments are ignored so
    that commented-out code or prose mentioning ``pub fn ...`` is not counted.

    Args:
        filepath: ``pathlib.Path`` of the ``.rs`` file to scan.
        root: Directory the reported file name is made relative to.  Defaults
            to the pdftract-core ``src`` directory.

    Returns:
        A list of dicts with the keys ``name``, ``type``, ``has_docs``,
        ``line`` (1-based) and ``file``.
    """
    if root is None:
        root = Path('/home/coding/pdftract/crates/pdftract-core/src')
    try:
        shown_path = str(filepath.relative_to(root))
    except ValueError:
        # The file lives outside `root`; report its path unchanged instead of
        # aborting the whole audit.
        shown_path = str(filepath)

    lines = filepath.read_text(encoding='utf-8').split('\n')
    items = []
    for line_idx, line in enumerate(lines):
        if line.lstrip().startswith('//'):
            continue
        for pattern, item_type in PUBLIC_PATTERNS:
            match = re.search(pattern, line)
            if match is None:
                continue
            items.append({
                'name': match.group(1),
                'type': item_type,
                'has_docs': has_doc_comment(lines, line_idx),
                'line': line_idx + 1,
                'file': shown_path,
            })
    return items
def main():
    """Audit every ``*.rs`` file under the pdftract-core sources and print a report.

    The report contains overall totals, per-type coverage, and a short list of
    undocumented items for each type.

    Returns:
        0 when at least 80% of the detected public items are documented,
        otherwise 1.  An audit that finds no items at all counts as 0%
        coverage and therefore returns 1.
    """
    src_dir = Path('/home/coding/pdftract/crates/pdftract-core/src')
    # Number of undocumented items listed for each item type.
    missing_shown_per_type = 10

    def percent(part, whole):
        # With nothing audited there is no meaningful ratio; report 0 rather
        # than dividing by zero.
        return 100 * part / whole if whole > 0 else 0

    all_items = []
    for rs_file in sorted(src_dir.rglob('*.rs')):
        all_items.extend(audit_file(rs_file))

    # Group by type and coverage
    by_type = defaultdict(lambda: {'total': 0, 'with_docs': 0, 'missing': []})
    for item in all_items:
        bucket = by_type[item['type']]
        bucket['total'] += 1
        if item['has_docs']:
            bucket['with_docs'] += 1
        else:
            bucket['missing'].append(item)

    # Print summary
    print("=" * 60)
    print("PDFTRACT-CORE DOCUMENTATION AUDIT")
    print("=" * 60)
    print()
    total_items = len(all_items)
    total_with_docs = sum(1 for i in all_items if i['has_docs'])
    total_missing = total_items - total_with_docs
    print(f"TOTAL PUBLIC ITEMS: {total_items}")
    print(f"WITH DOCUMENTATION: {total_with_docs} ({percent(total_with_docs, total_items):.1f}%)")
    print(f"MISSING DOCUMENTATION: {total_missing} ({percent(total_missing, total_items):.1f}%)")
    print()
    print("BY TYPE:")
    print("-" * 40)
    for item_type, data in sorted(by_type.items()):
        type_coverage = percent(data['with_docs'], data['total'])
        print(f"{item_type:12}: {data['with_docs']:4}/{data['total']:<4} ({type_coverage:5.1f}%)")
    print()

    # Print top missing items
    if any(data['missing'] for data in by_type.values()):
        print(f"TOP ITEMS MISSING DOCS (first {missing_shown_per_type} per type):")
        print("-" * 40)
        for item_type in sorted(by_type.keys()):
            for item in by_type[item_type]['missing'][:missing_shown_per_type]:
                print(f" [{item_type}] {item['name']} at {item['file']}:{item['line']}")
        print()
    print("=" * 60)

    # Return exit code based on 80% threshold
    coverage = percent(total_with_docs, total_items)
    if coverage >= 80:
        print(f"✓ PASS: {coverage:.1f}% coverage meets 80% threshold")
        return 0
    else:
        print(f"✗ FAIL: {coverage:.1f}% coverage below 80% threshold")
        return 1
# When run as a script, execute the audit and pass main()'s return value to the
# interpreter as the process exit status.  SystemExit is raised directly rather
# than calling exit(), which is a helper injected by the site module and may be
# unavailable (for example under ``python -S``).
if __name__ == '__main__':
    raise SystemExit(main())