pdftract/scripts/doc_analysis.py
jedarden 246befd8d1 feat(pdftract-2m3gl): implement PHP SDK with Packagist publishing
- Add jedarden/pdftract Composer package (sdk/php/)
- Implement Client.php with proc_open subprocess execution
- Add PSR-3 LoggerInterface integration (defaults to NullLogger)
- Add 9 contract methods: extract, extractText, extractMarkdown, extractStream, search, getMetadata, hash, classify, verifyReceipt
- Add readonly model classes: Document, Page, Metadata, Fingerprint, Classification, Match, Receipt
- Add exception classes: PdftractException base + 8 subclasses
- Add PHPUnit conformance test suite
- Add phpunit.xml configuration
- Add composer.json with jedarden/pdftract package name
- Add .ci/argo-workflows/pdftract-php-publish.yaml (Packagist auto-discovery from git tags)

Also includes Ruby SDK scaffold from parallel workflow.

Closes pdftract-2m3gl
2026-06-01 10:27:03 -04:00

176 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""Analyze rustdoc coverage for pdftract-core public API."""
import os
import re
from pathlib import Path
from collections import defaultdict
# Matches a public item declaration.  Qualifiers such as `const`, `async`,
# `unsafe` and `extern "C"` may sit between `pub` and the item keyword; the
# qualifier group is greedy so `pub const fn new` is a `fn` named `new`, while
# `pub const MAX` still backtracks to a `const` named `MAX`.
_PUB_ITEM_RE = re.compile(
    r'^\s*pub\s+'
    r'(?:(?:const|async|unsafe|extern(?:\s+"[^"]*")?)\s+)*'
    r'(fn|struct|enum|trait|type|const|static|mod)\s+(?:mut\s+)?(\w+)'
)

# `#[doc = "..."]` is the attribute form of a doc comment.
_DOC_ATTR_RE = re.compile(r'^#\[\s*doc\s*=')

# Info-string tags that mark a fenced block in a doc comment as Rust code.
_RUST_FENCE_TAGS = frozenset(
    {'rust', 'no_run', 'ignore', 'should_panic', 'compile_fail'}
)


def _preceding_docs(lines, item_index):
    """Collect the outer doc comment attached to the item at *item_index*.

    Walks upwards from the item, skipping blank lines, plain `//` comments
    and single-line `#[...]` attributes, and stops at the first line of other
    code.  `//!` lines are treated as plain comments because they document
    the enclosing module, not the item that follows them.

    Returns:
        A `(has_doc, doc_text)` tuple where `doc_text` holds the text of each
        `///` line (prefix removed) in source order.
    """
    has_doc = False
    doc_text = []
    for j in range(item_index - 1, -1, -1):
        prev = lines[j].strip()
        if prev.startswith('///') and not prev.startswith('////'):
            # Exactly three slashes: four or more is an ordinary comment.
            has_doc = True
            doc_text.append(prev[3:])
        elif _DOC_ATTR_RE.match(prev):
            has_doc = True
        elif not prev or prev.startswith('//') or prev.startswith('#['):
            continue
        else:
            break
    doc_text.reverse()
    return has_doc, doc_text


def _has_rust_example(doc_text):
    """Return True if *doc_text* opens at least one Rust code fence.

    A fence with no info string counts as Rust, as does one carrying any tag
    from `_RUST_FENCE_TAGS` or an `edition...` tag.  Fences such as
    "```text" do not count.
    """
    in_fence = False
    for text in doc_text:
        stripped = text.strip()
        if not stripped.startswith('```'):
            continue
        if in_fence:
            # This fence closes the block opened earlier.
            in_fence = False
            continue
        in_fence = True
        tags = [tag for tag in re.split(r'[,\s]+', stripped[3:]) if tag]
        if not tags:
            return True
        if any(tag in _RUST_FENCE_TAGS or tag.startswith('edition') for tag in tags):
            return True
    return False


def extract_items_with_docs(file_path):
    """Extract public items and their documentation status from a Rust file.

    The scan is line based: an item is any line that starts with `pub`
    followed (after optional qualifiers) by one of `fn`, `struct`, `enum`,
    `trait`, `type`, `const`, `static` or `mod`.  Restricted visibility such
    as `pub(crate)` is not matched.  Block doc comments (`/** ... */`) and
    attributes spanning several lines are not recognised.

    Args:
        file_path: Path-like object exposing `read_text()`.

    Returns:
        A list of dicts, one per item in source order, with the keys
        `kind`, `name`, `has_doc`, `has_example` and `line` (1-based).
    """
    # Rust source files are UTF-8; do not depend on the locale encoding.
    content = file_path.read_text(encoding='utf-8')
    lines = content.split('\n')
    items = []
    for index, line in enumerate(lines):
        found = _PUB_ITEM_RE.match(line)
        if found is None:
            continue
        has_doc, doc_text = _preceding_docs(lines, index)
        items.append({
            'kind': found.group(1),
            'name': found.group(2),
            'has_doc': has_doc,
            'has_example': _has_rust_example(doc_text),
            'line': index + 1,
        })
    return items
def _is_test_source(rs_file, root):
    """Return True when *rs_file* looks like test-only code.

    A file is skipped when it sits in a `test`/`tests` directory below
    *root*, or when any underscore-separated word of its stem starts with
    `test` (`tests.rs`, `test_utils.rs`, `parser_test.rs`).  Matching whole
    words avoids dropping unrelated files such as `latest.rs`.
    """
    parent_dirs = rs_file.relative_to(root).parts[:-1]
    if any(part in ('test', 'tests') for part in parent_dirs):
        return True
    return any(word.startswith('test') for word in rs_file.stem.split('_'))


def analyze_directory(src_dir):
    """Analyze all Rust files in a directory.

    Recursively scans *src_dir* for `*.rs` files, skips test sources, and
    aggregates the items reported by `extract_items_with_docs`.

    Args:
        src_dir: Directory to scan (string or `Path`).

    Returns:
        A dict with the keys `total_items`, `with_docs`, `with_examples`,
        `by_kind` (item kind -> `total`/`docs`/`examples` counters) and
        `by_file` (file path string -> `total`/`docs`/`examples`/`items`).
        Files that yield no items are left out of `by_file`.
    """
    root = Path(src_dir)
    results = {
        'total_items': 0,
        'with_docs': 0,
        'with_examples': 0,
        'by_kind': defaultdict(lambda: {'total': 0, 'docs': 0, 'examples': 0}),
        'by_file': {},
    }
    # Sorted so the report does not depend on filesystem enumeration order.
    for rs_file in sorted(root.rglob('*.rs')):
        if _is_test_source(rs_file, root):
            continue
        # Only reading the file is expected to fail; report it and move on.
        try:
            items = extract_items_with_docs(rs_file)
        except (OSError, UnicodeError) as e:
            print(f"Error processing {rs_file}: {e}")
            continue
        if not items:
            continue
        file_results = {
            'total': len(items),
            'docs': 0,
            'examples': 0,
            'items': items,
        }
        for item in items:
            kind_stats = results['by_kind'][item['kind']]
            results['total_items'] += 1
            kind_stats['total'] += 1
            if item['has_doc']:
                results['with_docs'] += 1
                file_results['docs'] += 1
                kind_stats['docs'] += 1
            if item['has_example']:
                results['with_examples'] += 1
                file_results['examples'] += 1
                kind_stats['examples'] += 1
        results['by_file'][str(rs_file)] = file_results
    return results
# Source tree analyzed when no directory is given on the command line.
DEFAULT_SRC_DIR = '/home/coding/pdftract/crates/pdftract-core/src'


def _percent(part, whole):
    """Return *part* as a percentage of *whole*, or 0 when *whole* is 0."""
    return (part / whole * 100) if whole > 0 else 0


def _display_path(file_path, src_dir):
    """Return *file_path* relative to *src_dir*, or unchanged if outside it."""
    try:
        return str(Path(file_path).relative_to(src_dir))
    except ValueError:
        return file_path


def _print_ranking(heading, by_file, counted_key, label, src_dir, limit=15):
    """Print the *limit* files with the most items lacking *counted_key*.

    *counted_key* is the per-file counter (`'docs'` or `'examples'`) that is
    subtracted from the file's `total`; files with nothing missing are left
    out.  Ties keep the iteration order of *by_file*.
    """
    print(heading)
    print("-" * 70)
    ranked = []
    for file_path, file_data in by_file.items():
        missing = file_data['total'] - file_data[counted_key]
        if missing > 0:
            ranked.append(
                (_display_path(file_path, src_dir), missing, file_data['total'])
            )
    ranked.sort(key=lambda row: row[1], reverse=True)
    for rel_path, missing, total in ranked[:limit]:
        print(f" {rel_path:50} {missing:3} {label} ({total} total)")


def print_results(results, src_dir=DEFAULT_SRC_DIR):
    """Print analysis results.

    Args:
        results: Dict with the keys `total_items`, `with_docs`,
            `with_examples`, `by_kind` and `by_file`, as built by
            `analyze_directory`.
        src_dir: Directory that file paths are shown relative to.  Defaults
            to the pdftract-core source tree, so existing one-argument calls
            print exactly what they did before.
    """
    print("=" * 70)
    print("PDFTRACT-CORE DOCUMENTATION COVERAGE ANALYSIS")
    print("=" * 70)
    print()
    total = results['total_items']
    with_docs = results['with_docs']
    with_examples = results['with_examples']
    print(f"Total public items: {total}")
    print(f"With documentation: {with_docs} ({_percent(with_docs, total):.1f}%)")
    print(f"With examples: {with_examples} ({_percent(with_examples, total):.1f}%)")
    print()
    print("By item type:")
    print("-" * 70)
    for kind in sorted(results['by_kind']):
        data = results['by_kind'][kind]
        cov = _percent(data['docs'], data['total'])
        ex_cov = _percent(data['examples'], data['total'])
        print(f" {kind:12} {data['total']:4} total | {data['docs']:4} docs ({cov:5.1f}%) | {data['examples']:4} examples ({ex_cov:5.1f}%)")
    print()
    _print_ranking(
        "Files with most undocumented items (need priority attention):",
        results['by_file'], 'docs', 'missing docs', src_dir,
    )
    print()
    _print_ranking(
        "Files with most items missing examples:",
        results['by_file'], 'examples', 'missing examples', src_dir,
    )


def main(argv=None):
    """Run the analysis and print the report.

    Args:
        argv: Command-line arguments without the program name; defaults to
            `sys.argv[1:]`.  The first argument, when present, is the source
            directory to analyze; otherwise `DEFAULT_SRC_DIR` is used.

    Returns:
        0 on success, 1 when the source directory does not exist.
    """
    import sys  # local import: the file's import block does not provide sys

    args = sys.argv[1:] if argv is None else list(argv)
    src_dir = Path(args[0]) if args else Path(DEFAULT_SRC_DIR)
    if not src_dir.is_dir():
        # Without this check a missing directory yields an all-zero report.
        print(f"error: source directory not found: {src_dir}", file=sys.stderr)
        return 1
    results = analyze_directory(src_dir)
    print_results(results, src_dir=src_dir)
    return 0


if __name__ == '__main__':
    raise SystemExit(main())