pdftract/tools/count_docs.py
jedarden d0f52751ce fix(pdftract-39gey): fix indent trigger to not split drop-cap paragraphs
The indent trigger was using .abs() which fired on both increased indent
(non-indented → indented) AND decreased indent (indented → non-indented).
This caused drop-cap style paragraphs (indented first line, flush-left
continuation) to incorrectly split into two blocks.

Per plan Phase 4.4 heuristic #2, indent change should only trigger when the
current line is MORE indented (to the right, larger x0) than the block
average - i.e., a new paragraph starting after non-indented text. It should
NOT trigger for decreased indent (first line indented, rest flush-left).

Fix: Remove .abs() and only check if line_x0 - block_avg_x0 > threshold.

Tests:
- test_indented_first_line_new_block: PASS (non-indented → indented splits)
- test_indented_first_line_of_paragraph_not_split: PASS (drop cap stays together)
- All 179 line module tests: PASS
2026-06-07 13:43:19 -04:00

100 lines
3 KiB
Python

#!/usr/bin/env python3
"""Count rustdoc coverage for pdftract-core."""
import os
import re
from pathlib import Path
from collections import defaultdict
# Root of the Rust sources to scan, resolved relative to the current directory.
CORE_DIR = Path("crates/pdftract-core/src")

# Patterns for public items, keyed by item kind. Each pattern is matched
# against a whitespace-stripped source line and captures the item name in
# group 1. Only a bare `pub` is recognised; restricted visibilities such as
# `pub(crate)` are intentionally not matched.
PUB_PATTERNS = {
    # Any combination of the `const` / `async` / `unsafe` / `extern "abi"`
    # qualifiers may sit between `pub` and `fn`.
    "fn": re.compile(
        r'^pub\s+(?:(?:const|async|unsafe)\s+|extern\s+(?:"[^"]*"\s+)?)*fn\s+(\w+)'
    ),
    "struct": re.compile(r'^pub struct\s+(\w+)'),
    "enum": re.compile(r'^pub enum\s+(\w+)'),
    "trait": re.compile(r'^pub (?:unsafe\s+)?trait\s+(\w+)'),
    "type": re.compile(r'^pub type\s+(\w+)'),
    "mod": re.compile(r'^pub mod\s+(\w+)'),
    # The lookahead keeps `pub const fn ...` from being recorded as a constant
    # whose name is a keyword.
    "const": re.compile(r'^pub const\s+(?!(?:fn|async|unsafe|extern)\b)(\w+)'),
    # Skip the optional `mut` so the captured name is the static itself.
    "static": re.compile(r'^pub static\s+(?:mut\s+)?(\w+)'),
}

# Pattern for code blocks in doc comments. Only fences explicitly tagged
# `rust` are recognised. The body is matched lazily across lines so that a
# single inline backtick inside the example does not defeat the match.
EXAMPLE_PATTERN = re.compile(r'```rust.*?```', re.DOTALL)
DOC_COMMENT_PATTERN = re.compile(r'///.*|//!.*')
def _doc_comment_above(lines, item_index):
    """Return the doc-comment text directly above ``lines[item_index]``."""
    collected = []
    j = item_index - 1
    while j >= 0:
        stripped = lines[j].strip()
        if stripped.startswith(("///", "//!")):
            collected.append(lines[j])
        elif not stripped.startswith("#["):
            # Anything other than a doc line or an attribute ends the run.
            break
        # Attribute lines such as `#[derive(Debug)]` sit between the docs and
        # the item; step over them instead of stopping the scan.
        j -= 1
    # Lines were gathered bottom-up; restore source order before joining.
    collected.reverse()
    return "".join(collected)


def count_public_items_and_examples(file_path: Path):
    """Count public items and examples in a single file.

    Each line of the file is stripped and tested against ``PUB_PATTERNS``;
    the first pattern that matches decides the item's type. For every item
    found, the contiguous run of ``///`` / ``//!`` lines above it is searched
    with ``EXAMPLE_PATTERN``. Single-line ``#[...]`` attributes inside that
    run are skipped; an attribute spanning several lines still ends the scan.

    Args:
        file_path: Path of the Rust source file to scan.

    Returns:
        A list of dicts, one per public item in file order, with the keys
        ``"type"`` (a ``PUB_PATTERNS`` key), ``"name"``, ``"line"`` (1-based)
        and ``"has_example"`` (bool).
    """
    # Rust source files are UTF-8; do not depend on the locale's default.
    with open(file_path, encoding="utf-8") as f:
        lines = f.readlines()

    pub_items = []
    for index, line in enumerate(lines):
        stripped = line.strip()
        for item_type, pattern in PUB_PATTERNS.items():
            match = pattern.match(stripped)
            if match:
                doc_text = _doc_comment_above(lines, index)
                pub_items.append({
                    "type": item_type,
                    "name": match.group(1),
                    "line": index + 1,
                    "has_example": bool(EXAMPLE_PATTERN.search(doc_text)),
                })
                # A line is classified by the first pattern that matches.
                break
    return pub_items
def main():
    """Print example-coverage totals for every ``.rs`` file under ``CORE_DIR``.

    Files named ``lib.rs`` are left out. The report shows the overall number
    of public items, how many have an example, the resulting percentage, and
    then the same ratio for each item type in alphabetical order.
    """
    collected = []
    for source_path in CORE_DIR.rglob("*.rs"):
        # Skip lib.rs top-level module exports
        if source_path.name != "lib.rs":
            collected += count_public_items_and_examples(source_path)

    total = len(collected)
    with_examples = len([entry for entry in collected if entry["has_example"]])
    if total > 0:
        coverage = with_examples / total * 100
    else:
        coverage = 0

    print(f"Total public items: {total}")
    print(f"With worked examples: {with_examples}")
    print(f"Coverage: {coverage:.1f}%")

    # Breakdown by type
    grouped = defaultdict(list)
    for entry in collected:
        grouped[entry["type"]].append(entry)

    print("\nBy type:")
    for kind in sorted(grouped):
        members = grouped[kind]
        documented = len([m for m in members if m["has_example"]])
        share = documented / len(members) * 100
        print(f" {kind}: {documented}/{len(members)} ({share:.1f}%)")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()