miroir/docs/research/score-comparability/corpus/generate.py
jedarden 360378bde2 P11.8: Amend plan §12 to reflect Rust-idiomatic test layout
The plan §12 previously specified tests/ at root with integration/
and chaos/ subdirectories. However, the actual implementation uses
the idiomatic Rust convention with tests in crates/*/tests/.

This commit:
- Updates plan §12 repository structure to document the actual layout
- Moves tests/benches/score-comparability to docs/research/ (research artifacts)
- Removes the now-empty tests/ directory

CI already runs cargo test --all --all-features which correctly
discovers and runs all crate-level integration tests.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-20 06:49:04 -04:00

212 lines
7.5 KiB
Python
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Generate test corpus for score comparability experiments.
Creates a synthetic document collection with:
- Controlled vocabulary (50 unique terms by default; see --vocab-size)
- Skewable shard distribution
- Realistic term frequency distributions following Zipf's law
"""
import argparse
import json
import random
from pathlib import Path
from typing import List, Dict
def generate_vocabulary(size: int = 50) -> List[str]:
    """Generate ``size`` unique terms for the corpus.

    Terms are taken, in order, from 40 category-specific terms (8 each for
    tech, finance, science, health and business) followed by 25 general
    terms.  When ``size`` exceeds those 65 curated terms the remainder is
    filled with synthetic ``termNNNN`` tokens, so the result always contains
    exactly ``size`` entries.

    Args:
        size: Number of terms to return; must be non-negative.

    Returns:
        A list of exactly ``size`` distinct terms.

    Raises:
        ValueError: If ``size`` is negative.
    """
    if size < 0:
        raise ValueError(f"size must be non-negative, got {size}")

    # Category-specific terms come first, in dict insertion order.
    cat_terms = {
        "tech": ["algorithm", "database", "server", "cloud", "network", "api", "code", "software"],
        "finance": ["stock", "market", "investment", "portfolio", "dividend", "yield", "asset", "trading"],
        "science": ["research", "experiment", "hypothesis", "data", "analysis", "theory", "laboratory", "discovery"],
        "health": ["treatment", "patient", "diagnosis", "symptom", "therapy", "medicine", "clinical", "wellness"],
        "business": ["strategy", "revenue", "customer", "product", "service", "growth", "operations", "management"],
    }
    general_terms = [
        "system", "process", "method", "approach", "solution", "platform", "framework",
        "model", "design", "implementation", "development", "deployment", "architecture",
        "performance", "scalability", "reliability", "security", "integration", "configuration",
        "monitoring", "testing", "validation", "optimization", "automation", "documentation",
    ]

    terms: List[str] = []
    for cat_term_list in cat_terms.values():
        terms.extend(cat_term_list)
    terms.extend(general_terms)

    # The curated list is finite; pad with synthetic tokens rather than
    # silently returning fewer terms than the caller asked for.
    curated_count = len(terms)
    terms.extend(f"term{n:04d}" for n in range(curated_count + 1, size + 1))

    return terms[:size]
def zipf_distribution(n: int, s: float = 1.0) -> List[float]:
    """Return normalized Zipf probabilities for ranks 1 through ``n``.

    The unnormalized weight of rank ``r`` (1-based) is ``1 / r**s``.  Each
    weight is divided by the sum of all weights, so the returned list has
    ``n`` entries that add up to 1 (an empty list when ``n`` is 0).
    """
    raw = []
    for rank in range(1, n + 1):
        raw.append(1.0 / (rank ** s))
    norm = sum(raw)
    return [value / norm for value in raw]
def generate_documents(
    count: int,
    vocabulary: List[str],
    categories: List[str],
    avg_doc_length: int = 50,
) -> List[Dict]:
    """Generate ``count`` synthetic documents.

    Every document gets a uniformly chosen category and a list of terms
    sampled with replacement from ``vocabulary`` using Zipf weights
    (exponent 1.2), so earlier vocabulary entries occur far more often.

    Args:
        count: Number of documents to create.
        vocabulary: Terms to sample from.
        categories: Category labels; one is picked at random per document.
        avg_doc_length: Mean number of terms per document.

    Returns:
        A list of dicts with the keys ``id`` (``doc-000000`` style),
        ``title``, ``content`` and ``category``.
    """
    zipf_weights = zipf_distribution(len(vocabulary), s=1.2)

    # Which terms count as "related" to a category depends only on the
    # inputs, so work it out once per category instead of once per document.
    # A term is related when it is a substring of the category name or when
    # a whitespace-separated word of the category name is a substring of it.
    # NOTE(review): with this file's default categories and built-in
    # vocabulary no term satisfies either test, so the substitution below
    # looks like a no-op in practice - confirm whether an explicit
    # category -> terms mapping was intended.
    related_terms: Dict[str, List[str]] = {}
    for name in categories:
        if name in related_terms:
            continue
        name_lower = name.lower()
        name_words = name_lower.split()
        related_terms[name] = [
            term
            for term in vocabulary
            if term.lower() in name_lower
            or any(word in term.lower() for word in name_words)
        ]

    documents = []
    for i in range(count):
        category = random.choice(categories)

        # Document length is Gaussian around avg_doc_length (std dev of a
        # quarter of the mean), floored at 5 terms.
        term_count = max(5, int(random.gauss(avg_doc_length, avg_doc_length / 4)))
        doc_terms = random.choices(vocabulary, weights=zipf_weights, k=term_count)

        # With 70% probability, lead with a category-related term if any exist.
        cat_related = related_terms[category]
        if cat_related and random.random() < 0.7:
            doc_terms[0] = random.choice(cat_related)

        # Title is the first 3-5 terms; content is every term.
        title_length = random.randint(3, 5)
        title = " ".join(doc_terms[:title_length]).title()
        content = " ".join(doc_terms).capitalize()

        documents.append({
            "id": f"doc-{i:06d}",
            "title": title,
            "content": content,
            "category": category,
        })
    return documents
def assign_shards_skewed(
    documents: List[Dict],
    shard_count: int,
    skew_factors: List[float],
) -> Dict[int, List[Dict]]:
    """
    Assign documents to shards with controlled skew.

    skew_factors[i] is the relative size multiplier for shard i.
    Normal shard = 1.0, 100x larger = 100.0, 0.01x smaller = 0.01.
    Factors beyond ``shard_count`` are ignored and missing factors default
    to 1.0, so exactly ``shard_count`` shards are always returned.

    Documents are shuffled (the input list is not modified) and then cut
    into consecutive slices whose sizes are proportional to the factors.
    Slice boundaries come from rounding the cumulative share, which keeps
    every shard within about one document of its exact share and makes the
    sizes sum to ``len(documents)`` even when there are fewer documents
    than shards.

    Args:
        documents: Documents to distribute.
        shard_count: Number of shards to produce; must be positive.
        skew_factors: Relative size multiplier per shard; must be
            non-negative with a positive total.

    Returns:
        Mapping of shard id (0 .. shard_count - 1) to its documents.

    Raises:
        ValueError: If ``shard_count`` is not positive, a factor is
            negative, or the factors do not sum to a positive value.
    """
    if shard_count <= 0:
        raise ValueError(f"shard_count must be positive, got {shard_count}")

    factors = [float(f) for f in skew_factors[:shard_count]]
    factors.extend([1.0] * (shard_count - len(factors)))
    if any(f < 0 for f in factors):
        raise ValueError("skew_factors must be non-negative")
    total_weight = sum(factors)
    if not total_weight > 0:
        raise ValueError("skew_factors must sum to a positive value")

    # Shuffle a copy so shard membership is random and the caller's list
    # keeps its order.
    shuffled = documents.copy()
    random.shuffle(shuffled)
    total_docs = len(shuffled)

    shards = {}
    start = 0
    cumulative = 0.0
    for shard_id, factor in enumerate(factors):
        cumulative += factor
        if shard_id == shard_count - 1:
            # The last shard absorbs any rounding slack.
            end = total_docs
        else:
            end = round(total_docs * cumulative / total_weight)
            # Clamp against floating-point drift so slices never overlap
            # or run past the end.
            end = min(total_docs, max(start, end))
        shards[shard_id] = shuffled[start:end]
        start = end
    return shards
def _write_jsonl(path, docs):
    """Write each document in ``docs`` to ``path`` as one JSON object per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(doc) + "\n" for doc in docs)


def main():
    """Parse command-line options, build the corpus and write it to disk.

    Files written to the ``--output`` directory:
      - vocabulary.json: the terms and categories used
      - corpus.jsonl: every document (ground truth)
      - shard-NN.jsonl: the documents assigned to each shard
      - metadata.json: counts, skew factors, shard sizes and the seed
    """
    parser = argparse.ArgumentParser(description="Generate test corpus for score comparability")
    parser.add_argument("--count", type=int, default=100000, help="Number of documents to generate")
    parser.add_argument("--shards", type=int, default=10, help="Number of shards")
    parser.add_argument("--output", type=str, default="corpus/", help="Output directory")
    parser.add_argument("--vocab-size", type=int, default=50, help="Vocabulary size")
    parser.add_argument("--categories", type=str,
                        default="tech,finance,science,health,business",
                        help="Comma-separated list of categories")
    parser.add_argument("--seed", type=int, default=None,
                        help="Random seed for reproducible output (default: unseeded)")
    args = parser.parse_args()

    # Reject unusable values up front instead of failing mid-generation.
    if args.count < 0:
        parser.error("--count must be non-negative")
    if args.shards < 1:
        parser.error("--shards must be at least 1")
    if args.vocab_size < 1:
        parser.error("--vocab-size must be at least 1")

    # Tolerate spaces and stray commas in the category list.
    categories = [name.strip() for name in args.categories.split(",") if name.strip()]
    if not categories:
        parser.error("--categories must name at least one category")

    if args.seed is not None:
        random.seed(args.seed)

    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Generating {args.count} documents...")
    print(f"Vocabulary size: {args.vocab_size}")
    print(f"Categories: {categories}")
    print(f"Shards: {args.shards}")

    # Generate vocabulary
    vocabulary = generate_vocabulary(args.vocab_size)
    with open(output_dir / "vocabulary.json", "w", encoding="utf-8") as f:
        json.dump({"terms": vocabulary, "categories": categories}, f, indent=2)

    # Generate documents
    documents = generate_documents(args.count, vocabulary, categories)

    # Define skew factors for this experiment
    # Shard 0: normal (1.0)
    # Shard 1: 100x normal (100.0) - extreme outlier
    # Shard 2-7: normal (1.0)
    # Shard 8: half size (0.5)
    # Shard 9: 0.01x normal (0.01) - tiny shard
    # Shards beyond the tenth, if requested, are normal (1.0).
    skew_factors = [1.0, 100.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 0.01]
    skew_factors = skew_factors[:args.shards]
    skew_factors += [1.0] * (args.shards - len(skew_factors))

    # Assign to shards
    shards = assign_shards_skewed(documents, args.shards, skew_factors)

    # Save combined corpus (for ground truth)
    _write_jsonl(output_dir / "corpus.jsonl", documents)

    # Save per-shard corpora
    for shard_id, shard_docs in shards.items():
        _write_jsonl(output_dir / f"shard-{shard_id:02d}.jsonl", shard_docs)
        print(f" Shard {shard_id}: {len(shard_docs)} documents (skew factor: {skew_factors[shard_id]})")

    # Save metadata; record what was actually produced, not just what was requested.
    metadata = {
        "total_documents": len(documents),
        "shard_count": args.shards,
        "vocabulary_size": len(vocabulary),
        "categories": categories,
        "skew_factors": skew_factors,
        "shard_sizes": {str(k): len(v) for k, v in shards.items()},
        "seed": args.seed,
    }
    with open(output_dir / "metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    print(f"\nCorpus generated successfully in {output_dir}")
    print(f" Total documents: {args.count}")
    print(f" Vocabulary size: {len(vocabulary)}")
    print(f" Categories: {len(categories)}")


if __name__ == "__main__":
    main()