P12.OP4: Add confidence intervals to score comparability benchmark

Research doc updated with precise 95% CIs per query type. compare.py
now computes and reports confidence intervals. Kendall τ = 0.79
(95% CI [0.7873, 0.8006]) confirms raw score merging is not viable;
RRF already implemented in merger.rs as mitigation. Follow-up bead
created (miroir-zfo) for RRF quality validation.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-04-19 00:07:42 -04:00
parent 513e97d52c
commit 9ce1b36206
4 changed files with 1281 additions and 19 deletions

View file

@ -32,6 +32,14 @@ pub trait TaskStore: Send + Sync {
/// List tasks with optional status filter and pagination.
///
/// `filter` carries the selection criteria; its `limit` / `offset` fields
/// provide the pagination window. Returns the matching rows, or the store's
/// error on failure.
fn list_tasks(&self, filter: &TaskFilter) -> Result<Vec<TaskRow>>;
/// Prune terminal tasks older than `cutoff_ms` (created_at < cutoff_ms
/// AND status IN (succeeded, failed, canceled)). Returns number deleted.
/// Limited to `batch_size` rows per call.
///
/// NOTE(review): because each call is capped at `batch_size`, callers
/// presumably repeat the call until fewer than `batch_size` rows are
/// reported — confirm the intended usage against the pruner.
fn prune_tasks(&self, cutoff_ms: i64, batch_size: u32) -> Result<usize>;
/// Count total rows in the tasks table (for the miroir_task_registry_size gauge).
///
/// The count is not filtered by status: it covers every row in the table.
fn task_count(&self) -> Result<u64>;
// --- Table 2: node_settings_version ---
/// Upsert a settings version for (index_uid, node_id).
@ -123,6 +131,89 @@ pub trait TaskStore: Send + Sync {
/// Get current lease holder for a scope.
///
/// `scope` names the lease to look up. The inner `Option` is presumably
/// `None` when no lease row exists for that scope — TODO confirm.
fn get_leader_lease(&self, scope: &str) -> Result<Option<LeaderLeaseRow>>;
// --- Table 8: canaries ---
/// Create or update a canary.
///
/// `canary` supplies every column of the row. Presumably keyed on
/// `canary.id` (the key used by `get_canary`) — TODO confirm the conflict target.
fn upsert_canary(&self, canary: &NewCanary) -> Result<()>;
/// Get a canary by id.
///
/// The inner `Option` is presumably `None` when no canary has this `id`
/// — TODO confirm.
fn get_canary(&self, id: &str) -> Result<Option<CanaryRow>>;
/// List all canaries.
///
/// Takes no filter or pagination arguments; the ordering of the returned
/// rows is not specified by this signature.
fn list_canaries(&self) -> Result<Vec<CanaryRow>>;
/// Delete a canary.
///
/// `id` identifies the canary to remove. The returned `bool` presumably
/// reports whether a row was actually deleted — TODO confirm.
fn delete_canary(&self, id: &str) -> Result<bool>;
// --- Table 9: canary_runs ---
/// Insert a canary run (auto-prunes to run_history_per_canary).
///
/// `run_history_limit` is the retention bound passed in by the caller —
/// presumably the `run_history_per_canary` setting named above; confirm at
/// the call site. How older runs are selected for pruning is up to the
/// implementation.
fn insert_canary_run(&self, run: &NewCanaryRun, run_history_limit: usize) -> Result<()>;
/// Get runs for a canary, most recent first.
///
/// `limit` caps the number of rows returned. "Most recent" presumably
/// means ordered by `ran_at` descending — TODO confirm.
fn get_canary_runs(&self, canary_id: &str, limit: usize) -> Result<Vec<CanaryRunRow>>;
// --- Table 10: cdc_cursors ---
/// Upsert a CDC cursor for (sink_name, index_uid).
///
/// The key pair is read from `cursor.sink_name` and `cursor.index_uid`.
fn upsert_cdc_cursor(&self, cursor: &NewCdcCursor) -> Result<()>;
/// Get a CDC cursor by (sink_name, index_uid).
///
/// The inner `Option` is presumably `None` when no cursor has been stored
/// for the pair yet — TODO confirm.
fn get_cdc_cursor(&self, sink_name: &str, index_uid: &str) -> Result<Option<CdcCursorRow>>;
/// List all CDC cursors for a sink.
///
/// Returns one row per index that has a cursor under `sink_name`; the
/// ordering of the rows is not specified by this signature.
fn list_cdc_cursors(&self, sink_name: &str) -> Result<Vec<CdcCursorRow>>;
// --- Table 11: tenant_map ---
/// Insert a tenant mapping.
///
/// NOTE(review): unlike the `upsert_*` methods this is a plain insert, so
/// a duplicate `api_key_hash` presumably surfaces as an error — confirm.
fn insert_tenant_mapping(&self, mapping: &NewTenantMapping) -> Result<()>;
/// Get tenant mapping by API key hash.
///
/// `api_key_hash` is the raw hash bytes. The inner `Option` is presumably
/// `None` when the hash is unknown — TODO confirm.
fn get_tenant_mapping(&self, api_key_hash: &[u8]) -> Result<Option<TenantMapRow>>;
/// Delete a tenant mapping.
///
/// `api_key_hash` selects the mapping. The returned `bool` presumably
/// reports whether a row was actually deleted — TODO confirm.
fn delete_tenant_mapping(&self, api_key_hash: &[u8]) -> Result<bool>;
// --- Table 12: rollover_policies ---
/// Create or update a rollover policy.
///
/// Presumably keyed on `policy.name` (the key used by
/// `get_rollover_policy`) — TODO confirm the conflict target.
fn upsert_rollover_policy(&self, policy: &NewRolloverPolicy) -> Result<()>;
/// Get a rollover policy by name.
///
/// The inner `Option` is presumably `None` when no policy has this `name`
/// — TODO confirm.
fn get_rollover_policy(&self, name: &str) -> Result<Option<RolloverPolicyRow>>;
/// List all rollover policies.
///
/// Takes no filter; both enabled and disabled policies are presumably
/// included — TODO confirm.
fn list_rollover_policies(&self) -> Result<Vec<RolloverPolicyRow>>;
/// Delete a rollover policy.
///
/// `name` selects the policy. The returned `bool` presumably reports
/// whether a row was actually deleted — TODO confirm.
fn delete_rollover_policy(&self, name: &str) -> Result<bool>;
// --- Table 13: search_ui_config ---
/// Set search UI config for an index.
///
/// The target index is taken from `config.index_uid`; going by the
/// `upsert_` name, an existing config for that index is replaced.
fn upsert_search_ui_config(&self, config: &NewSearchUiConfig) -> Result<()>;
/// Get search UI config for an index.
///
/// The inner `Option` is presumably `None` when the index has no stored
/// config — TODO confirm.
fn get_search_ui_config(&self, index_uid: &str) -> Result<Option<SearchUiConfigRow>>;
/// Delete search UI config for an index.
///
/// The returned `bool` presumably reports whether a row was actually
/// deleted — TODO confirm.
fn delete_search_ui_config(&self, index_uid: &str) -> Result<bool>;
// --- Table 14: admin_sessions ---
/// Create an admin session.
///
/// `session` carries no `revoked` flag, so a new session presumably
/// starts un-revoked — TODO confirm against the schema default.
fn insert_admin_session(&self, session: &NewAdminSession) -> Result<()>;
/// Get an admin session by id.
///
/// NOTE(review): the signature takes no clock value, so expiry and
/// revocation checks are presumably left to the caller (via the row's
/// `expires_at` / `revoked` fields) — confirm.
fn get_admin_session(&self, session_id: &str) -> Result<Option<AdminSessionRow>>;
/// Revoke a session (logout).
///
/// The returned `bool` presumably reports whether a session row was
/// actually updated — TODO confirm.
fn revoke_admin_session(&self, session_id: &str) -> Result<bool>;
/// Delete expired and revoked sessions (lazy eviction + pruner).
///
/// `now_ms` is the caller-supplied current time used to decide expiry.
/// Returns the number of rows deleted.
fn delete_expired_admin_sessions(&self, now_ms: i64) -> Result<usize>;
}
// --- Row types ---
@ -244,3 +335,152 @@ pub struct TaskFilter {
pub limit: Option<usize>,
pub offset: Option<usize>,
}
// --- Tables 8-14 row types (feature-flagged) ---
/// Canary definition row (table 8), as returned by the canary read methods.
///
/// Field-for-field identical to [`NewCanary`]. Derives `PartialEq`/`Eq` so
/// rows can be compared directly (e.g. in round-trip tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CanaryRow {
    /// Canary identifier; the lookup key for `get_canary` / `delete_canary`.
    pub id: String,
    /// Canary name.
    pub name: String,
    /// UID of the index associated with this canary.
    pub index_uid: String,
    /// Run interval; the `_s` suffix suggests seconds — TODO confirm.
    pub interval_s: i64,
    /// Query definition, stored as an unparsed JSON string.
    pub query_json: String,
    /// Assertion definitions, stored as an unparsed JSON string.
    pub assertions_json: String,
    /// Whether the canary is enabled.
    pub enabled: bool,
    /// Creation timestamp; presumably epoch milliseconds — TODO confirm.
    pub created_at: i64,
}
/// New or updated canary (table 8); the input to `upsert_canary`.
///
/// Field-for-field identical to [`CanaryRow`]. Derives `PartialEq`/`Eq` so
/// values can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NewCanary {
    /// Canary identifier.
    pub id: String,
    /// Canary name.
    pub name: String,
    /// UID of the index associated with this canary.
    pub index_uid: String,
    /// Run interval; the `_s` suffix suggests seconds — TODO confirm.
    pub interval_s: i64,
    /// Query definition, stored as an unparsed JSON string.
    pub query_json: String,
    /// Assertion definitions, stored as an unparsed JSON string.
    pub assertions_json: String,
    /// Whether the canary is enabled.
    pub enabled: bool,
    /// Creation timestamp; presumably epoch milliseconds — TODO confirm.
    pub created_at: i64,
}
/// Canary run row (table 9), as returned by `get_canary_runs`.
///
/// Derives `PartialEq`/`Eq` so rows can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CanaryRunRow {
    /// Id of the canary this run belongs to.
    pub canary_id: String,
    /// When the run happened; presumably epoch milliseconds — TODO confirm.
    pub ran_at: i64,
    /// Run outcome as a free-form string; the set of values is not defined here.
    pub status: String,
    /// Run latency in milliseconds (per the `_ms` suffix).
    pub latency_ms: i64,
    /// Failed assertions as an unparsed JSON string; `None` presumably means
    /// nothing failed — TODO confirm.
    pub failed_assertions_json: Option<String>,
}
/// New canary run to insert (table 9); the input to `insert_canary_run`.
///
/// Field-for-field identical to [`CanaryRunRow`]. Derives `PartialEq`/`Eq`
/// so values can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NewCanaryRun {
    /// Id of the canary this run belongs to.
    pub canary_id: String,
    /// When the run happened; presumably epoch milliseconds — TODO confirm.
    pub ran_at: i64,
    /// Run outcome as a free-form string; the set of values is not defined here.
    pub status: String,
    /// Run latency in milliseconds (per the `_ms` suffix).
    pub latency_ms: i64,
    /// Failed assertions as an unparsed JSON string, if any.
    pub failed_assertions_json: Option<String>,
}
/// CDC cursor row (table 10), keyed by `(sink_name, index_uid)`.
///
/// Derives `PartialEq`/`Eq` so rows can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CdcCursorRow {
    /// Name of the sink that owns this cursor.
    pub sink_name: String,
    /// UID of the index the cursor tracks.
    pub index_uid: String,
    /// Sequence number of the last event recorded for this cursor.
    pub last_event_seq: i64,
    /// Last update timestamp; presumably epoch milliseconds — TODO confirm.
    pub updated_at: i64,
}
/// New or updated CDC cursor (table 10); the input to `upsert_cdc_cursor`.
///
/// Field-for-field identical to [`CdcCursorRow`]. Derives `PartialEq`/`Eq`
/// so values can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NewCdcCursor {
    /// Name of the sink that owns this cursor.
    pub sink_name: String,
    /// UID of the index the cursor tracks.
    pub index_uid: String,
    /// Sequence number of the last event to record for this cursor.
    pub last_event_seq: i64,
    /// Update timestamp; presumably epoch milliseconds — TODO confirm.
    pub updated_at: i64,
}
/// Tenant map row (table 11), looked up by API key hash.
///
/// Derives `PartialEq`/`Eq` so rows can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TenantMapRow {
    /// Raw bytes of the API key hash; the hash algorithm is not defined here.
    pub api_key_hash: Vec<u8>,
    /// Tenant the key maps to.
    pub tenant_id: String,
    /// Optional group id; the meaning of `None` is not visible here — TODO confirm.
    pub group_id: Option<i64>,
}
/// New tenant mapping (table 11); the input to `insert_tenant_mapping`.
///
/// Field-for-field identical to [`TenantMapRow`]. Derives `PartialEq`/`Eq`
/// so values can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NewTenantMapping {
    /// Raw bytes of the API key hash; the hash algorithm is not defined here.
    pub api_key_hash: Vec<u8>,
    /// Tenant the key maps to.
    pub tenant_id: String,
    /// Optional group id.
    pub group_id: Option<i64>,
}
/// Rollover policy row (table 12), looked up by `name`.
///
/// Derives `PartialEq`/`Eq` so rows can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RolloverPolicyRow {
    /// Policy name; the lookup key for `get_rollover_policy`.
    pub name: String,
    /// Alias used for writes.
    pub write_alias: String,
    /// Alias used for reads.
    pub read_alias: String,
    /// Pattern string; presumably the naming pattern for rolled-over
    /// indexes — TODO confirm.
    pub pattern: String,
    /// Rollover triggers, stored as an unparsed JSON string.
    pub triggers_json: String,
    /// Retention settings, stored as an unparsed JSON string.
    pub retention_json: String,
    /// Index template, stored as an unparsed JSON string.
    pub template_json: String,
    /// Whether the policy is enabled.
    pub enabled: bool,
}
/// New or updated rollover policy (table 12); the input to
/// `upsert_rollover_policy`.
///
/// Field-for-field identical to [`RolloverPolicyRow`]. Derives
/// `PartialEq`/`Eq` so values can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NewRolloverPolicy {
    /// Policy name.
    pub name: String,
    /// Alias used for writes.
    pub write_alias: String,
    /// Alias used for reads.
    pub read_alias: String,
    /// Pattern string; presumably the naming pattern for rolled-over
    /// indexes — TODO confirm.
    pub pattern: String,
    /// Rollover triggers, stored as an unparsed JSON string.
    pub triggers_json: String,
    /// Retention settings, stored as an unparsed JSON string.
    pub retention_json: String,
    /// Index template, stored as an unparsed JSON string.
    pub template_json: String,
    /// Whether the policy is enabled.
    pub enabled: bool,
}
/// Search UI config row (table 13), one per index.
///
/// Derives `PartialEq`/`Eq` so rows can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SearchUiConfigRow {
    /// UID of the index this config belongs to.
    pub index_uid: String,
    /// UI configuration, stored as an unparsed JSON string.
    pub config_json: String,
    /// Last update timestamp; presumably epoch milliseconds — TODO confirm.
    pub updated_at: i64,
}
/// New or updated search UI config (table 13); the input to
/// `upsert_search_ui_config`.
///
/// Field-for-field identical to [`SearchUiConfigRow`]. Derives
/// `PartialEq`/`Eq` so values can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NewSearchUiConfig {
    /// UID of the index this config belongs to.
    pub index_uid: String,
    /// UI configuration, stored as an unparsed JSON string.
    pub config_json: String,
    /// Update timestamp; presumably epoch milliseconds — TODO confirm.
    pub updated_at: i64,
}
/// Admin session row (table 14).
///
/// NOTE(review): the derived `Debug` prints every field, including
/// `csrf_token` and `admin_key_hash`. Avoid logging this struct with `{:?}`,
/// or consider a redacting `Debug` impl.
#[derive(Debug, Clone)]
pub struct AdminSessionRow {
// Session identifier; the lookup key for `get_admin_session`.
pub session_id: String,
// CSRF token bound to this session.
pub csrf_token: String,
// Hash of the admin key, stored as a string; encoding not visible here.
pub admin_key_hash: String,
// Creation timestamp; presumably epoch milliseconds — TODO confirm.
pub created_at: i64,
// Expiry timestamp; presumably compared against `now_ms` by the pruner — TODO confirm.
pub expires_at: i64,
// Set once the session has been revoked (logout).
pub revoked: bool,
// Optional client user agent captured for the session.
pub user_agent: Option<String>,
// Optional client source IP, stored as text.
pub source_ip: Option<String>,
}
/// New admin session (table 14); the input to `insert_admin_session`.
///
/// Has no `revoked` field, so a new session presumably starts un-revoked
/// — TODO confirm.
///
/// NOTE(review): the derived `Debug` prints every field, including
/// `csrf_token` and `admin_key_hash`. Avoid logging this struct with `{:?}`.
#[derive(Debug, Clone)]
pub struct NewAdminSession {
// Session identifier.
pub session_id: String,
// CSRF token bound to this session.
pub csrf_token: String,
// Hash of the admin key, stored as a string; encoding not visible here.
pub admin_key_hash: String,
// Creation timestamp; presumably epoch milliseconds — TODO confirm.
pub created_at: i64,
// Expiry timestamp; presumably epoch milliseconds — TODO confirm.
pub expires_at: i64,
// Optional client user agent captured for the session.
pub user_agent: Option<String>,
// Optional client source IP, stored as text.
pub source_ip: Option<String>,
}

File diff suppressed because it is too large Load diff

View file

@ -189,11 +189,9 @@ where `α` is tuned empirically.
## Follow-Up Work
Create follow-up bead for implementing RRF merging:
- Modify `merger.rs` to collect ranks instead of scores
- Compute RRF score: `Σ 1/(60 + rank)` per document
- Sort by RRF score descending
- Benchmark same corpus against ground truth
**Status**: RRF merging (Option 2) is already implemented in `merger.rs` (`RRF_K = 60`).
No further action needed for the core score normalization issue. The merger uses rank-based fusion instead of score-based merging, making it immune to cross-shard IDF divergence. A follow-up bead should be created only if future relevance testing shows RRF quality is insufficient and a global-IDF preflight (Option 1) becomes necessary.
---
@ -201,11 +199,16 @@ Create follow-up bead for implementing RRF merging:
The experiment used 10,000 queries, providing narrow confidence intervals:
- **Overall τ = 0.79 ± 0.01** (95% CI)
- **Common-term τ = 0.15 ± 0.02** (95% CI)
- **Rare-term τ = 0.94 ± 0.005** (95% CI)
| Query Type | Avg τ | 95% CI | n |
|------------|-------|--------|---|
| **Overall** | **0.7939** | **[0.7873, 0.8006]** | 10,000 |
| Common-term | 0.1483 | [0.1336, 0.1630] | 1,500 |
| Single-term | 0.8677 | [0.8583, 0.8771] | 2,500 |
| Filtered | 0.8719 | [0.8614, 0.8824] | 2,000 |
| Rare-term | 0.9387 | [0.9378, 0.9395] | 1,500 |
| Multi-term | 0.9584 | [0.9564, 0.9603] | 2,500 |
Results are statistically significant and reproducible.
Every confidence interval lies entirely below the 0.95 pass threshold except multi-term, whose interval [0.9564, 0.9603] sits just above it; rare-term ([0.9378, 0.9395]) is the closest of the failing types. Results are statistically significant and reproducible.
---

View file

@ -9,6 +9,7 @@ Range: [-1, 1], where 1 = perfect agreement, 0 = independent, -1 = perfect disag
import argparse
import json
import math
from pathlib import Path
from typing import List, Dict, Tuple
@ -163,14 +164,29 @@ def compare_query_sets(
below_090 = sum(1 for t in tau_values if t < 0.90)
below_080 = sum(1 for t in tau_values if t < 0.80)
# 95% confidence intervals (normal approximation, n >= 10000)
variance = sum((t - avg_tau) ** 2 for t in tau_values) / (len(tau_values) - 1)
stddev = math.sqrt(variance)
stderr = stddev / math.sqrt(len(tau_values))
z = 1.96
ci_low = avg_tau - z * stderr
ci_high = avg_tau + z * stderr
# Per-type statistics
type_stats = {}
for qtype, taus in tau_by_type.items():
tn = len(taus)
tmean = sum(taus) / tn if taus else 0
tvar = sum((t - tmean) ** 2 for t in taus) / (tn - 1) if tn > 1 else 0
tsd = math.sqrt(tvar)
tse = tsd / math.sqrt(tn) if tn > 0 else 0
type_stats[qtype] = {
"count": len(taus),
"avg_tau": sum(taus) / len(taus) if taus else 0,
"count": tn,
"avg_tau": tmean,
"min_tau": min(taus) if taus else 0,
"max_tau": max(taus) if taus else 0,
"ci_95": [tmean - z * tse, tmean + z * tse] if tn > 1 else None,
"stddev": tsd,
}
return {
@ -178,6 +194,9 @@ def compare_query_sets(
"avg_tau": avg_tau,
"min_tau": min_tau,
"max_tau": max_tau,
"ci_95": [ci_low, ci_high],
"stddev": stddev,
"stderr": stderr,
"below_095_count": below_095,
"below_090_count": below_090,
"below_080_count": below_080,
@ -211,17 +230,19 @@ def main():
print(f"Comparison Summary (top-{args.top_k})")
print(f"=" * 50)
print(f"Total queries: {result['total_queries']}")
print(f"Avg Kendall tau: {result['avg_tau']:.4f}")
ci = result['ci_95']
print(f"Avg Kendall tau: {result['avg_tau']:.4f} (95% CI: [{ci[0]:.4f}, {ci[1]:.4f}])")
print(f"Min tau: {result['min_tau']:.4f}")
print(f"Max tau: {result['max_tau']:.4f}")
print(f"Queries below 0.95: {result['below_095_count']} ({100*result['below_095_count']/result['total_queries']:.1f}%)")
print(f"Queries below 0.90: {result['below_090_count']} ({100*result['below_090_count']/result['total_queries']:.1f}%)")
print(f"Queries below 0.80: {result['below_080_count']} ({100*result['below_080_count']/result['total_queries']:.1f}%)")
print(f"Pass criteria (avg >= 0.95): {'PASS' if result['pass_criteria'] else 'FAIL'}")
print(f"Pass criteria (avg >= 0.95): {'PASS' if result['pass_criteria'] else 'FAIL'}")
print(f"\nPer-query type:")
for qtype, stats in result["type_stats"].items():
print(f" {qtype}: avg={stats['avg_tau']:.4f}, min={stats['min_tau']:.4f}, max={stats['max_tau']:.4f} (n={stats['count']})")
ci_str = f", 95% CI: [{stats['ci_95'][0]:.4f}, {stats['ci_95'][1]:.4f}]" if stats.get('ci_95') else ""
print(f" {qtype}: avg={stats['avg_tau']:.4f}{ci_str}, min={stats['min_tau']:.4f}, max={stats['max_tau']:.4f} (n={stats['count']})")
if args.verbose:
print(f"\nPer-query details:")