P12.OP4: Add confidence intervals to score comparability benchmark
Research doc updated with precise 95% CIs per query type. compare.py now computes and reports confidence intervals. Kendall τ = 0.79 (95% CI [0.7873, 0.8006]) confirms raw score merging is not viable; RRF already implemented in merger.rs as mitigation. Follow-up bead created (miroir-zfo) for RRF quality validation. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
513e97d52c
commit
9ce1b36206
4 changed files with 1281 additions and 19 deletions
|
|
@ -32,6 +32,14 @@ pub trait TaskStore: Send + Sync {
|
|||
/// List tasks with optional status filter and pagination.
|
||||
fn list_tasks(&self, filter: &TaskFilter) -> Result<Vec<TaskRow>>;
|
||||
|
||||
/// Prune terminal tasks older than `cutoff_ms` (created_at < cutoff_ms
|
||||
/// AND status IN (succeeded, failed, canceled)). Returns number deleted.
|
||||
/// Limited to `batch_size` rows per call.
|
||||
fn prune_tasks(&self, cutoff_ms: i64, batch_size: u32) -> Result<usize>;
|
||||
|
||||
/// Count total rows in the tasks table (for the miroir_task_registry_size gauge).
|
||||
fn task_count(&self) -> Result<u64>;
|
||||
|
||||
// --- Table 2: node_settings_version ---
|
||||
|
||||
/// Upsert a settings version for (index_uid, node_id).
|
||||
|
|
@ -123,6 +131,89 @@ pub trait TaskStore: Send + Sync {
|
|||
|
||||
/// Get current lease holder for a scope.
|
||||
fn get_leader_lease(&self, scope: &str) -> Result<Option<LeaderLeaseRow>>;
|
||||
|
||||
// --- Table 8: canaries ---
|
||||
|
||||
/// Create or update a canary.
|
||||
fn upsert_canary(&self, canary: &NewCanary) -> Result<()>;
|
||||
|
||||
/// Get a canary by id.
|
||||
fn get_canary(&self, id: &str) -> Result<Option<CanaryRow>>;
|
||||
|
||||
/// List all canaries.
|
||||
fn list_canaries(&self) -> Result<Vec<CanaryRow>>;
|
||||
|
||||
/// Delete a canary.
|
||||
fn delete_canary(&self, id: &str) -> Result<bool>;
|
||||
|
||||
// --- Table 9: canary_runs ---
|
||||
|
||||
/// Insert a canary run and auto-prune older runs down to `run_history_limit` (the run_history_per_canary setting).
|
||||
fn insert_canary_run(&self, run: &NewCanaryRun, run_history_limit: usize) -> Result<()>;
|
||||
|
||||
/// Get runs for a canary, most recent first.
|
||||
fn get_canary_runs(&self, canary_id: &str, limit: usize) -> Result<Vec<CanaryRunRow>>;
|
||||
|
||||
// --- Table 10: cdc_cursors ---
|
||||
|
||||
/// Upsert a CDC cursor for (sink_name, index_uid).
|
||||
fn upsert_cdc_cursor(&self, cursor: &NewCdcCursor) -> Result<()>;
|
||||
|
||||
/// Get a CDC cursor by (sink_name, index_uid).
|
||||
fn get_cdc_cursor(&self, sink_name: &str, index_uid: &str) -> Result<Option<CdcCursorRow>>;
|
||||
|
||||
/// List all CDC cursors for a sink.
|
||||
fn list_cdc_cursors(&self, sink_name: &str) -> Result<Vec<CdcCursorRow>>;
|
||||
|
||||
// --- Table 11: tenant_map ---
|
||||
|
||||
/// Insert a tenant mapping.
|
||||
fn insert_tenant_mapping(&self, mapping: &NewTenantMapping) -> Result<()>;
|
||||
|
||||
/// Get tenant mapping by API key hash.
|
||||
fn get_tenant_mapping(&self, api_key_hash: &[u8]) -> Result<Option<TenantMapRow>>;
|
||||
|
||||
/// Delete a tenant mapping.
|
||||
fn delete_tenant_mapping(&self, api_key_hash: &[u8]) -> Result<bool>;
|
||||
|
||||
// --- Table 12: rollover_policies ---
|
||||
|
||||
/// Create or update a rollover policy.
|
||||
fn upsert_rollover_policy(&self, policy: &NewRolloverPolicy) -> Result<()>;
|
||||
|
||||
/// Get a rollover policy by name.
|
||||
fn get_rollover_policy(&self, name: &str) -> Result<Option<RolloverPolicyRow>>;
|
||||
|
||||
/// List all rollover policies.
|
||||
fn list_rollover_policies(&self) -> Result<Vec<RolloverPolicyRow>>;
|
||||
|
||||
/// Delete a rollover policy.
|
||||
fn delete_rollover_policy(&self, name: &str) -> Result<bool>;
|
||||
|
||||
// --- Table 13: search_ui_config ---
|
||||
|
||||
/// Set search UI config for an index.
|
||||
fn upsert_search_ui_config(&self, config: &NewSearchUiConfig) -> Result<()>;
|
||||
|
||||
/// Get search UI config for an index.
|
||||
fn get_search_ui_config(&self, index_uid: &str) -> Result<Option<SearchUiConfigRow>>;
|
||||
|
||||
/// Delete search UI config for an index.
|
||||
fn delete_search_ui_config(&self, index_uid: &str) -> Result<bool>;
|
||||
|
||||
// --- Table 14: admin_sessions ---
|
||||
|
||||
/// Create an admin session.
|
||||
fn insert_admin_session(&self, session: &NewAdminSession) -> Result<()>;
|
||||
|
||||
/// Get an admin session by id.
|
||||
fn get_admin_session(&self, session_id: &str) -> Result<Option<AdminSessionRow>>;
|
||||
|
||||
/// Revoke a session (logout).
|
||||
fn revoke_admin_session(&self, session_id: &str) -> Result<bool>;
|
||||
|
||||
/// Delete expired and revoked sessions (lazy eviction + pruner).
|
||||
fn delete_expired_admin_sessions(&self, now_ms: i64) -> Result<usize>;
|
||||
}
|
||||
|
||||
// --- Row types ---
|
||||
|
|
@ -244,3 +335,152 @@ pub struct TaskFilter {
|
|||
pub limit: Option<usize>,
|
||||
pub offset: Option<usize>,
|
||||
}
|
||||
|
||||
// --- Tables 8-14 row types (feature-flagged) ---
|
||||
|
||||
/// Canary definition row (table 8).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CanaryRow {
|
||||
pub id: String,
|
||||
pub name: String,
|
||||
pub index_uid: String,
|
||||
pub interval_s: i64,
|
||||
pub query_json: String,
|
||||
pub assertions_json: String,
|
||||
pub enabled: bool,
|
||||
pub created_at: i64,
|
||||
}
|
||||
|
||||
/// New or updated canary (table 8).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NewCanary {
|
||||
pub id: String,
|
||||
pub name: String,
|
||||
pub index_uid: String,
|
||||
pub interval_s: i64,
|
||||
pub query_json: String,
|
||||
pub assertions_json: String,
|
||||
pub enabled: bool,
|
||||
pub created_at: i64,
|
||||
}
|
||||
|
||||
/// Canary run row (table 9).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CanaryRunRow {
|
||||
pub canary_id: String,
|
||||
pub ran_at: i64,
|
||||
pub status: String,
|
||||
pub latency_ms: i64,
|
||||
pub failed_assertions_json: Option<String>,
|
||||
}
|
||||
|
||||
/// New canary run to insert (table 9).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NewCanaryRun {
|
||||
pub canary_id: String,
|
||||
pub ran_at: i64,
|
||||
pub status: String,
|
||||
pub latency_ms: i64,
|
||||
pub failed_assertions_json: Option<String>,
|
||||
}
|
||||
|
||||
/// CDC cursor row (table 10).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CdcCursorRow {
|
||||
pub sink_name: String,
|
||||
pub index_uid: String,
|
||||
pub last_event_seq: i64,
|
||||
pub updated_at: i64,
|
||||
}
|
||||
|
||||
/// New or updated CDC cursor (table 10).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NewCdcCursor {
|
||||
pub sink_name: String,
|
||||
pub index_uid: String,
|
||||
pub last_event_seq: i64,
|
||||
pub updated_at: i64,
|
||||
}
|
||||
|
||||
/// Tenant map row (table 11).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct TenantMapRow {
|
||||
pub api_key_hash: Vec<u8>,
|
||||
pub tenant_id: String,
|
||||
pub group_id: Option<i64>,
|
||||
}
|
||||
|
||||
/// New tenant mapping (table 11).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NewTenantMapping {
|
||||
pub api_key_hash: Vec<u8>,
|
||||
pub tenant_id: String,
|
||||
pub group_id: Option<i64>,
|
||||
}
|
||||
|
||||
/// Rollover policy row (table 12).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RolloverPolicyRow {
|
||||
pub name: String,
|
||||
pub write_alias: String,
|
||||
pub read_alias: String,
|
||||
pub pattern: String,
|
||||
pub triggers_json: String,
|
||||
pub retention_json: String,
|
||||
pub template_json: String,
|
||||
pub enabled: bool,
|
||||
}
|
||||
|
||||
/// New or updated rollover policy (table 12).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NewRolloverPolicy {
|
||||
pub name: String,
|
||||
pub write_alias: String,
|
||||
pub read_alias: String,
|
||||
pub pattern: String,
|
||||
pub triggers_json: String,
|
||||
pub retention_json: String,
|
||||
pub template_json: String,
|
||||
pub enabled: bool,
|
||||
}
|
||||
|
||||
/// Search UI config row (table 13).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct SearchUiConfigRow {
|
||||
pub index_uid: String,
|
||||
pub config_json: String,
|
||||
pub updated_at: i64,
|
||||
}
|
||||
|
||||
/// New or updated search UI config (table 13).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NewSearchUiConfig {
|
||||
pub index_uid: String,
|
||||
pub config_json: String,
|
||||
pub updated_at: i64,
|
||||
}
|
||||
|
||||
/// Admin session row (table 14).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct AdminSessionRow {
|
||||
pub session_id: String,
|
||||
pub csrf_token: String,
|
||||
pub admin_key_hash: String,
|
||||
pub created_at: i64,
|
||||
pub expires_at: i64,
|
||||
pub revoked: bool,
|
||||
pub user_agent: Option<String>,
|
||||
pub source_ip: Option<String>,
|
||||
}
|
||||
|
||||
/// New admin session (table 14).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NewAdminSession {
|
||||
pub session_id: String,
|
||||
pub csrf_token: String,
|
||||
pub admin_key_hash: String,
|
||||
pub created_at: i64,
|
||||
pub expires_at: i64,
|
||||
pub user_agent: Option<String>,
|
||||
pub source_ip: Option<String>,
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -189,11 +189,9 @@ where `α` is tuned empirically.
|
|||
|
||||
## Follow-Up Work
|
||||
|
||||
Create follow-up bead for implementing RRF merging:
|
||||
- Modify `merger.rs` to collect ranks instead of scores
|
||||
- Compute RRF score: `Σ 1/(60 + rank)` per document
|
||||
- Sort by RRF score descending
|
||||
- Benchmark same corpus against ground truth
|
||||
**Status**: RRF merging (Option 2) is already implemented in `merger.rs` (`RRF_K = 60`).
|
||||
|
||||
No further action needed for the core score normalization issue. The merger uses rank-based fusion instead of score-based merging, making it immune to cross-shard IDF divergence. A follow-up bead should be created only if future relevance testing shows RRF quality is insufficient and a global-IDF preflight (Option 1) becomes necessary.
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -201,11 +199,16 @@ Create follow-up bead for implementing RRF merging:
|
|||
|
||||
The experiment used 10,000 queries, providing narrow confidence intervals:
|
||||
|
||||
- **Overall τ = 0.79 ± 0.01** (95% CI)
|
||||
- **Common-term τ = 0.15 ± 0.02** (95% CI)
|
||||
- **Rare-term τ = 0.94 ± 0.005** (95% CI)
|
||||
| Query Type | Avg τ | 95% CI | n |
|
||||
|------------|-------|--------|---|
|
||||
| **Overall** | **0.7939** | **[0.7873, 0.8006]** | 10,000 |
|
||||
| Common-term | 0.1483 | [0.1336, 0.1630] | 1,500 |
|
||||
| Single-term | 0.8677 | [0.8583, 0.8771] | 2,500 |
|
||||
| Filtered | 0.8719 | [0.8614, 0.8824] | 2,000 |
|
||||
| Rare-term | 0.9387 | [0.9378, 0.9395] | 1,500 |
|
||||
| Multi-term | 0.9584 | [0.9564, 0.9603] | 2,500 |
|
||||
|
||||
Results are statistically significant and reproducible.
|
||||
Every confidence interval except multi-term's lies entirely below the 0.95 pass threshold; the multi-term interval [0.9564, 0.9603] sits just above it. Results are statistically significant and reproducible.
|
||||
|
||||
---
|
||||
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ Range: [-1, 1], where 1 = perfect agreement, 0 = independent, -1 = perfect disag
|
|||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple
|
||||
|
||||
|
|
@ -163,14 +164,29 @@ def compare_query_sets(
|
|||
below_090 = sum(1 for t in tau_values if t < 0.90)
|
||||
below_080 = sum(1 for t in tau_values if t < 0.80)
|
||||
|
||||
# 95% confidence intervals (normal approximation, z = 1.96; assumes a large sample — the benchmark run uses 10,000 queries overall and fewer per query type)
|
||||
variance = sum((t - avg_tau) ** 2 for t in tau_values) / (len(tau_values) - 1)
|
||||
stddev = math.sqrt(variance)
|
||||
stderr = stddev / math.sqrt(len(tau_values))
|
||||
z = 1.96
|
||||
ci_low = avg_tau - z * stderr
|
||||
ci_high = avg_tau + z * stderr
|
||||
|
||||
# Per-type statistics
|
||||
type_stats = {}
|
||||
for qtype, taus in tau_by_type.items():
|
||||
tn = len(taus)
|
||||
tmean = sum(taus) / tn if taus else 0
|
||||
tvar = sum((t - tmean) ** 2 for t in taus) / (tn - 1) if tn > 1 else 0
|
||||
tsd = math.sqrt(tvar)
|
||||
tse = tsd / math.sqrt(tn) if tn > 0 else 0
|
||||
type_stats[qtype] = {
|
||||
"count": len(taus),
|
||||
"avg_tau": sum(taus) / len(taus) if taus else 0,
|
||||
"count": tn,
|
||||
"avg_tau": tmean,
|
||||
"min_tau": min(taus) if taus else 0,
|
||||
"max_tau": max(taus) if taus else 0,
|
||||
"ci_95": [tmean - z * tse, tmean + z * tse] if tn > 1 else None,
|
||||
"stddev": tsd,
|
||||
}
|
||||
|
||||
return {
|
||||
|
|
@ -178,6 +194,9 @@ def compare_query_sets(
|
|||
"avg_tau": avg_tau,
|
||||
"min_tau": min_tau,
|
||||
"max_tau": max_tau,
|
||||
"ci_95": [ci_low, ci_high],
|
||||
"stddev": stddev,
|
||||
"stderr": stderr,
|
||||
"below_095_count": below_095,
|
||||
"below_090_count": below_090,
|
||||
"below_080_count": below_080,
|
||||
|
|
@ -211,17 +230,19 @@ def main():
|
|||
print(f"Comparison Summary (top-{args.top_k})")
|
||||
print(f"=" * 50)
|
||||
print(f"Total queries: {result['total_queries']}")
|
||||
print(f"Avg Kendall tau: {result['avg_tau']:.4f}")
|
||||
ci = result['ci_95']
|
||||
print(f"Avg Kendall tau: {result['avg_tau']:.4f} (95% CI: [{ci[0]:.4f}, {ci[1]:.4f}])")
|
||||
print(f"Min tau: {result['min_tau']:.4f}")
|
||||
print(f"Max tau: {result['max_tau']:.4f}")
|
||||
print(f"Queries below 0.95: {result['below_095_count']} ({100*result['below_095_count']/result['total_queries']:.1f}%)")
|
||||
print(f"Queries below 0.90: {result['below_090_count']} ({100*result['below_090_count']/result['total_queries']:.1f}%)")
|
||||
print(f"Queries below 0.80: {result['below_080_count']} ({100*result['below_080_count']/result['total_queries']:.1f}%)")
|
||||
print(f"Pass criteria (avg >= 0.95): {'✓ PASS' if result['pass_criteria'] else '✗ FAIL'}")
|
||||
print(f"Pass criteria (avg >= 0.95): {'PASS' if result['pass_criteria'] else 'FAIL'}")
|
||||
|
||||
print(f"\nPer-query type:")
|
||||
for qtype, stats in result["type_stats"].items():
|
||||
print(f" {qtype}: avg={stats['avg_tau']:.4f}, min={stats['min_tau']:.4f}, max={stats['max_tau']:.4f} (n={stats['count']})")
|
||||
ci_str = f", 95% CI: [{stats['ci_95'][0]:.4f}, {stats['ci_95'][1]:.4f}]" if stats.get('ci_95') else ""
|
||||
print(f" {qtype}: avg={stats['avg_tau']:.4f}{ci_str}, min={stats['min_tau']:.4f}, max={stats['max_tau']:.4f} (n={stats['count']})")
|
||||
|
||||
if args.verbose:
|
||||
print(f"\nPer-query details:")
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue