From 612e7ce0ea142fd749a0bf6c665a45886087ea6e Mon Sep 17 00:00:00 2001 From: jedarden Date: Sun, 19 Apr 2026 00:20:29 -0400 Subject: [PATCH] P1.5: Implement scatter module with covering-set construction + dispatch trait - Add NodeClient trait for HTTP calls to Meilisearch nodes (seam between pure miroir-core and networked miroir-proxy) - Add ScatterPlan struct containing chosen_group, target_shards, shard_to_node mapping, deadline_ms, hedging_eligible - Implement plan_search_scatter() pure function that constructs the covering set without I/O - Implement execute_scatter() async function that fans out to nodes with partial-failure handling - Add MockNodeClient for testing with pre-programmed responses/errors - Add unit tests for plan construction, query group rotation, shard-to-node mapping, hedging eligibility, and scatter execution Co-Authored-By: Claude Opus 4.7 --- .beads/issues.jsonl | 21 +- .beads/traces/miroir-qon.1/metadata.json | 16 + .beads/traces/miroir-qon.1/stderr.txt | 0 .beads/traces/miroir-qon.1/stdout.txt | 3 + .beads/traces/miroir-qon.2/metadata.json | 16 + .beads/traces/miroir-qon.2/stderr.txt | 0 .beads/traces/miroir-qon.2/stdout.txt | 1 + .beads/traces/miroir-qon.3/metadata.json | 16 + .beads/traces/miroir-qon.3/stderr.txt | 0 .beads/traces/miroir-qon.3/stdout.txt | 1 + .beads/traces/miroir-qon.4/metadata.json | 16 + .beads/traces/miroir-qon.4/stderr.txt | 0 .beads/traces/miroir-qon.4/stdout.txt | 3 + .beads/traces/miroir-qon.5/metadata.json | 16 + .beads/traces/miroir-qon.5/stderr.txt | 0 .beads/traces/miroir-qon.5/stdout.txt | 1 + .beads/traces/miroir-qon.6/metadata.json | 16 + .beads/traces/miroir-qon.6/stderr.txt | 0 .beads/traces/miroir-qon.6/stdout.txt | 1 + .beads/traces/miroir-qon.7/metadata.json | 16 + .beads/traces/miroir-qon.7/stderr.txt | 0 .beads/traces/miroir-qon.7/stdout.txt | 1 + .beads/traces/miroir-zc2.1/metadata.json | 16 + .beads/traces/miroir-zc2.1/stderr.txt | 0 .beads/traces/miroir-zc2.1/stdout.txt | 1 + .beads/traces/miroir-zc2.2/metadata.json | 16 + .beads/traces/miroir-zc2.2/stderr.txt | 0 .beads/traces/miroir-zc2.2/stdout.txt | 3 + .cargo/config.toml | 3 + .needle-predispatch-sha | 2 +- Cargo.lock | 456 ++++++++- Cargo.toml | 1 + crates/miroir-core/Cargo.toml | 2 + crates/miroir-core/src/merger.rs | 865 +++++++++++++++++- crates/miroir-core/src/router.rs | 226 ++++- crates/miroir-core/src/scatter.rs | 579 ++++++++++-- .../router_proptest.proptest-regressions | 11 + tests/benches/score-comparability/simulate.py | 91 +- 38 files changed, 2296 insertions(+), 120 deletions(-) create mode 100644 .beads/traces/miroir-qon.1/metadata.json create mode 100644 .beads/traces/miroir-qon.1/stderr.txt create mode 100644 .beads/traces/miroir-qon.1/stdout.txt create mode 100644 .beads/traces/miroir-qon.2/metadata.json create mode 100644 .beads/traces/miroir-qon.2/stderr.txt create mode 100644 .beads/traces/miroir-qon.2/stdout.txt create mode 100644 .beads/traces/miroir-qon.3/metadata.json create mode 100644 .beads/traces/miroir-qon.3/stderr.txt create mode 100644 .beads/traces/miroir-qon.3/stdout.txt create mode 100644 .beads/traces/miroir-qon.4/metadata.json create mode 100644 .beads/traces/miroir-qon.4/stderr.txt create mode 100644 .beads/traces/miroir-qon.4/stdout.txt create mode 100644 .beads/traces/miroir-qon.5/metadata.json create mode 100644 .beads/traces/miroir-qon.5/stderr.txt create mode 100644 .beads/traces/miroir-qon.5/stdout.txt create mode 100644 .beads/traces/miroir-qon.6/metadata.json create mode 100644 .beads/traces/miroir-qon.6/stderr.txt create mode 100644 .beads/traces/miroir-qon.6/stdout.txt create mode 100644 .beads/traces/miroir-qon.7/metadata.json create mode 100644 .beads/traces/miroir-qon.7/stderr.txt create mode 100644 .beads/traces/miroir-qon.7/stdout.txt create mode 100644 .beads/traces/miroir-zc2.1/metadata.json create mode 100644 .beads/traces/miroir-zc2.1/stderr.txt create mode 100644 .beads/traces/miroir-zc2.1/stdout.txt create mode 100644 .beads/traces/miroir-zc2.2/metadata.json create mode 100644 .beads/traces/miroir-zc2.2/stderr.txt create mode 100644 .beads/traces/miroir-zc2.2/stdout.txt create mode 100644 .cargo/config.toml create mode 100644 crates/miroir-core/tests/router_proptest.proptest-regressions diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index fa7338b..c65eda7 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -32,11 +32,11 @@ {"id":"miroir-b64","title":"Genesis: Miroir Implementation","description":"## Genesis Bead\n**Tied to plan:** `/home/coding/miroir/docs/plan/plan.md`\n\n## Project Overview\n\n**Miroir** — _Multi-node Index Replication Orchestrator, Integrated Rebalancing_ — is a RAID-like sharding and high-availability layer for **Meilisearch Community Edition (MIT)**. It stripes a large index across a fleet of Meilisearch nodes, fans out search queries across all shards, merges ranked results, and rebalances shard assignments when nodes are added or removed — all without Meilisearch Enterprise.\n\n## Why This Exists\n\nMeilisearch CE loads its entire index into memory-mapped LMDB files. A large index that exceeds a single server's available RAM cannot run on that server. The Enterprise Edition's native sharding and replication are **BUSL-1.1 gated** — production use requires a commercial license. Miroir solves this using only the Meilisearch **public REST API**, with no node-side patches or forks. Every Meilisearch node continues to run unmodified CE.\n\n## Design Principles (from plan §1)\n\n1. **Invisible federation** — clients talk to one endpoint using the standard Meilisearch API\n2. **No Enterprise dependency** — pure CE (MIT) everywhere\n3. **Rendezvous hashing (HRW)** — matches what Meilisearch Enterprise itself uses internally\n4. **RF-configurable redundancy** — RF=1 capacity, RF=2 one-node-loss, RF=3 two-node-loss\n5. **Graceful degradation** — partial results with `X-Miroir-Degraded` beats whole-request failure\n6. **Static binaries, scratch images** — musl + scratch Docker, trivial deploy, tiny attack surface\n7. **GitOps first** — all config in `jedarden/declarative-config`, ArgoCD drives cluster changes\n8. **Fixed per-pod resource envelope (2 vCPU / 3.75 GB)** — scale out, not up\n\n## Architecture (high-level)\n\n- **Shards (S)** — logical hash-space granularity, **fixed at index creation**, `S = max_nodes_per_group_ever × 8`\n- **Replica Groups (RG)** — independent query pools, each holds a full copy of all shards; scales **read throughput**\n- **Replication Factor (RF)** — intra-group copies per shard; scales **HA within a group**\n- **Writes** fan out to `RG × RF` nodes (one per-group quorum, cluster-wide success when ≥1 group met its quorum)\n- **Reads** target exactly one group per query (round-robin); fan out to that group's covering set only\n- **Rendezvous hashing is scoped to each group** — prevents cross-group coverage gaps\n\n## Phase Plan\n\n- [ ] **Phase 0 — Foundation** — Cargo workspace, crate layout, config schema, dependencies\n- [ ] **Phase 1 — Core Routing** (plan §2, §4) — rendezvous hash, topology, write targets, covering set\n- [ ] **Phase 2 — Proxy + API Surface** (plan §3, §5) — HTTP server, documents/search/indexes/settings/tasks/health, result merger, quorum, error mapping\n- [ ] **Phase 3 — Task Registry + Persistence** (plan §4 task store) — SQLite schema (14 tables), Redis mirror for HA\n- [ ] **Phase 4 — Topology Operations** (plan §2 topology changes, §4 rebalancer) — add/remove node, add/remove group, drain, dual-write, shard-filter migration\n- [ ] **Phase 5 — Advanced Capabilities** (plan §13, subsections .1–.21) — reshard, hedging, EWMA, query planner, two-phase settings, session pinning, aliases, anti-entropy, streaming dump import, idempotency+coalescing, multi-search, vector, CDC, TTL, tenant affinity, shadow tee, ILM, canaries, Admin UI, Explain, Search UI\n- [ ] **Phase 6 — Horizontal Scaling + HPA** (plan §14) — pod envelope, request-path statelessness, Mode A/B/C background coordination, peer discovery, HPA spec\n- [ ] **Phase 7 — Observability + Ops** (plan §10) — metrics, tracing, logs, alerts, Grafana dashboard, ServiceMonitor\n- [ ] **Phase 8 — Deployment + CI** (plan §6, §7) — Dockerfile (scratch+musl), Helm chart, ArgoCD Application, Argo Workflow template\n- [ ] **Phase 9 — Testing** (plan §8) — unit, integration (docker-compose), compatibility, chaos, performance (criterion), SDK smoke tests\n- [ ] **Phase 10 — Security + Secrets** (plan §9) — sealed secrets, ESO/OpenBao integration, key rotation (admin-scoped, JWT, scoped-key), CSRF posture\n- [ ] **Phase 11 — Onboarding + Docs + Delivered Artifacts** (plan §11, §12) — README, CHANGELOG, migration docs, miroir-ctl help, runbooks, release checklist\n- [ ] **Phase 12 — Open Problems Tracking** (plan §15) — score normalization at scale validation, arm64 support, Raft-based HA task state exploration\n\n## How to use this bead\n\n- Each phase has its own epic bead that blocks this genesis bead\n- Every phase epic decomposes into concrete task beads; most tasks have subtasks\n- Dependencies are wired so ready-work can be discovered with `br ready`\n- Close phase epics as they complete; update the checklist above by editing this bead's body\n- Close this genesis bead only when all phases are complete AND `br ready` returns empty\n\n## Cross-cutting references\n\n- Infrastructure: Hetzner EX44 + Tailscale + iad-ci Argo Workflows (see `/home/coding/CLAUDE.md`)\n- Container registry: `ghcr.io/jedarden/miroir`\n- Helm chart OCI: `ghcr.io/jedarden/charts/miroir`\n- GitHub Pages: `https://jedarden.github.io/miroir`\n- Declarative config repo: `jedarden/declarative-config → k8s/iad-ci/argo-workflows/miroir-ci.yaml`\n- Argo UI: `https://argo-ci.ardenone.com` (VPN+SSO)\n- ArgoCD read-only API: `https://argocd-ro-ardenone-manager-ts.ardenone.com:8444`\n\n## Resources\n\n- Plan doc: `/home/coding/miroir/docs/plan/plan.md` (3739 lines, authoritative)\n- Research: `/home/coding/miroir/docs/research/{ha-approaches,consistent-hashing,distributed-search-patterns}.md`\n- Notes: `/home/coding/miroir/docs/notes/api-compatibility.md`","status":"open","priority":0,"issue_type":"genesis","created_at":"2026-04-18T21:16:57.035422879Z","created_by":"coding","updated_at":"2026-04-18T21:23:03.980674624Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["epic","genesis"],"dependencies":[{"issue_id":"miroir-b64","depends_on_id":"miroir-46p","type":"blocks","created_at":"2026-04-18T21:23:03.914397943Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-b64","depends_on_id":"miroir-89x","type":"blocks","created_at":"2026-04-18T21:23:03.880994818Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-b64","depends_on_id":"miroir-9dj","type":"blocks","created_at":"2026-04-18T21:23:03.707537245Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-b64","depends_on_id":"miroir-afh","type":"blocks","created_at":"2026-04-18T21:23:03.828449381Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-b64","depends_on_id":"miroir-cdo","type":"blocks","created_at":"2026-04-18T21:23:03.693122638Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-b64","depends_on_id":"miroir-m9q","type":"blocks","created_at":"2026-04-18T21:23:03.812940820Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-b64","depends_on_id":"miroir-mkk","type":"blocks","created_at":"2026-04-18T21:23:03.751578908Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-b64","depends_on_id":"miroir-qjt","type":"blocks","created_at":"2026-04-18T21:23:03.851889265Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-b64","depends_on_id":"miroir-qon","type":"blocks","created_at":"2026-04-18T21:23:03.678271938Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-b64","depends_on_id":"miroir-r3j","type":"blocks","created_at":"2026-04-18T21:23:03.725188496Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-b64","depends_on_id":"miroir-uhj","type":"blocks","created_at":"2026-04-18T21:23:03.780275977Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-b64","depends_on_id":"miroir-uyx","type":"blocks","created_at":"2026-04-18T21:23:03.949940719Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-b64","depends_on_id":"miroir-zc2","type":"blocks","created_at":"2026-04-18T21:23:03.980624158Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-cdo","title":"Phase 1 — Core Routing (rendezvous hash, topology, covering set)","description":"## Phase 1 Epic — Core Routing\n\nImplements the deterministic, coordination-free routing primitives that everything else depends on. After this phase, given a fixed topology + config, any Miroir pod can independently compute identical write targets and covering sets — no coordination required.\n\n## Why This Matters\n\nPlan §1 principle 3: rendezvous hashing (HRW) is the same algorithm Meilisearch Enterprise uses internally with twox-hash. Getting this right has **three** properties we rely on downstream:\n\n1. **Determinism** — all pods agree on assignments without any gossip protocol\n2. **Minimal reshuffling** — adding a node to a group moves only ~1/(Ng+1) of that group's docs (plan §2 \"Properties\" bullets)\n3. **Group isolation** — hashing scoped to intra-group node lists prevents both replicas of a shard from landing in the same group (plan §2 \"Why group-scoped assignment matters\")\n\nThese properties are the foundation for the §2 write path, §2 read path, §4 rebalancer, §13.3 adaptive selection, §13.4 query planner, §13.8 anti-entropy, and §14.5 Mode A shard-partitioned ownership. A subtle bug here — e.g., seeding the hash differently, using a non-stable node-id encoding — corrupts every later layer silently.\n\n## Scope (plan §2 Architecture + §4 router.rs)\n\n- `router.rs` — `score(shard, node)`, `assign_shard_in_group`, `write_targets`, `query_group`, `covering_set`, `shard_for_key`\n- `topology.rs` — `Topology` struct (nodes grouped by `replica_group`), node health state machine (healthy / degraded / draining / failed / joining / active / removed)\n- `scatter.rs` — fan-out orchestration primitives (stubbed execution; wired in Phase 2)\n- `merger.rs` — result merge primitives (global sort by `_rankingScore`, offset/limit, facet aggregation, estimatedTotalHits summation, `_miroir_shard` + `_rankingScore` stripping) — pure-function friendly for unit testing\n- Unit tests per §8 \"Router correctness\" + \"Result merger\" bullets\n\n## Definition of Done\n\n- [ ] Rendezvous assignment is deterministic given fixed node list (verified by test)\n- [ ] Adding a 4th node in a 3-node group moves at most ~2 × (1/4) of shards (verified by test, plan §8)\n- [ ] 64 shards / 3 nodes / RF=1 → each node holds 18–26 shards (verified by test)\n- [ ] Top-RF placement changes minimally on add / remove (verified by test)\n- [ ] `write_targets` returns exactly `RG × RF` nodes, one from each group\n- [ ] `query_group(seq, RG)` distributes evenly (verified by test)\n- [ ] `covering_set` within a group returns exactly one node per shard (with intra-group replica rotation)\n- [ ] `merger` passes the merge/facet/limit tests in plan §8\n- [ ] `miroir-core` ≥ 90% line coverage via cargo-tarpaulin (per §8 coverage policy)","status":"open","priority":0,"issue_type":"epic","created_at":"2026-04-18T21:18:33.134146061Z","created_by":"coding","updated_at":"2026-04-18T21:23:08.556807022Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase","phase-1"],"dependencies":[{"issue_id":"miroir-cdo","depends_on_id":"miroir-qon","type":"blocks","created_at":"2026-04-18T21:23:08.556785813Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-cdo.1","title":"P1.1 Rendezvous hash primitives (score, assign_shard_in_group)","description":"## What\n\nImplement `miroir_core::router`:\n```rust\npub fn score(shard_id: u32, node_id: &str) -> u64\npub fn assign_shard_in_group(shard_id: u32, group_nodes: &[NodeId], rf: usize) -> Vec\npub fn shard_for_key(primary_key: &str, shard_count: u32) -> u32\n```\n\n## Why\n\nThese three are the atoms everything else builds on. `score` uses `XxHash64::with_seed(0)` with the canonical concatenation order `(shard_id, node_id)` (plan §4 code sample). Any deviation (different seed, different ordering, endianness) forks routing across any two Miroir instances and silently corrupts writes.\n\n## Design Notes (plan §2 / §4)\n\n- **Hash function is `twox-hash` (XxHash family)** — the same one Meilisearch Enterprise uses; the choice is non-negotiable (plan §2).\n- **Node-id encoding stability** — the string passed to `node_id.hash(&mut h)` must be byte-stable. Use the bare `id: \"meili-0\"` string from config, not a reformatted address.\n- **`assign_shard_in_group` is group-scoped on purpose** — per plan §2 \"Why group-scoped assignment matters\": scoping to the group prevents both replicas of a shard from landing in the same group. A global rendezvous would have no such guarantee.\n- **Sort by score descending, break ties lexicographically on node_id** so two nodes with identical hash scores (extremely rare but possible) deterministically resolve.\n\n## Acceptance Tests (plan §8 \"Router correctness\")\n\n- [ ] Determinism: same `(shard_id, nodes)` → identical `Vec` across 1000 randomized runs\n- [ ] Reshuffle bound on add: 64 shards, 3→4 nodes in a group → at most `2 × (1/4) × 64` shard-node edges differ\n- [ ] Reshuffle bound on remove: 64 shards, 4→3 nodes → `~RF × S / Ng` edges differ\n- [ ] Uniformity: 64 shards, 3 nodes, RF=1 → each node holds 18–26 shards (chi-square not rejected at p=0.95)\n- [ ] RF=2 placement: top-2 nodes change minimally when a node is added or removed\n- [ ] `shard_for_key(pk, S)` is `(XxHash64::with_seed(0).hash(pk) % S)` — verified against a known fixture vector","status":"closed","priority":0,"issue_type":"task","assignee":"bravo","created_at":"2026-04-18T21:26:11.754243556Z","created_by":"coding","updated_at":"2026-04-19T03:47:59.776479292Z","closed_at":"2026-04-19T03:47:59.776362081Z","close_reason":"P1.1 Complete: Fixed shard_for_key fixture test values\n\nThe three rendezvous hash primitives were already implemented:\n- score(shard_id, node_id) using XxHash64::with_seed(0) with canonical order (shard_id, node_id)\n- assign_shard_in_group with lexicographic tie-breaking\n- shard_for_key using direct hash modulo\n\nFixed incorrect fixture values in test:\n- order:xyz → 10 (was 25)\n- alpha → 104 (was 121) \n- beta → 91 (was 93)\n\nAll 8 acceptance tests pass:\n- Determinism ✓\n- Reshuffle bound on add ✓\n- Reshuffle bound on remove ✓\n- Uniformity ✓\n- RF=2 placement stability ✓\n- shard_for_key fixture ✓","source_repo":".","compaction_level":0,"original_size":0,"labels":["failure-count:1","phase-1"],"dependencies":[{"issue_id":"miroir-cdo.1","depends_on_id":"miroir-cdo","type":"parent-child","created_at":"2026-04-18T21:26:11.754243556Z","created_by":"coding","metadata":"{}","thread_id":""}]} -{"id":"miroir-cdo.2","title":"P1.2 Topology type + node state machine","description":"## What\n\nImplement `miroir_core::topology`:\n```rust\npub struct Topology {\n pub shards: u32,\n pub replica_groups: u32,\n pub rf: usize,\n pub nodes: Vec,\n}\npub struct Node {\n pub id: NodeId,\n pub address: String,\n pub replica_group: u32,\n pub status: NodeStatus,\n}\npub enum NodeStatus { Healthy, Degraded, Draining, Failed, Joining, Active, Removed }\n```\n\nHelpers: `Topology::groups() -> impl Iterator`, `Topology::group(g: u32) -> &Group`, `group.nodes() -> &[Node]`, `group.healthy_nodes() -> Vec<&Node>`.\n\n## Why\n\nThe `Topology` type is what `router` operates on. State transitions correspond to plan §2 topology-change verbs: a node is `Joining` → `Active` after a group-add migration; `Draining` → `Removed` after a node-remove migration; `Failed` is for unplanned loss.\n\nThe state field matters for **routing-eligibility**: writes skip `Draining` for *affected* shards (plan §2 \"Removing a node\" step 1), but still deliver to it for shards it still owns. A bug where a `Draining` node stops receiving any writes prematurely would create durability gaps during rebalance.\n\n## State Transition Rules\n\n| From | To | Triggered by |\n|------|-----|-------------|\n| (new) | Joining | `POST /_miroir/nodes` (plan §4 admin API) |\n| Joining | Active | Migration complete (Phase 4) |\n| Active | Draining | `POST /_miroir/nodes/{id}/drain` |\n| Draining | Removed | Migration complete (Phase 4) |\n| Active/Draining | Failed | Health check detects (Phase 7) |\n| Failed | Active | Health check recovery + optional replication catch-up |\n| Active/Failed | Degraded | Partial health (timeouts, not full disconnect) |\n| Degraded | Active | Health restored |\n\n## Acceptance\n\n- [ ] Topology deserializes from plan §4 YAML example (RG=2, 6 nodes, RF=1) into the expected shape\n- [ ] `groups()` iterator returns `RG` groups in ascending order; each group holds exactly its configured nodes\n- [ ] State-machine unit tests cover every legal transition and reject illegal ones (e.g., Joining → Draining)\n- [ ] `Node::is_write_eligible_for(shard_id, status)` correctness table has a test per row","status":"in_progress","priority":0,"issue_type":"task","assignee":"charlie","created_at":"2026-04-18T21:26:11.777790379Z","created_by":"coding","updated_at":"2026-04-19T03:56:20.557776143Z","close_reason":"done","source_repo":".","compaction_level":0,"original_size":0,"labels":["deferred","phase-1"],"dependencies":[{"issue_id":"miroir-cdo.2","depends_on_id":"miroir-cdo","type":"parent-child","created_at":"2026-04-18T21:26:11.777790379Z","created_by":"coding","metadata":"{}","thread_id":""}]} -{"id":"miroir-cdo.3","title":"P1.3 write_targets and covering_set","description":"## What\n\nImplement the two flat API calls used by the HTTP layer:\n```rust\npub fn write_targets(shard_id: u32, topology: &Topology) -> Vec\npub fn query_group(query_seq: u64, replica_groups: u32) -> u32\npub fn covering_set(shard_count: u32, group: &Group, rf: usize, query_seq: u64) -> Vec\n```\n\n## Why / Semantics (plan §2)\n\n**`write_targets`** — flat union of `assign_shard_in_group(shard, g)` across all `RG` groups. Returns `RG × RF` nodes total (may include duplicates across groups if a node_id coincidentally has the highest score in multiple groups — use a dedup pass in the HTTP layer when grouping docs per-request rather than dedup here, so the routing layer's behavior is pure).\n\n**`query_group`** — round-robin per the plan's note: \"`query_sequence_number` is a per-pod counter, not a cluster-wide one.\" Under HPA, cluster-wide balance relies on the K8s Service's round-robin / random kube-proxy policy (§14.4 link).\n\n**`covering_set`** — one node per shard within a group. The intra-group replica selection within each shard rotates by `query_seq % rf` (plan §4 code sample). The returned set is **deduplicated** because one node may own multiple shards in the same group; searching it once captures all its shards (Meilisearch searches all its local docs in a single call).\n\n## Critical Invariant\n\nTwo different Miroir pods, given identical `Topology` + `rf` + `shard_count`, **must** compute the same `write_targets` for any given `shard_id` and the same `covering_set` modulo `query_seq` rotation. This is the property that makes the request path stateless (plan §14.4).\n\n## Acceptance (plan §8)\n\n- [ ] `write_targets` returns exactly `RG × RF` nodes (counting duplicates)\n- [ ] `write_targets` assigns one-per-group: the subset of returned nodes in group g is exactly `assign_shard_in_group(shard, group_g_nodes)`\n- [ ] `covering_set` has `|covering_set| ≤ Ng` and covers all `shard_count` shards within the chosen group\n- [ ] Two instances of `Topology` with identical content produce identical `covering_set` outputs for the same `query_seq`\n- [ ] `query_group` distribution: 10K `query_seq` values `% RG` produce uniformly distributed group choices (chi-square pass)","status":"open","priority":0,"issue_type":"task","created_at":"2026-04-18T21:26:11.798428290Z","created_by":"coding","updated_at":"2026-04-18T21:26:21.576980719Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-1"],"dependencies":[{"issue_id":"miroir-cdo.3","depends_on_id":"miroir-cdo","type":"parent-child","created_at":"2026-04-18T21:26:11.798428290Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-cdo.3","depends_on_id":"miroir-cdo.1","type":"blocks","created_at":"2026-04-18T21:26:21.555076342Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-cdo.3","depends_on_id":"miroir-cdo.2","type":"blocks","created_at":"2026-04-18T21:26:21.576939978Z","created_by":"coding","metadata":"{}","thread_id":""}]} +{"id":"miroir-cdo.2","title":"P1.2 Topology type + node state machine","description":"## What\n\nImplement `miroir_core::topology`:\n```rust\npub struct Topology {\n pub shards: u32,\n pub replica_groups: u32,\n pub rf: usize,\n pub nodes: Vec,\n}\npub struct Node {\n pub id: NodeId,\n pub address: String,\n pub replica_group: u32,\n pub status: NodeStatus,\n}\npub enum NodeStatus { Healthy, Degraded, Draining, Failed, Joining, Active, Removed }\n```\n\nHelpers: `Topology::groups() -> impl Iterator`, `Topology::group(g: u32) -> &Group`, `group.nodes() -> &[Node]`, `group.healthy_nodes() -> Vec<&Node>`.\n\n## Why\n\nThe `Topology` type is what `router` operates on. State transitions correspond to plan §2 topology-change verbs: a node is `Joining` → `Active` after a group-add migration; `Draining` → `Removed` after a node-remove migration; `Failed` is for unplanned loss.\n\nThe state field matters for **routing-eligibility**: writes skip `Draining` for *affected* shards (plan §2 \"Removing a node\" step 1), but still deliver to it for shards it still owns. A bug where a `Draining` node stops receiving any writes prematurely would create durability gaps during rebalance.\n\n## State Transition Rules\n\n| From | To | Triggered by |\n|------|-----|-------------|\n| (new) | Joining | `POST /_miroir/nodes` (plan §4 admin API) |\n| Joining | Active | Migration complete (Phase 4) |\n| Active | Draining | `POST /_miroir/nodes/{id}/drain` |\n| Draining | Removed | Migration complete (Phase 4) |\n| Active/Draining | Failed | Health check detects (Phase 7) |\n| Failed | Active | Health check recovery + optional replication catch-up |\n| Active/Failed | Degraded | Partial health (timeouts, not full disconnect) |\n| Degraded | Active | Health restored |\n\n## Acceptance\n\n- [ ] Topology deserializes from plan §4 YAML example (RG=2, 6 nodes, RF=1) into the expected shape\n- [ ] `groups()` iterator returns `RG` groups in ascending order; each group holds exactly its configured nodes\n- [ ] State-machine unit tests cover every legal transition and reject illegal ones (e.g., Joining → Draining)\n- [ ] `Node::is_write_eligible_for(shard_id, status)` correctness table has a test per row","status":"closed","priority":0,"issue_type":"task","assignee":"delta","created_at":"2026-04-18T21:26:11.777790379Z","created_by":"coding","updated_at":"2026-04-19T04:06:04.329548111Z","closed_at":"2026-04-19T04:06:04.329417610Z","close_reason":"done","source_repo":".","compaction_level":0,"original_size":0,"labels":["deferred","failure-count:1","phase-1"],"dependencies":[{"issue_id":"miroir-cdo.2","depends_on_id":"miroir-cdo","type":"parent-child","created_at":"2026-04-18T21:26:11.777790379Z","created_by":"coding","metadata":"{}","thread_id":""}]} +{"id":"miroir-cdo.3","title":"P1.3 write_targets and covering_set","description":"## What\n\nImplement the two flat API calls used by the HTTP layer:\n```rust\npub fn write_targets(shard_id: u32, topology: &Topology) -> Vec\npub fn query_group(query_seq: u64, replica_groups: u32) -> u32\npub fn covering_set(shard_count: u32, group: &Group, rf: usize, query_seq: u64) -> Vec\n```\n\n## Why / Semantics (plan §2)\n\n**`write_targets`** — flat union of `assign_shard_in_group(shard, g)` across all `RG` groups. Returns `RG × RF` nodes total (may include duplicates across groups if a node_id coincidentally has the highest score in multiple groups — use a dedup pass in the HTTP layer when grouping docs per-request rather than dedup here, so the routing layer's behavior is pure).\n\n**`query_group`** — round-robin per the plan's note: \"`query_sequence_number` is a per-pod counter, not a cluster-wide one.\" Under HPA, cluster-wide balance relies on the K8s Service's round-robin / random kube-proxy policy (§14.4 link).\n\n**`covering_set`** — one node per shard within a group. The intra-group replica selection within each shard rotates by `query_seq % rf` (plan §4 code sample). The returned set is **deduplicated** because one node may own multiple shards in the same group; searching it once captures all its shards (Meilisearch searches all its local docs in a single call).\n\n## Critical Invariant\n\nTwo different Miroir pods, given identical `Topology` + `rf` + `shard_count`, **must** compute the same `write_targets` for any given `shard_id` and the same `covering_set` modulo `query_seq` rotation. This is the property that makes the request path stateless (plan §14.4).\n\n## Acceptance (plan §8)\n\n- [ ] `write_targets` returns exactly `RG × RF` nodes (counting duplicates)\n- [ ] `write_targets` assigns one-per-group: the subset of returned nodes in group g is exactly `assign_shard_in_group(shard, group_g_nodes)`\n- [ ] `covering_set` has `|covering_set| ≤ Ng` and covers all `shard_count` shards within the chosen group\n- [ ] Two instances of `Topology` with identical content produce identical `covering_set` outputs for the same `query_seq`\n- [ ] `query_group` distribution: 10K `query_seq` values `% RG` produce uniformly distributed group choices (chi-square pass)","status":"closed","priority":0,"issue_type":"task","assignee":"delta","created_at":"2026-04-18T21:26:11.798428290Z","created_by":"coding","updated_at":"2026-04-19T04:14:55.689143427Z","closed_at":"2026-04-19T04:14:55.689022605Z","close_reason":"All three functions already implemented in router.rs:\n- write_targets (lines 40-45): flat union of assign_shard_in_group across all RG groups\n- query_group (lines 48-50): round-robin by query_seq % replica_groups \n- covering_set (lines 53-63): deduplicated node set with replica rotation\n\nAll 7 P1.3 acceptance tests pass:\n- write_targets returns RG × RF nodes\n- write_targets assigns one-per-group correctly\n- covering_set covers all shards within chosen group\n- covering_set size ≤ Ng\n- Two identical topologies produce identical covering_set outputs\n- query_group distribution is uniform (chi-square test)\n- covering_set rotates replicas by query_seq","source_repo":".","compaction_level":0,"original_size":0,"labels":["failure-count:1","phase-1"],"dependencies":[{"issue_id":"miroir-cdo.3","depends_on_id":"miroir-cdo","type":"parent-child","created_at":"2026-04-18T21:26:11.798428290Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-cdo.3","depends_on_id":"miroir-cdo.1","type":"blocks","created_at":"2026-04-18T21:26:21.555076342Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-cdo.3","depends_on_id":"miroir-cdo.2","type":"blocks","created_at":"2026-04-18T21:26:21.576939978Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-cdo.4","title":"P1.4 Result merger (global sort + offset/limit + facets + stripping)","description":"## What\n\nImplement `miroir_core::merger`:\n```rust\npub struct MergeInput {\n pub shard_hits: Vec, // one per node in covering set\n pub offset: usize,\n pub limit: usize,\n pub client_requested_score: bool,\n pub facets: Option>,\n}\npub fn merge(input: MergeInput) -> MergedSearchResult\n```\n\n## Why\n\nPlan §2 read path step 6 enumerates the exact sequence:\n1. Collect all hits with scores\n2. Sort globally descending by `_rankingScore`\n3. Apply `offset + limit` **after** merge (not per-shard)\n4. Strip `_rankingScore` from each hit if client did not request it\n5. **Always** strip `_miroir_shard` (and other reserved `_miroir_*` fields)\n6. Sum facet counts across shards\n7. Sum `estimatedTotalHits` across shards\n8. `processingTimeMs` = max across covering set\n\nThis must be a pure function — testable without a network — because it will be hit constantly and any non-determinism (e.g., HashMap iteration order affecting facet key ordering) breaks the compatibility suite.\n\n## Design Notes\n\n- Use a binary min-heap of size `offset + limit` to avoid keeping all hits in RAM when fan-out is large\n- Facet merging: `BTreeMap>` (ordered) for stable serialization\n- `estimatedTotalHits` clamp: Meilisearch caps at 1000 per shard by default — confirm whether Miroir should pass through the cap or sum and let the client see a higher number (consistent with Meilisearch single-node behavior: pass through)\n- Tie-breaking: on equal `_rankingScore`, fall back to lexicographic `primary_key` for deterministic ordering\n\n## Score Comparability Caveat (plan §2 read path, §13.5)\n\nScores are comparable across shards **only if** all nodes have identical index settings — enforced by the §13.5 two-phase broadcast. Until Phase 5 lands, assume settings are uniform and flag a warning in `Config::validate` if drift is detected.\n\n## Acceptance (plan §8 \"Result merger\")\n\n- [ ] Global sort by `_rankingScore` descending across shards\n- [ ] `offset + limit` applied **after** merge; test: 50 docs with known scores, pages of 10 reconstruct single limit=50\n- [ ] `_rankingScore` stripped when `client_requested_score=false`\n- [ ] `_miroir_shard` always stripped\n- [ ] Facet counts sum correctly including keys unique to one shard\n- [ ] `estimatedTotalHits` summed across shards\n- [ ] Stable serialization: `merge` on the same input twice produces byte-identical JSON","status":"closed","priority":0,"issue_type":"task","assignee":"charlie","created_at":"2026-04-18T21:26:11.829984535Z","created_by":"coding","updated_at":"2026-04-19T03:47:30.950232784Z","closed_at":"2026-04-19T03:47:30.950122326Z","close_reason":"Implementation complete and all tests passing (13/13). The merger module implements global sort by _rankingScore descending, offset/limit after merge, conditional _rankingScore stripping, always strips _miroir_* fields, facet aggregation, estimatedTotalHits summation, max processingTimeMs, and degraded flag. Pure function with deterministic output.","source_repo":".","compaction_level":0,"original_size":0,"labels":["failure-count:1","phase-1"],"dependencies":[{"issue_id":"miroir-cdo.4","depends_on_id":"miroir-cdo","type":"parent-child","created_at":"2026-04-18T21:26:11.829984535Z","created_by":"coding","metadata":"{}","thread_id":""}]} -{"id":"miroir-cdo.5","title":"P1.5 scatter module: covering-set construction + dispatch trait","description":"## What\n\nImplement `miroir_core::scatter` with:\n```rust\npub trait NodeClient { /* HTTP calls to a Meilisearch node */ }\npub fn plan_search_scatter(topology: &Topology, query_seq: u64, rf: usize, shard_count: u32) -> ScatterPlan\npub async fn execute_scatter(plan: ScatterPlan, client: &C, req: SearchRequest) -> Vec\n```\n\n## Why\n\n`NodeClient` is the seam between `miroir-core` (pure, no network) and `miroir-proxy` (HTTP client). Injecting it via a trait means unit tests can provide a fake client; production binds `reqwest` via the trait impl in `miroir-proxy`.\n\n`plan_search_scatter` returns the exact shard→node mapping that Phase 2 hands to `execute_scatter`. Separating the plan from execution is what makes §13.20 `/explain` cheap — the explain path generates the plan and returns it without touching any node.\n\n## Plan Structure\n\n```rust\npub struct ScatterPlan {\n pub chosen_group: u32, // query_seq % RG\n pub target_shards: Vec, // for §13.4 narrowing — initially all 0..S\n pub shard_to_node: HashMap, // resolved covering set\n pub deadline_ms: u32,\n pub hedging_eligible: bool, // reserved for §13.2 Phase 5\n}\n```\n\n## Acceptance\n\n- [ ] Plan construction is pure — no async, no I/O\n- [ ] `execute_scatter` with a mock `NodeClient` returns one `ShardHitPage` per node in the plan\n- [ ] Partial-failure handling: a failed node surfaces as `Err` on that shard; `merge` downstream applies `unavailable_shard_policy`\n- [ ] Deadline propagation: when any node exceeds `deadline_ms`, the result includes a partial-response flag","status":"open","priority":1,"issue_type":"task","created_at":"2026-04-18T21:26:11.849030740Z","created_by":"coding","updated_at":"2026-04-18T21:26:21.594768429Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-1"],"dependencies":[{"issue_id":"miroir-cdo.5","depends_on_id":"miroir-cdo","type":"parent-child","created_at":"2026-04-18T21:26:11.849030740Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-cdo.5","depends_on_id":"miroir-cdo.3","type":"blocks","created_at":"2026-04-18T21:26:21.594739255Z","created_by":"coding","metadata":"{}","thread_id":""}]} -{"id":"miroir-cdo.6","title":"P1.6 Property + benchmark tests for router (criterion + proptest)","description":"## What\n\n- `proptest`-based property tests for rendezvous: determinism, minimal reshuffling bounds, uniformity at various (S, Ng, RF) sizes\n- `criterion` benchmarks targeting the plan §8 goals:\n - Rendezvous assignment (64 shards, 3 nodes, 10K docs) < 1 ms total\n - Merger (1000 hits, 3 shards) < 1 ms\n\n## Why\n\nPlan §8 sets both as gates (\"A PR that increases measured search latency by > 20% over the previous release triggers a review comment\"). Having them live from Phase 1 means regression prevention starts with the first router change.\n\n## Details\n\n- Benches go in `crates/miroir-core/benches/`\n- Property tests go in `crates/miroir-core/tests/` or as `#[cfg(test)]` modules with `proptest!` macros\n- Use a `HashSet` diff to measure reshuffling; assert `|diff| <= 2 * ceil(S / (N+1))` for a node-add event\n\n## Acceptance\n\n- [ ] `cargo bench -p miroir-core` runs all criterion benches and reports timing\n- [ ] `cargo test -p miroir-core` runs property tests with 1024 cases per property (default proptest config)\n- [ ] Phase 8 CI includes `cargo bench --no-run` to compile benches on every build","status":"in_progress","priority":1,"issue_type":"task","assignee":"bravo","created_at":"2026-04-18T21:26:11.875805587Z","created_by":"coding","updated_at":"2026-04-19T03:54:30.405832928Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["failure-count:1","phase-1"],"dependencies":[{"issue_id":"miroir-cdo.6","depends_on_id":"miroir-cdo","type":"parent-child","created_at":"2026-04-18T21:26:11.875805587Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-cdo.6","depends_on_id":"miroir-cdo.1","type":"blocks","created_at":"2026-04-18T21:26:21.615386498Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-cdo.6","depends_on_id":"miroir-cdo.4","type":"blocks","created_at":"2026-04-18T21:26:21.629878965Z","created_by":"coding","metadata":"{}","thread_id":""}]} +{"id":"miroir-cdo.5","title":"P1.5 scatter module: covering-set construction + dispatch trait","description":"## What\n\nImplement `miroir_core::scatter` with:\n```rust\npub trait NodeClient { /* HTTP calls to a Meilisearch node */ }\npub fn plan_search_scatter(topology: &Topology, query_seq: u64, rf: usize, shard_count: u32) -> ScatterPlan\npub async fn execute_scatter(plan: ScatterPlan, client: &C, req: SearchRequest) -> Vec\n```\n\n## Why\n\n`NodeClient` is the seam between `miroir-core` (pure, no network) and `miroir-proxy` (HTTP client). Injecting it via a trait means unit tests can provide a fake client; production binds `reqwest` via the trait impl in `miroir-proxy`.\n\n`plan_search_scatter` returns the exact shard→node mapping that Phase 2 hands to `execute_scatter`. Separating the plan from execution is what makes §13.20 `/explain` cheap — the explain path generates the plan and returns it without touching any node.\n\n## Plan Structure\n\n```rust\npub struct ScatterPlan {\n pub chosen_group: u32, // query_seq % RG\n pub target_shards: Vec, // for §13.4 narrowing — initially all 0..S\n pub shard_to_node: HashMap, // resolved covering set\n pub deadline_ms: u32,\n pub hedging_eligible: bool, // reserved for §13.2 Phase 5\n}\n```\n\n## Acceptance\n\n- [ ] Plan construction is pure — no async, no I/O\n- [ ] `execute_scatter` with a mock `NodeClient` returns one `ShardHitPage` per node in the plan\n- [ ] Partial-failure handling: a failed node surfaces as `Err` on that shard; `merge` downstream applies `unavailable_shard_policy`\n- [ ] Deadline propagation: when any node exceeds `deadline_ms`, the result includes a partial-response flag","status":"in_progress","priority":1,"issue_type":"task","assignee":"delta","created_at":"2026-04-18T21:26:11.849030740Z","created_by":"coding","updated_at":"2026-04-19T04:15:02.029925750Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-1"],"dependencies":[{"issue_id":"miroir-cdo.5","depends_on_id":"miroir-cdo","type":"parent-child","created_at":"2026-04-18T21:26:11.849030740Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-cdo.5","depends_on_id":"miroir-cdo.3","type":"blocks","created_at":"2026-04-18T21:26:21.594739255Z","created_by":"coding","metadata":"{}","thread_id":""}]} +{"id":"miroir-cdo.6","title":"P1.6 Property + benchmark tests for router (criterion + proptest)","description":"## What\n\n- `proptest`-based property tests for rendezvous: determinism, minimal reshuffling bounds, uniformity at various (S, Ng, RF) sizes\n- `criterion` benchmarks targeting the plan §8 goals:\n - Rendezvous assignment (64 shards, 3 nodes, 10K docs) < 1 ms total\n - Merger (1000 hits, 3 shards) < 1 ms\n\n## Why\n\nPlan §8 sets both as gates (\"A PR that increases measured search latency by > 20% over the previous release triggers a review comment\"). Having them live from Phase 1 means regression prevention starts with the first router change.\n\n## Details\n\n- Benches go in `crates/miroir-core/benches/`\n- Property tests go in `crates/miroir-core/tests/` or as `#[cfg(test)]` modules with `proptest!` macros\n- Use a `HashSet` diff to measure reshuffling; assert `|diff| <= 2 * ceil(S / (N+1))` for a node-add event\n\n## Acceptance\n\n- [ ] `cargo bench -p miroir-core` runs all criterion benches and reports timing\n- [ ] `cargo test -p miroir-core` runs property tests with 1024 cases per property (default proptest config)\n- [ ] Phase 8 CI includes `cargo bench --no-run` to compile benches on every build","status":"closed","priority":1,"issue_type":"task","assignee":"bravo","created_at":"2026-04-18T21:26:11.875805587Z","created_by":"coding","updated_at":"2026-04-19T03:59:44.913619571Z","closed_at":"2026-04-19T03:59:44.913255536Z","close_reason":"done","source_repo":".","compaction_level":0,"original_size":0,"labels":["failure-count:1","phase-1"],"dependencies":[{"issue_id":"miroir-cdo.6","depends_on_id":"miroir-cdo","type":"parent-child","created_at":"2026-04-18T21:26:11.875805587Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-cdo.6","depends_on_id":"miroir-cdo.1","type":"blocks","created_at":"2026-04-18T21:26:21.615386498Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-cdo.6","depends_on_id":"miroir-cdo.4","type":"blocks","created_at":"2026-04-18T21:26:21.629878965Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-m9q","title":"Phase 6 — Horizontal Scaling + HPA (§14)","description":"## Phase 6 Epic — Horizontal Scaling + HPA\n\nDelivers the §14 promise: **fixed per-pod envelope (2 vCPU / 3.75 GB), scale out never up**. Makes the request path strictly stateless and partitions background work across pods via one of three coordination modes.\n\n## Why This Is A Phase\n\nPlan §1 principle 8 + plan §14 are the architectural spine. Phase 2's proxy already runs on one pod; this phase makes N pods coherent. Every §13 feature's \"Scaling mode\" column in plan §14.6 gets wired up here — Phase 5's implementations have to already understand they'll run inside one of the three modes.\n\n## Scope\n\n**14.1–14.3 — Per-pod envelope**\n- `resources.requests` = 500m / 1Gi; `resources.limits` = 2000m / 3584Mi\n- Per-feature memory row validated against plan §14.2 budget\n- CPU budget per plan §14.3 (~3 kQPS/pod small responses)\n\n**14.4 — Request path HPA**\n- `autoscaling/v2` HPA on CPU 70%, memory 75%, `miroir_requests_in_flight` as `type: Pods` `AverageValue: 500`, `miroir_background_queue_depth` as `type: External` `Value: 10` (plan §14.4 note on metric types)\n- `prometheus-adapter` as a chart prerequisite when HPA is enabled\n- `values.schema.json` rejects `hpa.enabled=true` without `replicas >= 2 AND taskStore.backend = redis`\n\n**14.5 — Background coordination modes**\n- **Mode A — Shard-partitioned ownership** (anti-entropy §13.8, settings-drift check §13.5, task registry pruner, TTL sweeper §13.14, canary runner §13.18)\n- **Mode B — Leader-only lease** (reshard coordinator §13.1, rebalancer Phase 4, alias flip serializer §13.7, two-phase settings broadcast §13.5, ILM evaluator §13.17, scoped-key rotation leader §13.21)\n- **Mode C — Work-queued chunked jobs** (streaming dump import §13.9, large reshard backfill §13.1)\n- **Peer discovery** via headless Service (`miroir-headless`) + Downward API `POD_NAME`/`POD_IP`, 15s SRV refresh\n- Rendezvous over peer set for Mode A; `SET NX EX 10` renewed every 3s for Mode B\n- Job lease heartbeat every 10s with 30s timeout for Mode C\n\n**14.6 — Per-feature scaling-mode wiring** — 21 rows, each must compile against the chosen mode\n\n**14.7 — Deployment sizing matrix** — ops documentation/tooling surfacing orchestrator pod count vs. corpus × QPS tiers\n\n**14.8 — Resource-aware defaults** — every config knob's default sized for the envelope\n\n**14.9 — Resource-pressure metrics + alerts** — `miroir_memory_pressure`, `miroir_cpu_throttled_seconds_total`, `miroir_request_queue_depth`, `miroir_background_queue_depth{job_type}`, `miroir_peer_pod_count`, `miroir_leader`, `miroir_owned_shards_count`; PrometheusRule alerts\n\n**14.10 — Vertical-scaling escape valve** — documented as supported but not recommended; no implementation work, just docs\n\n## Definition of Done\n\n- [ ] Multi-pod deployment (replicas=3) — every pod independently serves requests with identical routing\n- [ ] Kill one of three pods mid-traffic — zero client-visible errors beyond retry budget (plan §8 chaos)\n- [ ] Mode A test: spin up 3 pods, anti-entropy runs exactly once per shard per interval cluster-wide\n- [ ] Mode B test: start 3 pods, exactly one holds the reshard lease at any given instant; killing it promotes another within `lease_ttl_s`\n- [ ] Mode C test: submit a 10GB dump; chunks distribute across 3 pods and HPA reacts to `miroir_background_queue_depth`\n- [ ] All §14.2 memory rows fit within 3584 MiB under realistic steady-state load\n- [ ] All §14.9 alerts present in the PrometheusRule manifest and trip under induced fault","status":"open","priority":0,"issue_type":"epic","created_at":"2026-04-18T21:21:13.549727274Z","created_by":"coding","updated_at":"2026-04-18T21:23:08.657411091Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase","phase-6"],"dependencies":[{"issue_id":"miroir-m9q","depends_on_id":"miroir-mkk","type":"blocks","created_at":"2026-04-18T21:23:08.657393466Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-m9q","depends_on_id":"miroir-r3j","type":"blocks","created_at":"2026-04-18T21:23:08.646285774Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-m9q.1","title":"P6.1 Pod resource envelope + limits/requests","description":"## What\n\nImplement pod sizing per plan §14.1 + §14.2 + §14.8:\n- Helm `deployment.yaml` sets `resources.requests = {cpu: 500m, memory: 1Gi}`\n- `resources.limits = {cpu: 2000m, memory: 3584Mi}` (plan §14.8: \"leaves headroom under 3.75 GB node limit\")\n- Config defaults sized for the envelope (§14.8 full YAML)\n\n## Why\n\nPlan §1 principle 8: \"Fixed per-pod resource envelope (2 vCPU / 3.75 GB). When aggregate workload exceeds this envelope, scale **horizontally** by adding pods, never vertically beyond the envelope.\"\n\nWithout enforced limits, a runaway per-feature cache (e.g., session_pinning.max_sessions set unreasonably high) can push a pod into OOM-kill territory, inviting HPA to spin up replacements instead of surfacing the misconfiguration.\n\n## Details\n\n**Per-feature memory rows** (plan §14.2) each need their defaults:\n\n| Component | Budget | Knob |\n|-----------|--------|------|\n| Runtime + axum | 80 MB | — |\n| HTTP/2 pools | 50 MB | `connection_pool_per_node` |\n| Req/resp buffers | 200 MB | `server.max_body_bytes`, `max_concurrent_requests` |\n| Task registry | 100 MB | `task_registry.cache_size` |\n| Idempotency | 100 MB | `idempotency.max_cached_keys` |\n| Sessions | 50 MB | `session_pinning.max_sessions` |\n| Coalescing | 50 MB | `query_coalescing.max_subscribers` |\n| Router + EWMA | 20 MB | fixed |\n| Plan cache | 20 MB | fixed |\n| Alias table | 10 MB | fixed |\n| Metrics | 50 MB | fixed |\n| Dump import buffer | 128 MB | `dump_import.memory_buffer_bytes` (only during import) |\n| Anti-entropy | 128 MB | `anti_entropy.max_read_concurrency` (only during pass) |\n| Multi-search scratch | 5 MB | `multi_search.max_queries_per_batch` |\n| Vector over-fetch | 30 MB | `vector_search.over_fetch_factor` |\n| CDC buffer | 64 MB | `cdc.buffer.memory_bytes` |\n| TTL cursor | 5 MB | — |\n| Tenant map LRU | 20 MB | `tenant_affinity.mode` |\n| Shadow tee | ~50 MB | `shadow.targets[].sample_rate` |\n| Canary state | 20 MB | `canary_runner.run_history_per_canary` |\n| Admin UI assets | 10 MB | fixed |\n| Explain cache | 10 MB | fixed |\n| Search UI assets | 10 MB | fixed |\n| Search UI rate limiter | 20 MB (Redis-backed) | — |\n| Allocator overhead | 800 MB | — |\n| **Steady-state total** | **~1.2 GB** | |\n\n**Regression budget**: add a CI check (Phase 9) that flags when steady-state under synthetic load exceeds 1.7 GB.\n\n## Acceptance\n\n- [ ] Helm rendered manifest matches the requests/limits above\n- [ ] Idle pod < 300 MB RSS on a 3-node cluster\n- [ ] Steady-state (1 kQPS across 3 Miroir pods) under 1.2 GB per pod\n- [ ] One heavy background job (dump import) adds < 500 MB to that pod's total","status":"open","priority":0,"issue_type":"task","created_at":"2026-04-18T21:40:30.562386308Z","created_by":"coding","updated_at":"2026-04-18T21:40:30.562386308Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-6"],"dependencies":[{"issue_id":"miroir-m9q.1","depends_on_id":"miroir-m9q","type":"parent-child","created_at":"2026-04-18T21:40:30.562386308Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-m9q.2","title":"P6.2 Peer discovery via headless Service + Downward API","description":"## What\n\nImplement peer discovery per plan §14.5:\n- Helm `miroir-headless.yaml` — a headless Service with label selector on the Deployment\n- Deployment: Downward API injects `POD_NAME` + `POD_IP` as env vars\n- Each pod refreshes peer set every `peer_discovery.refresh_interval_s` (default 15s) via SRV lookup against `miroir-headless..svc.cluster.local`\n- Peer set is `Vec` where `PeerId = POD_NAME` — used by rendezvous for Mode A ownership\n\n## Why\n\nPlan §14.5: \"All three modes rely on the current peer set.\" Mode A rendezvous partitions by peer × work-item; Mode B leader election picks one peer; Mode C claim lease is by peer. Without a peer set, we'd need either a central registry (new dependency) or K8s API calls (requires RBAC + API server load).\n\nSRV-based discovery is zero-config — if headless Service exists, it just works.\n\n## Details\n\n**Manifest** (plan §14.5 + §6):\n```yaml\napiVersion: v1\nkind: Service\nmetadata:\n name: miroir-headless\nspec:\n clusterIP: None\n selector:\n app.kubernetes.io/name: miroir\n ports: [...]\n```\n\n**Env injection** (plan §14.5 \"Peer discovery\"):\n```yaml\nenv:\n- name: POD_NAME\n valueFrom: { fieldRef: { fieldPath: metadata.name } }\n- name: POD_IP\n valueFrom: { fieldRef: { fieldPath: status.podIP } }\n```\n\n**Rust side**:\n```rust\npub struct PeerSet { pub peers: Vec, pub refreshed_at: Instant }\npub async fn refresh_peers(service: &str) -> PeerSet { /* SRV lookup */ }\n```\n\n**Transient double-work** is acceptable (plan §14.5): \"15-second discovery window is harmless: anti-entropy is idempotent, settings-repair is idempotent.\"\n\n## Acceptance\n\n- [ ] 3-pod deployment: each pod sees all 3 peer names within 30s of last pod ready\n- [ ] Scale 3→5: new peers discovered within `refresh_interval_s × 2`\n- [ ] Pod eviction: crashed pod drops from peer set within `refresh_interval_s × 2`\n- [ ] `miroir_peer_pod_count` gauge matches `kube_deployment_status_replicas_ready`","status":"open","priority":0,"issue_type":"task","created_at":"2026-04-18T21:40:30.582753605Z","created_by":"coding","updated_at":"2026-04-18T21:40:30.582753605Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-6"],"dependencies":[{"issue_id":"miroir-m9q.2","depends_on_id":"miroir-m9q","type":"parent-child","created_at":"2026-04-18T21:40:30.582753605Z","created_by":"coding","metadata":"{}","thread_id":""}]} @@ -52,7 +52,7 @@ {"id":"miroir-mkk.4","title":"P4.4 Replica group addition: initializing → active","description":"## What\n\nImplement the \"Adding a new replica group\" flow from plan §2:\n1. Provision new nodes; assign `replica_group: G_new` in config\n2. Mark new group `initializing`; queries NOT routed here\n3. Background sync: for each shard, copy all docs from **any** healthy existing group to the new group's nodes via `filter=_miroir_shard={id}` pagination; new inbound writes already fan out to the new group immediately\n4. When all shards synced, mark group `active` — queries begin routing in round-robin\n5. Existing groups continue serving queries throughout (zero read interruption)\n\n## Why\n\nPlan §2 \"Adding a new replica group (throughput scaling)\": adding a group multiplies query capacity without touching existing groups' data. This is the primary \"we need more search QPS\" lever. Unlike intra-group rebalance which moves a subset, group-add **copies** every shard to the new group — so the I/O is proportional to total corpus size, not `1/(Ng+1)`.\n\n## Details\n\n**Source group selection**: round-robin across existing `active` groups to spread read load during sync. Per-shard picks a different source so one group isn't hammered.\n\n**Write fan-out during sync**: new group already receives writes from step 3 onward. This is the durability guarantee — only the backfill window of historical data is transient.\n\n**Progress tracking**: per-shard cursor in `jobs` table; can be paused/resumed per Phase 6 Mode C.\n\n**Verification before `active`**: `GET /indexes/{uid}/stats` against new group → docs count within 0.1% of source group (allows for writes landing during sync). If higher variance, delay the flip and investigate.\n\n## Acceptance\n\n- [ ] Integration test: RG=1 → RG=2; during sync, query throughput on original group unchanged (no regression)\n- [ ] After `active`, queries distribute round-robin between the two groups (verified via per-group metrics)\n- [ ] Mid-sync write test: 100 writes landing during the backfill window are all present on both groups when sync completes\n- [ ] Failed sync (source group becomes unavailable mid-copy) pauses without corrupting new group; resumes when source returns","status":"open","priority":0,"issue_type":"task","created_at":"2026-04-18T21:31:43.859158013Z","created_by":"coding","updated_at":"2026-04-18T21:31:48.961616587Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-4"],"dependencies":[{"issue_id":"miroir-mkk.4","depends_on_id":"miroir-mkk","type":"parent-child","created_at":"2026-04-18T21:31:43.859158013Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-mkk.4","depends_on_id":"miroir-mkk.1","type":"blocks","created_at":"2026-04-18T21:31:48.961576914Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-mkk.5","title":"P4.5 Group removal + unplanned node failure","description":"## What\n\nTwo related flows from plan §2:\n\n**Removing a replica group** (decommission a query pool):\n1. Mark group `draining` — queries stop routing immediately\n2. Nodes can be decommissioned; no data migration needed (other groups hold the docs)\n3. Remove nodes from config; operator deletes pods + PVCs\n\n**Unplanned node failure**:\n1. Health check detects failure → mark `failed`, stop routing writes to it\n2. If RF > 1 within the group: surviving replicas serve reads — no immediate migration\n3. For reads: if failed node's shards have no intra-group RF replica, fall back to a healthy group for those shards\n4. Schedule background replication to restore RF within the group; degrade to cross-group fallback until restored\n\n## Why\n\nPlan §2: \"Changes to one group do not affect other groups' data or query routing.\" Group-removal is instant (no data movement) — lets operators shed throughput capacity without a migration window. Unplanned node failure is the most time-sensitive case: readers must not see errors; RF-restore runs in the background.\n\n## Details\n\n**Group-removal preconditions**: refuse to remove a group if it's the last group holding a shard (would be data loss). Require `--force` and document the risk.\n\n**Failure detection**: plan §4 config:\n```yaml\nhealth:\n interval_ms: 5000\n timeout_ms: 2000\n unhealthy_threshold: 3 # 3 consecutive failures → mark degraded\n recovery_threshold: 2 # 2 consecutive OKs → mark healthy again\n```\n\n**Cross-group fallback**: Phase 1 `covering_set` already deterministic per-request; the fallback is a per-shard \"if intra-group has none, check other groups\" decision **inside** the scatter planner (Phase 2).\n\n**RF-restore**: similar to P4.2 node addition but for an existing node that lost its data — re-run `_miroir_shard` filter migration from the best intra-group source.\n\n## Acceptance\n\n- [ ] Remove a group with healthy peer groups → queries route away within one `query_seq` tick; no read errors\n- [ ] `--force`-remove the last group holding shard S → loud warning; operator must re-type the index UID to confirm\n- [ ] RF=2 group with 1 node killed → reads succeed on remaining replica; `X-Miroir-Degraded` absent\n- [ ] RF=1 group with 1 node killed → cross-group fallback kicks in; `X-Miroir-Degraded` absent if fallback succeeds\n- [ ] Restored node re-hydrates from a peer replica within its group; `miroir_rebalance_in_progress` transitions 0→1→0","status":"open","priority":0,"issue_type":"task","created_at":"2026-04-18T21:31:43.887649468Z","created_by":"coding","updated_at":"2026-04-18T21:31:48.981354074Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-4"],"dependencies":[{"issue_id":"miroir-mkk.5","depends_on_id":"miroir-mkk","type":"parent-child","created_at":"2026-04-18T21:31:43.887649468Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-mkk.5","depends_on_id":"miroir-mkk.1","type":"blocks","created_at":"2026-04-18T21:31:48.981335608Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-mkk.6","title":"P4.6 Admin API for topology ops: /_miroir/nodes + /_miroir/rebalance","description":"## What\n\nPlan §4 admin API endpoints for topology (wrap the rebalancer flows):\n- `POST /_miroir/nodes` — add node (P4.2)\n- `DELETE /_miroir/nodes/{id}` — drain + remove\n- `POST /_miroir/nodes/{id}/drain` — drain only (P4.3, plan §6 \"Scaling\" scale-down)\n- `POST /_miroir/rebalance` — manually trigger rebalance (e.g., after config-only topology tweak)\n- `GET /_miroir/rebalance/status` — current progress; returned shape includes per-shard phase + `miroir_task_id` for each migration batch\n\n## Why\n\nThese endpoints are the **operator surface**. Everything in §11 \"Common operations with miroir-ctl\" maps to these; the Admin UI §13.19 topology tab is a visual wrapper around the same endpoints. Keeping them REST-shaped rather than ad-hoc makes `miroir-ctl` a thin wrapper and the Admin UI trivial.\n\n## Details\n\n**Body shape for `POST /_miroir/nodes`**:\n```json\n{\n \"id\": \"meili-4\",\n \"address\": \"http://meili-4.search.svc:7700\",\n \"replica_group\": 0\n}\n```\n\n**Response**: `202 Accepted` with a `miroir_task_id` (the rebalance is async). Client polls `/tasks/{mtask}` for terminal status.\n\n**`GET /_miroir/rebalance/status`** returns:\n```json\n{\n \"in_progress\": true,\n \"triggered_by\": \"POST /_miroir/nodes\",\n \"operation_id\": \"reb-1234\",\n \"started_at\": \"2026-04-18T20:00:00Z\",\n \"phases\": [\n {\"shard\": 12, \"state\": \"MigrationInProgress\", \"pct_complete\": 42, \"source\": \"meili-0\", \"destination\": \"meili-4\"},\n ...\n ],\n \"overall_pct_complete\": 38\n}\n```\n\n**Authentication**: admin-key only (plan §5 bearer dispatch rule 2).\n\n## Acceptance\n\n- [ ] `curl -X POST -H \"Authorization: Bearer $ADMIN_KEY\" .../_miroir/nodes -d '{\"id\":\"meili-4\",\"address\":\"http://...\",\"replica_group\":0}'` returns 202 + miroir_task_id\n- [ ] Invalid `replica_group` (not present in current topology) → 400 with clear message\n- [ ] `POST /_miroir/rebalance` without prior topology change returns 200 and a no-op task (already balanced)\n- [ ] `GET .../rebalance/status` during a rebalance reflects per-shard state in near real time (< 5s staleness)","status":"open","priority":1,"issue_type":"task","created_at":"2026-04-18T21:31:43.916640224Z","created_by":"coding","updated_at":"2026-04-18T21:31:49.023343521Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-4"],"dependencies":[{"issue_id":"miroir-mkk.6","depends_on_id":"miroir-mkk","type":"parent-child","created_at":"2026-04-18T21:31:43.916640224Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-mkk.6","depends_on_id":"miroir-mkk.2","type":"blocks","created_at":"2026-04-18T21:31:48.997646112Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-mkk.6","depends_on_id":"miroir-mkk.3","type":"blocks","created_at":"2026-04-18T21:31:49.023268953Z","created_by":"coding","metadata":"{}","thread_id":""}]} -{"id":"miroir-nsu","title":"RRF Merging Implementation","description":"## Genesis Bead\nTied to plan: /home/coding/miroir/docs/plan/plan.md\n\n## Overview\nImplement Reciprocal Rank Fusion (RRF) for result merging in Miroir to address cross-shard score comparability issues identified in score-normalization-at-scale research.\n\n## Research Context\nExperiments (miroir-zc2.4) showed:\n- Average Kendall tau: 0.79 vs. 0.95 threshold (FAIL)\n- Common-term queries: τ = 0.15 (catastrophic)\n- RRF is the recommended solution (no preflight, production-proven)\n\n## Progress\n- [ ] Phase 1: Update Merger trait and stub\n- [ ] Phase 2: Implement RRF scoring\n- [ ] Phase 3: Benchmark against corpus\n- [ ] Phase 4: Integration with scatter-gather","status":"closed","priority":2,"issue_type":"genesis","assignee":"delta","created_at":"2026-04-19T03:56:08.747340056Z","created_by":"coding","updated_at":"2026-04-19T03:57:28.798234164Z","closed_at":"2026-04-19T03:57:28.798174928Z","close_reason":"Phase 1 implementation pending - see miroir-zc2.4 research for context","source_repo":".","compaction_level":0,"original_size":0} +{"id":"miroir-nsu","title":"RRF Merging Implementation","description":"## Genesis Bead\nTied to plan: /home/coding/miroir/docs/plan/plan.md\n\n## Overview\nImplement Reciprocal Rank Fusion (RRF) for result merging in Miroir to address cross-shard score comparability issues identified in score-normalization-at-scale research.\n\n## Research Context\nExperiments (miroir-zc2.4) showed:\n- Average Kendall tau: 0.79 vs. 0.95 threshold (FAIL)\n- Common-term queries: τ = 0.15 (catastrophic)\n- RRF is the recommended solution (no preflight, production-proven)\n\n## Progress\n- [ ] Phase 1: Update Merger trait and stub\n- [ ] Phase 2: Implement RRF scoring\n- [ ] Phase 3: Benchmark against corpus\n- [ ] Phase 4: Integration with scatter-gather","status":"in_progress","priority":2,"issue_type":"genesis","assignee":"charlie","created_at":"2026-04-19T03:56:08.747340056Z","created_by":"coding","updated_at":"2026-04-19T04:05:00.125829432Z","close_reason":"Phase 1 implementation pending - see miroir-zc2.4 research for context","source_repo":".","compaction_level":0,"original_size":0,"labels":["failure-count:1"]} {"id":"miroir-qjt","title":"Phase 8 — Deployment + CI (§6, §7)","description":"## Phase 8 Epic — Deployment + CI\n\nPackages Miroir: static musl binary → scratch Docker image → Helm chart → ArgoCD Application → Argo Workflows CI template (iad-ci). At phase end, `git tag v0.1.0 && git push origin v0.1.0` produces a signed GitHub Release with both `miroir-proxy` and `miroir-ctl`, a ghcr.io image, and a chart version bump.\n\n## Why This Phase (and Why It Depends On Phase 2)\n\nPlan §6 (Deployment) + §7 (CI/CD) turn the binary into a thing operators can actually install. Helm defaults (plan §6 \"Dev vs. production defaults\") encode the \"single-pod dev, multi-pod prod\" story from Phase 6. ArgoCD app + Argo Workflow template live in `jedarden/declarative-config` (see `/home/coding/CLAUDE.md`) — standard pattern across the fleet.\n\n## Scope\n\n**Dockerfile** (plan §7)\n- `FROM scratch` + static `miroir-proxy` binary\n- Expose 7700 + 9090\n- OCI labels: source, version, revision, licenses=MIT\n- Target size < 15 MB compressed\n\n**Cargo musl build** — `x86_64-unknown-linux-musl` target; `cargo build --release` for both `-p miroir-proxy` and `-p miroir-ctl`\n\n**Argo WorkflowTemplate `miroir-ci`** (plan §7) at `jedarden/declarative-config → k8s/iad-ci/argo-workflows/miroir-ci.yaml`\n- DAG: checkout → lint → test → build-binary → docker-build (tag-gated) → github-release (tag-gated)\n- `cargo fmt --check`, `cargo clippy -D warnings`, `cargo test --all`, musl build\n- Kaniko for image push to `ghcr.io/jedarden/miroir:`, `:latest`, `:`, `:`\n- `gh release create` with both binaries + sha256\n\n**Helm chart `charts/miroir/`** (plan §6)\n- Templates: deployment, service, headless, configmap, secret, HPA, optional PVC (CDC), StatefulSet for meilisearch, meilisearch service, optional Redis deployment, serviceaccount\n- `values.yaml` with dev defaults (replicas=1, SQLite, RF=1, RG=1, HPA off)\n- `values.schema.json` that rejects:\n - `miroir.replicas > 1` with `taskStore.backend: sqlite`\n - `miroir.hpa.enabled: true` without `replicas >= 2 && taskStore.backend: redis`\n - `search_ui.rate_limit.backend: local` when `miroir.replicas > 1`\n - Admin login rate-limit local backend in HA\n - `search_ui.scoped_key_rotate_before_expiry_days >= scoped_key_max_age_days`\n- `_helpers.tpl` for fully-qualified StatefulSet DNS node addresses (plan §6 ConfigMap)\n- `NOTES.txt` with next-step pointers\n\n**ArgoCD Application** (plan §6) — `k8s//miroir//` path in `jedarden/declarative-config`, automated sync + prune + selfHeal\n\n**Release mechanics** (plan §7)\n- `CHANGELOG.md` Keep a Changelog format; CI extracts section for GitHub release notes\n- `Cargo.toml` workspace version bumped before tag\n- `Chart.yaml` `appVersion` bumped before tag\n- Tag format: `v[0-9]+.[0-9]+.[0-9]+*`\n\n## Infrastructure Reference\n\n- Registry: `ghcr.io/jedarden/miroir`\n- Helm chart OCI: `ghcr.io/jedarden/charts/miroir`\n- Pages: `https://jedarden.github.io/miroir`\n- CI secrets on iad-ci: `ghcr-credentials` (argo-workflows/.dockerconfigjson), `github-token` (argo-workflows/token)\n- Argo UI: `https://argo-ci.ardenone.com`\n\n## Definition of Done\n\n- [ ] `kubectl --kubeconfig=$HOME/.kube/iad-ci.kubeconfig apply -f workflow.yaml` completes the full CI pipeline on `main` within ~10 min\n- [ ] Pushing tag `v0.1.0-rc.1` produces a ghcr.io image, a GitHub pre-release, and does NOT update `latest`/float tags\n- [ ] `helm install search charts/miroir --namespace search --wait` stands up a working single-pod cluster\n- [ ] `values.schema.json` rejections tested via `helm lint --strict` with mutating values files\n- [ ] Final image ≤ 15 MB compressed\n- [ ] ArgoCD app syncs cleanly against ardenone-manager read-only proxy","status":"open","priority":0,"issue_type":"epic","created_at":"2026-04-18T21:21:13.608558775Z","created_by":"coding","updated_at":"2026-04-18T21:23:08.690462028Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase","phase-8"],"dependencies":[{"issue_id":"miroir-qjt","depends_on_id":"miroir-9dj","type":"blocks","created_at":"2026-04-18T21:23:08.690406249Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-qjt.1","title":"P8.1 Dockerfile: scratch + static musl miroir-proxy","description":"## What\n\nShip the `Dockerfile` from plan §7:\n```dockerfile\nFROM scratch\nCOPY miroir-proxy-linux-amd64 /miroir-proxy\nEXPOSE 7700 9090\nENTRYPOINT [\"/miroir-proxy\"]\nCMD [\"--config\", \"/etc/miroir/config.yaml\"]\n```\n\nOCI labels (plan §12):\n```\norg.opencontainers.image.source=https://github.com/jedarden/miroir\norg.opencontainers.image.version=\norg.opencontainers.image.revision=\norg.opencontainers.image.licenses=MIT\n```\n\nTarget: compressed image < 15 MB.\n\n## Why\n\nPlan §1 principle 6 + §12: \"scratch base, no libc. Zero OS packages, no shell.\" This is the smallest possible attack surface and the fastest possible pull (one layer, tiny). Makes trivial deploys feasible on edge clusters.\n\n## Details\n\n**Musl build step** (plan §7 `cargo-build` template):\n```bash\napt-get install -qy musl-tools\nrustup target add x86_64-unknown-linux-musl\ncargo build --release --target x86_64-unknown-linux-musl -p miroir-proxy\ncargo build --release --target x86_64-unknown-linux-musl -p miroir-ctl\nsha256sum miroir-proxy-linux-amd64 > miroir-proxy-linux-amd64.sha256\n```\n\n**Layers**: COPY the static binary directly from `/workspace/artifacts/` into `/miroir-proxy` in the scratch image.\n\n**Config mount**: `/etc/miroir/config.yaml` via ConfigMap mount (Helm chart).\n\n**No shell = no `docker exec -it` debugging** — intentional. Debug by logs + metrics + `kubectl describe` only. Operators who need shell can run a sidecar.\n\n## Acceptance\n\n- [ ] `docker build .` on an artifact-equipped workspace produces an image < 15 MB compressed\n- [ ] `docker run --help` returns clap help (binary works from scratch base)\n- [ ] Image labels contain all 4 OCI labels with correct values\n- [ ] Static linkage: `ldd` against the extracted binary prints \"not a dynamic executable\"","status":"open","priority":0,"issue_type":"task","created_at":"2026-04-18T21:43:56.826575101Z","created_by":"coding","updated_at":"2026-04-18T21:43:56.826575101Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-8"],"dependencies":[{"issue_id":"miroir-qjt.1","depends_on_id":"miroir-qjt","type":"parent-child","created_at":"2026-04-18T21:43:56.826575101Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-qjt.2","title":"P8.2 Helm chart structure + values.yaml dev defaults","description":"## What\n\nScaffold `charts/miroir/` per plan §6:\n```\ncharts/miroir/\n├── Chart.yaml\n├── values.yaml\n├── values.schema.json\n├── templates/\n│ ├── _helpers.tpl\n│ ├── miroir-deployment.yaml\n│ ├── miroir-service.yaml\n│ ├── miroir-headless.yaml\n│ ├── miroir-configmap.yaml\n│ ├── miroir-secret.yaml\n│ ├── miroir-hpa.yaml\n│ ├── miroir-pvc.yaml (optional; rendered only when cdc.buffer.primary=pvc or overflow=pvc)\n│ ├── meilisearch-statefulset.yaml\n│ ├── meilisearch-service.yaml\n│ ├── redis-deployment.yaml (when taskStore.backend=redis)\n│ ├── serviceaccount.yaml\n│ └── NOTES.txt\n└── tests/connection-test.yaml\n```\n\n**values.yaml dev defaults** (plan §6 \"Dev vs. production defaults\"):\n- `miroir.replicas: 1`\n- `miroir.shards: 64`\n- `miroir.replicationFactor: 1`\n- `miroir.replicaGroups: 1`\n- `miroir.hpa.enabled: false`\n- `meilisearch.replicas: 2` (1 group × 2 nodes)\n- `meilisearch.nodesPerGroup: 2`\n- `redis.enabled: false`\n- `taskStore.backend: sqlite`\n\n**Production override guidance**: callout in NOTES.txt pointing at the prod-override values (replicas=2+, RF=2, RG=2, redis+hpa both on).\n\n## Why\n\nPlan §6: \"These defaults boot a working single-pod install for evaluation and CI. For production, override to...\" Clear dev/prod split so a new user can `helm install` and get *something working*, while a production user has a clear upgrade path.\n\n## Details\n\n**Chart.yaml**:\n```yaml\napiVersion: v2\nname: miroir\nversion: 0.1.0\nappVersion: 0.1.0\ndescription: RAID-like sharding and HA for Meilisearch Community Edition\nkeywords: [search, meilisearch, sharding, kubernetes]\nhome: https://github.com/jedarden/miroir\nsources: [https://github.com/jedarden/miroir]\n```\n\n**`_helpers.tpl`** — generates the node list DNS (plan §6 ConfigMap): `http://-meili-.-meili-headless..svc.cluster.local:7700`.\n\n**Chart testing**: `charts/miroir/tests/` with `helm-testing` pod that runs `curl localhost:7700/health`.\n\n## Acceptance\n\n- [ ] `helm lint charts/miroir` passes\n- [ ] `helm install test charts/miroir --dry-run --debug` renders all templates without error\n- [ ] `helm install test charts/miroir --wait` stands up a working single-pod cluster with defaults\n- [ ] `helm test test` passes (the connection test pod curl-succeeds on /health)","status":"open","priority":0,"issue_type":"task","created_at":"2026-04-18T21:43:56.872715171Z","created_by":"coding","updated_at":"2026-04-18T21:44:01.416767778Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-8"],"dependencies":[{"issue_id":"miroir-qjt.2","depends_on_id":"miroir-qjt","type":"parent-child","created_at":"2026-04-18T21:43:56.872715171Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-qjt.2","depends_on_id":"miroir-qjt.1","type":"blocks","created_at":"2026-04-18T21:44:01.416733808Z","created_by":"coding","metadata":"{}","thread_id":""}]} @@ -71,11 +71,11 @@ {"id":"miroir-qon.7","title":"P0.7 CI smoke: fmt/clippy/test on push","description":"## What\n\nStand up a minimal CI path — just enough to run `cargo fmt --check`, `cargo clippy -D warnings`, `cargo test --all` — on every push to `main`. This is the earliest viable version of the full `miroir-ci` Argo Workflow template that Phase 8 ships.\n\n## Why\n\nIf CI only lands in Phase 8, Phases 1–7 accumulate quietly-broken code. Plan §7 makes fmt/clippy/test the first three steps of the pipeline on purpose; shipping those now (on iad-ci via a minimal WorkflowTemplate) catches regressions on every commit.\n\n## Details\n\n- Create a stripped-down `miroir-ci-smoke` WorkflowTemplate in `jedarden/declarative-config → k8s/iad-ci/argo-workflows/` that runs only checkout + lint + test\n- Trigger on push to `main` (initially operators kick manually; webhook automation lands in Phase 8)\n- Image: `rust:1.87-slim` to match the full CI template\n- No musl target yet (that's Phase 8); just `cargo test --all`\n\n## Acceptance\n\n- [ ] Manual submit: `kubectl --kubeconfig=$HOME/.kube/iad-ci.kubeconfig create -f - << 1` + `taskStore.backend: sqlite`). Getting the Redis keyspace right now is cheaper than retrofitting.\n\n## Scope — the 14 tables and 14 Redis keyspaces (plan §4)\n\n1. `tasks` — Miroir task registry (miroir_id → node_tasks map + status)\n2. `node_settings_version` — per-(index, node) settings freshness (for §13.5 + `X-Miroir-Min-Settings-Version`)\n3. `aliases` — single-target + multi-target (`kind`, `current_uid`, `target_uids`, `version`, `history`)\n4. `sessions` — read-your-writes session pins (§13.6)\n5. `idempotency_cache` — write dedup (§13.10)\n6. `jobs` — work-queued background jobs (§14.5 Mode C)\n7. `leader_lease` — singleton-coordinator lease (§14.5 Mode B; SQLite advisory lock substitute for single-replica)\n8. `canaries` — canary definitions (§13.18)\n9. `canary_runs` — canary run history (§13.18)\n10. `cdc_cursors` — per-(sink, index) CDC cursor (§13.13)\n11. `tenant_map` — API-key → tenant mapping (§13.15 `api_key` mode)\n12. `rollover_policies` — ILM rollover policies (§13.17)\n13. `search_ui_config` — per-index search-UI config (§13.21)\n14. `admin_sessions` — Admin UI session registry (§13.19)\n\n## Redis keyspace mirror (plan §4 \"Redis mode (HA)\")\n\nEvery table above mapped to a hash + `_index` secondary set so list-wide queries are O(cardinality) without `SCAN`. Plus:\n\n- `miroir:ratelimit:searchui:` (EXPIRE `search_ui.rate_limit.redis_ttl_s`)\n- `miroir:ratelimit:adminlogin:` + `miroir:ratelimit:adminlogin:backoff:` (§13.19, required in HA)\n- `miroir:cdc:overflow:` (1 GiB per sink default)\n- `miroir:search_ui_scoped_key:` + `miroir:search_ui_scoped_key_observed::` (§13.21 rotation coordination)\n- `miroir:admin_session:revoked` Pub/Sub channel for instant logout propagation\n\n## Definition of Done\n\n- [ ] `rusqlite`-backed store initializing every table idempotently at startup\n- [ ] Redis-backed store mirrors the same API (trait `TaskStore` or equivalent), chosen at runtime by `task_store.backend`\n- [ ] Migrations/versioning: schema version recorded in a `schema_version` row so future upgrades detect incompatibility loudly\n- [ ] Property tests: `(insert, get)` round-trip + `(upsert, list)` semantics on SQLite backend\n- [ ] Integration test: restart an orchestrator pod mid-task-poll; task status survives (simulate by opening/closing the SQLite handle between operations)\n- [ ] Redis-backend integration test (`testcontainers` or similar) exercising leases, idempotency dedup, and alias history\n- [ ] `miroir:tasks:_index`-style iteration actually used for list endpoints (no `SCAN`)\n- [ ] `taskStore.backend: redis` + `replicas > 1` enforced by Helm `values.schema.json` (verified with `helm lint`)\n- [ ] Plan §14.7 Redis memory accounting validated against a representative load (bucket count × average size)","status":"open","priority":0,"issue_type":"epic","created_at":"2026-04-18T21:19:53.974489140Z","created_by":"coding","updated_at":"2026-04-18T21:23:08.581853353Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase","phase-3"],"dependencies":[{"issue_id":"miroir-r3j","depends_on_id":"miroir-qon","type":"blocks","created_at":"2026-04-18T21:23:08.581818683Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-r3j.1","title":"P3.1 TaskStore trait + SQLite backend (tables 1-7)","description":"## What\n\nDefine the `TaskStore` trait in `miroir-core` and implement the SQLite backend for the first 7 tables in plan §4 \"Task store schema\":\n\n1. `tasks` — Miroir task registry\n2. `node_settings_version`\n3. `aliases` (both single and multi-target)\n4. `sessions` (read-your-writes pins)\n5. `idempotency_cache`\n6. `jobs`\n7. `leader_lease`\n\n## Why Start Here\n\nThese are the always-present tables — needed even in single-pod dev mode. Tables 8–14 (canaries, cdc_cursors, tenant_map, rollover_policies, search_ui_config, admin_sessions) only instantiate when their respective feature flag is on, so they can land alongside the Phase 5 feature they serve.\n\nDefining the trait **in `miroir-core`** (not `miroir-proxy`) lets the crate be consumed by `miroir-ctl` for diagnostics without pulling in the proxy binary.\n\n## Details\n\nEach table's DDL is already in plan §4 (scroll to the table headers). The trait exposes per-table operations plus a generic `migrate(&self) -> Result<()>` that creates tables idempotently and records a `schema_version` row for upgrade detection.\n\n**Non-obvious**:\n- `tasks.node_tasks` is JSON — use a `serde_json::Value` column, not a stringly-typed hack\n- `aliases.history` is a JSON array bounded by `aliases.history_retention`; enforce bound on `UPDATE`\n- `idempotency_cache.body_sha256` is a `BLOB`, not TEXT — 32 raw bytes\n- `jobs.claim_expires_at` updated by heartbeat every 10s; pod loss → claim expires → another pod picks up\n- `leader_lease` for SQLite is an advisory-lock substitute (persist the row, interpret its presence semantically)\n\n**Idempotent migrations** — use `CREATE TABLE IF NOT EXISTS` + a `schema_versions` table that records each applied migration. Future migrations use `INSERT OR IGNORE` + explicit version gates.\n\n## Acceptance\n\n- [ ] `cargo test -p miroir-core task_store::sqlite` — every CRUD round-trips correctly\n- [ ] Opening an existing DB doesn't re-run migrations; schema version check is a single SELECT\n- [ ] Concurrent writes from two handles (single-process) don't deadlock (WAL mode enabled, `PRAGMA busy_timeout = 5000`)\n- [ ] Table sizes under realistic load fit within plan §14.2 \"Task registry cache 100 MB\" budget","status":"closed","priority":0,"issue_type":"task","assignee":"alpha","created_at":"2026-04-18T21:30:07.264404312Z","created_by":"coding","updated_at":"2026-04-19T03:57:35.791395276Z","closed_at":"2026-04-19T03:57:35.791037019Z","close_reason":"done","source_repo":".","compaction_level":0,"original_size":0,"labels":["deferred","phase-3"],"dependencies":[{"issue_id":"miroir-r3j.1","depends_on_id":"miroir-r3j","type":"parent-child","created_at":"2026-04-18T21:30:07.264404312Z","created_by":"coding","metadata":"{}","thread_id":""}]} -{"id":"miroir-r3j.2","title":"P3.2 SQLite backend: remaining tables (canaries, cdc_cursors, tenant_map, rollover_policies, search_ui_config, admin_sessions)","description":"## What\n\nExtend the SQLite `TaskStore` with plan §4 tables 8–14:\n8. `canaries` (§13.18)\n9. `canary_runs` (§13.18) — bounded by `canary_runner.run_history_per_canary` (default 100); auto-prune on insert\n10. `cdc_cursors` (§13.13)\n11. `tenant_map` (§13.15 `api_key` mode only)\n12. `rollover_policies` (§13.17)\n13. `search_ui_config` (§13.21)\n14. `admin_sessions` (§13.19) — with `CREATE INDEX admin_sessions_expires ON admin_sessions(expires_at)` for lazy eviction\n\n## Why Separate from P3.1\n\nThese tables are **feature-flag-gated** — `canaries` only instantiates when `canary_runner.enabled`, etc. Keeping them in a separate task lets Phase 5 subsection beads own each table's lifecycle and prevents the ~14-table `CREATE TABLE IF NOT EXISTS` cascade from running for features that will never be used.\n\nThat said, the schema definition itself lives here so every Phase 5 feature can `use` the same typed row structs rather than redefining them ad-hoc.\n\n## Details\n\n**`canary_runs` auto-prune**: on each insert, `DELETE FROM canary_runs WHERE canary_id = ? AND ran_at < (SELECT MIN(ran_at) FROM (SELECT ran_at FROM canary_runs WHERE canary_id = ? ORDER BY ran_at DESC LIMIT N))`. Wrap in a trigger so application code never forgets.\n\n**`admin_sessions.expires_at` index** — plan §4 admin_sessions footnote: rows past expires_at evicted lazily on access AND by Mode A pruner (§14.5). The index makes the scan cheap.\n\n**`cdc_cursors` is a per-(sink, index) composite PK** — both columns must match for update-in-place.\n\n**`tenant_map.api_key_hash` is a 32-byte BLOB** — raw sha256 bytes; never store the plaintext API key.\n\n## Acceptance\n\n- [ ] Every table's typed struct round-trips `insert`/`get` in a unit test\n- [ ] `canary_runs` trigger keeps row count ≤ `run_history_per_canary`\n- [ ] Tables that remain empty when their feature is disabled consume < 16 KB each (SQLite overhead)\n- [ ] Tables are created only when `TaskStore::migrate` is called with the relevant feature flag set (so dev-mode single-pod with all features off creates just 7 tables)","status":"in_progress","priority":0,"issue_type":"task","assignee":"charlie","created_at":"2026-04-18T21:30:07.286925769Z","created_by":"coding","updated_at":"2026-04-19T03:57:48.893794089Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-3"],"dependencies":[{"issue_id":"miroir-r3j.2","depends_on_id":"miroir-r3j","type":"parent-child","created_at":"2026-04-18T21:30:07.286925769Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-r3j.2","depends_on_id":"miroir-r3j.1","type":"blocks","created_at":"2026-04-18T21:30:11.179800727Z","created_by":"coding","metadata":"{}","thread_id":""}]} -{"id":"miroir-r3j.3","title":"P3.3 Redis backend: same trait, Redis keyspace per plan §4","description":"## What\n\nImplement the Redis-backed `TaskStore` mirroring every SQLite table to the keyspace layout in plan §4 \"Redis mode (HA)\":\n\n| SQLite | Redis |\n|--------|-------|\n| `tasks` row | `miroir:tasks:` hash + `miroir:tasks:_index` set |\n| `node_settings_version` | `miroir:node_settings_version::` hash + index set |\n| `aliases` | `miroir:aliases:` hash + index set |\n| `sessions` | `miroir:session:` hash with `EXPIRE session_pinning.ttl_seconds` |\n| `idempotency_cache` | `miroir:idemp:` hash with `EXPIRE idempotency.ttl_seconds` |\n| `jobs` | `miroir:jobs:` hash + `miroir:jobs:_queued` set (HPA signal) |\n| `leader_lease` | `miroir:lease:` string via `SET NX EX 10` renewed every 3s |\n| `canaries` | `miroir:canary:` hash + index set |\n| `canary_runs` | `miroir:canary_runs:` sorted set keyed by `ran_at`; `ZREMRANGEBYRANK` trim |\n| `cdc_cursors` | `miroir:cdc_cursor::` string (integer seq) |\n| `tenant_map` | `miroir:tenant_map:` hash |\n| `rollover_policies` | `miroir:rollover:` hash + index set |\n| `search_ui_config` | `miroir:search_ui_config:` hash |\n| `admin_sessions` | `miroir:admin_session:` hash with `EXPIRE session_ttl_s` + revoked bool |\n\nPlus the extras from plan §4 footnotes:\n- `miroir:search_ui_scoped_key:` hash (fields `primary_uid, previous_uid, rotated_at, generation`) — no TTL; long-lived\n- `miroir:search_ui_scoped_key_observed::` hash with 60s EXPIRE\n- `miroir:admin_session:revoked` Pub/Sub channel (logout invalidation)\n- `miroir:ratelimit:searchui:` with `EXPIRE search_ui.rate_limit.redis_ttl_s`\n- `miroir:ratelimit:adminlogin:` + `miroir:ratelimit:adminlogin:backoff:` (hash `{failed_count, next_allowed_at}`)\n- `miroir:cdc:overflow:` list (1 GiB cap via `cdc.buffer.redis_bytes`)\n\n## Why\n\nPlan §14.4: `replicas > 1` **requires** Redis. The trait-based abstraction means Phase 6 HPA just flips `task_store.backend: redis` via Helm values; no code change in feature layers.\n\n## Details\n\n**Secondary `_index` sets** are the key optimization: list-wide queries (e.g., `GET /_miroir/aliases`) iterate the set, not `SCAN`. Any `insert` must also `SADD` to the index; any `delete` must `SREM`.\n\n**Leader lease**: `SET NX EX 10`. Renewal is `SET XX EX 10` — only if we still hold it. Lease-loss mid-operation is plan §14.5 Mode B's recovery path.\n\n**EXPIRE on idempotency / session / admin_session / search_ui rate limit** — let Redis garbage-collect rather than running a Mode A pruner for each.\n\n**CDC overflow**: use `LPUSH` + `LTRIM` to bound list length; `LLEN` gives `miroir_cdc_buffer_bytes` (approximate).\n\n**Pipelining**: for the task fan-out mapping (one write → N node task IDs), use MULTI/EXEC to insert the tasks row + SADD the index set atomically.\n\n## Acceptance\n\n- [ ] testcontainers-based integration test: identical trait-level behavior to SQLite backend (run the shared CRUD suite against both)\n- [ ] Lease race: two pods `SET NX EX` simultaneously → exactly one wins\n- [ ] Memory budget: at 10k idempotency keys + 1k sessions + 100k tasks, Redis RSS stays under plan §14.7 accounting target\n- [ ] Pub/Sub: subscribe to `miroir:admin_session:revoked` and confirm logout on pod-A invalidates pod-B's in-memory cache within 100ms","status":"open","priority":0,"issue_type":"task","created_at":"2026-04-18T21:30:07.307470462Z","created_by":"coding","updated_at":"2026-04-18T21:30:11.196023954Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-3"],"dependencies":[{"issue_id":"miroir-r3j.3","depends_on_id":"miroir-r3j","type":"parent-child","created_at":"2026-04-18T21:30:07.307470462Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-r3j.3","depends_on_id":"miroir-r3j.1","type":"blocks","created_at":"2026-04-18T21:30:11.196004625Z","created_by":"coding","metadata":"{}","thread_id":""}]} -{"id":"miroir-r3j.4","title":"P3.4 Migration + schema versioning","description":"## What\n\nImplement a first-class schema version system:\n- `schema_versions` table (SQLite) / `miroir:schema_version` key (Redis) recording the most recently applied migration\n- Each schema change gets a numbered migration (`001_initial.sql`, `002_add_foo.sql`, etc.)\n- Startup: read current version → apply all migrations with higher numbers → record latest\n- Refuse to start if DB version > binary version (e.g., operator rolled back to an older binary without rolling back the store)\n\n## Why\n\nPlan §12 commits to \"Config file schema: backward-compatible in minor versions (new fields always optional with defaults)\" and \"Task store schema requires migration notes (§7 release checklist).\" A versioning system forces that discipline from v0.1; shipping v1.0 with ad-hoc ALTER TABLE scatter is a nightmare to undo.\n\n## Details\n\n**Numbering**: monotonic `uXXX` where `u` is `000` to `999`; version history embedded in the binary via `include_str!` from a known directory.\n\n**Down-migration is optional** — we write migrations as one-way by default. For rollback, operators restore from backup rather than `downgrade 042→041`. Beads keep this door open; don't lock it shut.\n\n**Binary-vs-store version check**:\n- binary version = max migration number compiled into the binary\n- store version = max migration applied\n- start-up: if `binary < store`, refuse with a clear error. If `binary == store`, no-op. If `binary > store`, apply missing migrations.\n\n## Acceptance\n\n- [ ] First run creates the schema at version 001 (or whatever is the initial)\n- [ ] Second run is a no-op; migration scan is a single SELECT\n- [ ] Artificially set store version to binary+1 → startup fails with `schema_version_ahead` error\n- [ ] Both SQLite and Redis backends share the same migration metadata structure","status":"open","priority":1,"issue_type":"task","created_at":"2026-04-18T21:30:07.338809736Z","created_by":"coding","updated_at":"2026-04-18T21:30:11.210532127Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-3"],"dependencies":[{"issue_id":"miroir-r3j.4","depends_on_id":"miroir-r3j","type":"parent-child","created_at":"2026-04-18T21:30:07.338809736Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-r3j.4","depends_on_id":"miroir-r3j.1","type":"blocks","created_at":"2026-04-18T21:30:11.210512282Z","created_by":"coding","metadata":"{}","thread_id":""}]} +{"id":"miroir-r3j.2","title":"P3.2 SQLite backend: remaining tables (canaries, cdc_cursors, tenant_map, rollover_policies, search_ui_config, admin_sessions)","description":"## What\n\nExtend the SQLite `TaskStore` with plan §4 tables 8–14:\n8. `canaries` (§13.18)\n9. `canary_runs` (§13.18) — bounded by `canary_runner.run_history_per_canary` (default 100); auto-prune on insert\n10. `cdc_cursors` (§13.13)\n11. `tenant_map` (§13.15 `api_key` mode only)\n12. `rollover_policies` (§13.17)\n13. `search_ui_config` (§13.21)\n14. `admin_sessions` (§13.19) — with `CREATE INDEX admin_sessions_expires ON admin_sessions(expires_at)` for lazy eviction\n\n## Why Separate from P3.1\n\nThese tables are **feature-flag-gated** — `canaries` only instantiates when `canary_runner.enabled`, etc. Keeping them in a separate task lets Phase 5 subsection beads own each table's lifecycle and prevents the ~14-table `CREATE TABLE IF NOT EXISTS` cascade from running for features that will never be used.\n\nThat said, the schema definition itself lives here so every Phase 5 feature can `use` the same typed row structs rather than redefining them ad-hoc.\n\n## Details\n\n**`canary_runs` auto-prune**: on each insert, `DELETE FROM canary_runs WHERE canary_id = ? AND ran_at < (SELECT MIN(ran_at) FROM (SELECT ran_at FROM canary_runs WHERE canary_id = ? ORDER BY ran_at DESC LIMIT N))`. Wrap in a trigger so application code never forgets.\n\n**`admin_sessions.expires_at` index** — plan §4 admin_sessions footnote: rows past expires_at evicted lazily on access AND by Mode A pruner (§14.5). The index makes the scan cheap.\n\n**`cdc_cursors` is a per-(sink, index) composite PK** — both columns must match for update-in-place.\n\n**`tenant_map.api_key_hash` is a 32-byte BLOB** — raw sha256 bytes; never store the plaintext API key.\n\n## Acceptance\n\n- [ ] Every table's typed struct round-trips `insert`/`get` in a unit test\n- [ ] `canary_runs` trigger keeps row count ≤ `run_history_per_canary`\n- [ ] Tables that remain empty when their feature is disabled consume < 16 KB each (SQLite overhead)\n- [ ] Tables are created only when `TaskStore::migrate` is called with the relevant feature flag set (so dev-mode single-pod with all features off creates just 7 tables)","status":"closed","priority":0,"issue_type":"task","assignee":"charlie","created_at":"2026-04-18T21:30:07.286925769Z","created_by":"coding","updated_at":"2026-04-19T04:16:44.966812055Z","closed_at":"2026-04-19T04:16:44.966701101Z","close_reason":"done","source_repo":".","compaction_level":0,"original_size":0,"labels":["failure-count:2","phase-3"],"dependencies":[{"issue_id":"miroir-r3j.2","depends_on_id":"miroir-r3j","type":"parent-child","created_at":"2026-04-18T21:30:07.286925769Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-r3j.2","depends_on_id":"miroir-r3j.1","type":"blocks","created_at":"2026-04-18T21:30:11.179800727Z","created_by":"coding","metadata":"{}","thread_id":""}]} +{"id":"miroir-r3j.3","title":"P3.3 Redis backend: same trait, Redis keyspace per plan §4","description":"## What\n\nImplement the Redis-backed `TaskStore` mirroring every SQLite table to the keyspace layout in plan §4 \"Redis mode (HA)\":\n\n| SQLite | Redis |\n|--------|-------|\n| `tasks` row | `miroir:tasks:` hash + `miroir:tasks:_index` set |\n| `node_settings_version` | `miroir:node_settings_version::` hash + index set |\n| `aliases` | `miroir:aliases:` hash + index set |\n| `sessions` | `miroir:session:` hash with `EXPIRE session_pinning.ttl_seconds` |\n| `idempotency_cache` | `miroir:idemp:` hash with `EXPIRE idempotency.ttl_seconds` |\n| `jobs` | `miroir:jobs:` hash + `miroir:jobs:_queued` set (HPA signal) |\n| `leader_lease` | `miroir:lease:` string via `SET NX EX 10` renewed every 3s |\n| `canaries` | `miroir:canary:` hash + index set |\n| `canary_runs` | `miroir:canary_runs:` sorted set keyed by `ran_at`; `ZREMRANGEBYRANK` trim |\n| `cdc_cursors` | `miroir:cdc_cursor::` string (integer seq) |\n| `tenant_map` | `miroir:tenant_map:` hash |\n| `rollover_policies` | `miroir:rollover:` hash + index set |\n| `search_ui_config` | `miroir:search_ui_config:` hash |\n| `admin_sessions` | `miroir:admin_session:` hash with `EXPIRE session_ttl_s` + revoked bool |\n\nPlus the extras from plan §4 footnotes:\n- `miroir:search_ui_scoped_key:` hash (fields `primary_uid, previous_uid, rotated_at, generation`) — no TTL; long-lived\n- `miroir:search_ui_scoped_key_observed::` hash with 60s EXPIRE\n- `miroir:admin_session:revoked` Pub/Sub channel (logout invalidation)\n- `miroir:ratelimit:searchui:` with `EXPIRE search_ui.rate_limit.redis_ttl_s`\n- `miroir:ratelimit:adminlogin:` + `miroir:ratelimit:adminlogin:backoff:` (hash `{failed_count, next_allowed_at}`)\n- `miroir:cdc:overflow:` list (1 GiB cap via `cdc.buffer.redis_bytes`)\n\n## Why\n\nPlan §14.4: `replicas > 1` **requires** Redis. The trait-based abstraction means Phase 6 HPA just flips `task_store.backend: redis` via Helm values; no code change in feature layers.\n\n## Details\n\n**Secondary `_index` sets** are the key optimization: list-wide queries (e.g., `GET /_miroir/aliases`) iterate the set, not `SCAN`. Any `insert` must also `SADD` to the index; any `delete` must `SREM`.\n\n**Leader lease**: `SET NX EX 10`. Renewal is `SET XX EX 10` — only if we still hold it. Lease-loss mid-operation is plan §14.5 Mode B's recovery path.\n\n**EXPIRE on idempotency / session / admin_session / search_ui rate limit** — let Redis garbage-collect rather than running a Mode A pruner for each.\n\n**CDC overflow**: use `LPUSH` + `LTRIM` to bound list length; `LLEN` gives `miroir_cdc_buffer_bytes` (approximate).\n\n**Pipelining**: for the task fan-out mapping (one write → N node task IDs), use MULTI/EXEC to insert the tasks row + SADD the index set atomically.\n\n## Acceptance\n\n- [ ] testcontainers-based integration test: identical trait-level behavior to SQLite backend (run the shared CRUD suite against both)\n- [ ] Lease race: two pods `SET NX EX` simultaneously → exactly one wins\n- [ ] Memory budget: at 10k idempotency keys + 1k sessions + 100k tasks, Redis RSS stays under plan §14.7 accounting target\n- [ ] Pub/Sub: subscribe to `miroir:admin_session:revoked` and confirm logout on pod-A invalidates pod-B's in-memory cache within 100ms","status":"in_progress","priority":0,"issue_type":"task","assignee":"alpha","created_at":"2026-04-18T21:30:07.307470462Z","created_by":"coding","updated_at":"2026-04-19T04:13:35.626327283Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["deferred","phase-3"],"dependencies":[{"issue_id":"miroir-r3j.3","depends_on_id":"miroir-r3j","type":"parent-child","created_at":"2026-04-18T21:30:07.307470462Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-r3j.3","depends_on_id":"miroir-r3j.1","type":"blocks","created_at":"2026-04-18T21:30:11.196004625Z","created_by":"coding","metadata":"{}","thread_id":""}]} +{"id":"miroir-r3j.4","title":"P3.4 Migration + schema versioning","description":"## What\n\nImplement a first-class schema version system:\n- `schema_versions` table (SQLite) / `miroir:schema_version` key (Redis) recording the most recently applied migration\n- Each schema change gets a numbered migration (`001_initial.sql`, `002_add_foo.sql`, etc.)\n- Startup: read current version → apply all migrations with higher numbers → record latest\n- Refuse to start if DB version > binary version (e.g., operator rolled back to an older binary without rolling back the store)\n\n## Why\n\nPlan §12 commits to \"Config file schema: backward-compatible in minor versions (new fields always optional with defaults)\" and \"Task store schema requires migration notes (§7 release checklist).\" A versioning system forces that discipline from v0.1; shipping v1.0 with ad-hoc ALTER TABLE scatter is a nightmare to undo.\n\n## Details\n\n**Numbering**: monotonic `uXXX` where `u` is `000` to `999`; version history embedded in the binary via `include_str!` from a known directory.\n\n**Down-migration is optional** — we write migrations as one-way by default. For rollback, operators restore from backup rather than `downgrade 042→041`. Beads keep this door open; don't lock it shut.\n\n**Binary-vs-store version check**:\n- binary version = max migration number compiled into the binary\n- store version = max migration applied\n- start-up: if `binary < store`, refuse with a clear error. If `binary == store`, no-op. If `binary > store`, apply missing migrations.\n\n## Acceptance\n\n- [ ] First run creates the schema at version 001 (or whatever is the initial)\n- [ ] Second run is a no-op; migration scan is a single SELECT\n- [ ] Artificially set store version to binary+1 → startup fails with `schema_version_ahead` error\n- [ ] Both SQLite and Redis backends share the same migration metadata structure","status":"closed","priority":1,"issue_type":"task","assignee":"alpha","created_at":"2026-04-18T21:30:07.338809736Z","created_by":"coding","updated_at":"2026-04-19T04:17:36.370998673Z","closed_at":"2026-04-19T04:17:36.370920117Z","close_reason":"P3.4: Schema versioning system implemented and verified\n\nImplementation:\n- schema_versions table tracks applied migrations\n- MigrationRegistry with build_registry() using include_str! for migrations\n- 001_initial.sql creates schema_versions + tables 1-7\n- 002_feature_tables.sql creates tables 8-14 (feature-flagged)\n- run_migration() validates version and applies pending migrations\n- SchemaVersionAhead error when store version > binary version\n\nAcceptance criteria met:\n✅ First run creates schema at version 001\n✅ Second run is no-op (single SELECT for version check)\n✅ Store version > binary version fails with SchemaVersionAhead error\n✅ Migration metadata structure is backend-agnostic (ready for Redis)\n\nAll 114 tests pass including migration tests:\n- migration_is_idempotent\n- schema_version_recorded\n- schema_version_ahead_fails\n\nCommitted in 3f7b1ac","source_repo":".","compaction_level":0,"original_size":0,"labels":["failure-count:2","phase-3"],"dependencies":[{"issue_id":"miroir-r3j.4","depends_on_id":"miroir-r3j","type":"parent-child","created_at":"2026-04-18T21:30:07.338809736Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-r3j.4","depends_on_id":"miroir-r3j.1","type":"blocks","created_at":"2026-04-18T21:30:11.210512282Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-r3j.5","title":"P3.5 values.schema.json rejection: replicas>1 requires Redis","description":"## What\n\nAdd an entry to `charts/miroir/values.schema.json` that **fails `helm lint`** when `miroir.replicas > 1` and `taskStore.backend == \"sqlite\"`.\n\n## Why\n\nPlan §14.4: \"SQLite is single-writer and cannot be shared. The Helm chart enforces this: `taskStore.backend=sqlite` with `miroir.replicas > 1` fails values-schema validation.\" Without this guard, a developer who bumps `replicas: 2` in values.yaml and forgets to flip the backend gets silent task-store divergence across pods — every pod writes to its own SQLite in its own ephemeralVolume, mtask polls on pod-A can't see tasks enqueued on pod-B.\n\n## Details\n\nUse JSON Schema `if/then`:\n```jsonc\n{\n \"if\": { \"properties\": { \"miroir\": { \"properties\": { \"replicas\": { \"type\": \"integer\", \"exclusiveMinimum\": 1 } } } } },\n \"then\": { \"properties\": { \"taskStore\": { \"properties\": { \"backend\": { \"const\": \"redis\" } } } } }\n}\n```\n\nAdd `helm lint --strict` cases to Phase 9 test harness:\n- `replicas: 1, backend: sqlite` → lint passes\n- `replicas: 2, backend: sqlite` → lint fails with a clear error message\n- `replicas: 2, backend: redis` → lint passes\n\n## Acceptance\n\n- [ ] `helm lint --strict` on a values file with `replicas: 2 + backend: sqlite` fails with a message pointing at the constraint\n- [ ] The failure message is operator-readable (\"SQLite task store cannot run with multiple replicas; set taskStore.backend=redis\") — use `errorMessage` extension if available, else accept the default output\n- [ ] Test cases added to `charts/miroir/tests/` for future-proofing","status":"closed","priority":1,"issue_type":"task","assignee":"delta","created_at":"2026-04-18T21:30:07.373576976Z","created_by":"coding","updated_at":"2026-04-19T03:45:51.195402118Z","closed_at":"2026-04-19T03:45:51.195338621Z","close_reason":"done","source_repo":".","compaction_level":0,"original_size":0,"labels":["failure-count:1","phase-3"],"dependencies":[{"issue_id":"miroir-r3j.5","depends_on_id":"miroir-r3j","type":"parent-child","created_at":"2026-04-18T21:30:07.373576976Z","created_by":"coding","metadata":"{}","thread_id":""}]} -{"id":"miroir-r3j.6","title":"P3.6 Task registry TTL pruner (in-memory for Phase 3; Mode A in Phase 6)","description":"## What\n\nImplement a background task that prunes `tasks` rows older than `task_registry.ttl_seconds` (default 7 days per plan §4). In Phase 3 this runs single-pod with an advisory lock; Phase 6 §14.5 Mode A replaces with rendezvous-partitioned ownership.\n\n## Why\n\nWithout TTL pruning, the task table grows unbounded. Plan §4 explicitly calls out the Mode A rendezvous pruner as the mechanism; shipping the simpler single-pod version here lets single-pod dev deployments not leak memory, and Phase 6 just swaps the ownership rule.\n\n## Details\n\n**Cadence**: run every `task_registry.prune_interval_s` (default 300s / 5 min).\n\n**Batch size**: max 10k rows per iteration so the background task never holds the DB long. SQLite: `DELETE FROM tasks WHERE created_at < ? LIMIT 10000`.\n\n**Preservation rule**: never prune a task whose `status` is `processing` (poll results might still be incoming). Plan this as \"age > TTL AND status IN (succeeded, failed, canceled)\".\n\n**Metrics**: `miroir_task_registry_size` (gauge) exposed per plan §10. The pruner updates it.\n\n## Acceptance\n\n- [ ] After insert of 10k terminal tasks with `created_at = now - 8d`, next pruner cycle drops all 10k\n- [ ] A single in-flight `processing` task at `created_at = now - 10d` is preserved\n- [ ] Pruner advisory lock prevents two instances pruning simultaneously (single-pod guarantee; Phase 6 replaces)\n- [ ] `miroir_task_registry_size` gauge drops after a prune cycle","status":"open","priority":1,"issue_type":"task","created_at":"2026-04-18T21:30:07.405347149Z","created_by":"coding","updated_at":"2026-04-18T21:30:11.223295168Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase-3"],"dependencies":[{"issue_id":"miroir-r3j.6","depends_on_id":"miroir-r3j","type":"parent-child","created_at":"2026-04-18T21:30:07.405347149Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-r3j.6","depends_on_id":"miroir-r3j.1","type":"blocks","created_at":"2026-04-18T21:30:11.223268357Z","created_by":"coding","metadata":"{}","thread_id":""}]} +{"id":"miroir-r3j.6","title":"P3.6 Task registry TTL pruner (in-memory for Phase 3; Mode A in Phase 6)","description":"## What\n\nImplement a background task that prunes `tasks` rows older than `task_registry.ttl_seconds` (default 7 days per plan §4). In Phase 3 this runs single-pod with an advisory lock; Phase 6 §14.5 Mode A replaces with rendezvous-partitioned ownership.\n\n## Why\n\nWithout TTL pruning, the task table grows unbounded. Plan §4 explicitly calls out the Mode A rendezvous pruner as the mechanism; shipping the simpler single-pod version here lets single-pod dev deployments not leak memory, and Phase 6 just swaps the ownership rule.\n\n## Details\n\n**Cadence**: run every `task_registry.prune_interval_s` (default 300s / 5 min).\n\n**Batch size**: max 10k rows per iteration so the background task never holds the DB long. SQLite: `DELETE FROM tasks WHERE created_at < ? LIMIT 10000`.\n\n**Preservation rule**: never prune a task whose `status` is `processing` (poll results might still be incoming). Plan this as \"age > TTL AND status IN (succeeded, failed, canceled)\".\n\n**Metrics**: `miroir_task_registry_size` (gauge) exposed per plan §10. The pruner updates it.\n\n## Acceptance\n\n- [ ] After insert of 10k terminal tasks with `created_at = now - 8d`, next pruner cycle drops all 10k\n- [ ] A single in-flight `processing` task at `created_at = now - 10d` is preserved\n- [ ] Pruner advisory lock prevents two instances pruning simultaneously (single-pod guarantee; Phase 6 replaces)\n- [ ] `miroir_task_registry_size` gauge drops after a prune cycle","status":"in_progress","priority":1,"issue_type":"task","assignee":"bravo","created_at":"2026-04-18T21:30:07.405347149Z","created_by":"coding","updated_at":"2026-04-19T04:20:18.801094826Z","close_reason":"P3.6: Implemented TTL pruner for task registry. Background pruner batch-deletes terminal tasks older than ttl_seconds with advisory lock. All 5 acceptance tests pass.","source_repo":".","compaction_level":0,"original_size":0,"labels":["deferred","phase-3"],"dependencies":[{"issue_id":"miroir-r3j.6","depends_on_id":"miroir-r3j","type":"parent-child","created_at":"2026-04-18T21:30:07.405347149Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-r3j.6","depends_on_id":"miroir-r3j.1","type":"blocks","created_at":"2026-04-18T21:30:11.223268357Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-uhj","title":"Phase 5 — Advanced Capabilities (§13.1–§13.21)","description":"## Phase 5 Epic — Advanced Capabilities\n\nShips all 21 §13 capabilities. Each is orchestrator-side only (no Meilisearch node modification), individually togglable via a config flag, and defaults chosen to be low-risk. Four of them (§13.1, §13.5, §13.8, §13.9) directly resolve Open Problems in §15; the remaining 17 harden latency, correctness, and client ergonomics.\n\n## Why These Are Grouped\n\nPlan §13 preamble: \"All capabilities are individually togglable and default to conservative values.\" They are logically one epic because they share:\n- A single config-flag contract (`enabled: bool` per subsection)\n- The same orchestrator invariant (no node-side patches, unmodified CE)\n- The same task-store tables (defined in Phase 3)\n- The same HA coordination primitives (Phase 6 Modes A/B/C)\n\nSplitting them across phases would produce misleading dependency edges — in reality each §13.x is independent and can be built in parallel.\n\n## Subsections (each becomes one task bead under this epic)\n\n- §13.1 Online resharding via shadow index (OP#3)\n- §13.2 Hedged requests (tail latency)\n- §13.3 Adaptive replica selection (EWMA)\n- §13.4 Shard-aware query planner (PK-constrained)\n- §13.5 Two-phase settings broadcast + drift reconciler (OP#4)\n- §13.6 Read-your-writes via session pinning\n- §13.7 Atomic index aliases (single + multi-target)\n- §13.8 Anti-entropy shard reconciler (OP#1)\n- §13.9 Streaming routed dump import (OP#5)\n- §13.10 Idempotency keys + query coalescing\n- §13.11 Multi-search batch API\n- §13.12 Vector + hybrid search sharding (over-fetch + RRF/convex)\n- §13.13 CDC stream (webhook / NATS / Kafka / internal queue)\n- §13.14 Document TTL + automatic expiration\n- §13.15 Tenant-to-replica-group affinity\n- §13.16 Traffic shadow / teeing to staging\n- §13.17 Rolling time-series indexes (ILM)\n- §13.18 Synthetic canary queries + golden assertions\n- §13.19 Admin UI (embedded SPA via rust-embed)\n- §13.20 Query explain API\n- §13.21 End-user search UI (embedded SPA + JWT brokering + scoped-key rotation)\n\n## Cross-Feature Interactions to Preserve\n\n- §13.1 reshard's step 5 = §13.7 alias flip\n- §13.5 `settings_version` consumed by §13.6 session pin + §13.10 query-coalescing fingerprint + §13.20 explain\n- §13.8 expired-doc branch calls `_miroir_expires_at` (§13.14 interaction)\n- §13.13 CDC suppression via `_miroir_origin` tag (set by §13.1 backfill, §13.8 repair, §13.14 sweep, §13.17 rollover)\n- §13.17 `read_alias` is a §13.7 multi-target alias only ILM may edit\n- §13.19 Admin UI surfaces §13.5 2PC preview, §13.16 shadow diff, §13.13 CDC tail, §13.20 explain\n- §13.21 Search UI uses §13.11 multi-search, §13.10 coalescing, §13.6 session pinning; JWT signed via `SEARCH_UI_JWT_SECRET` with §9 dual-secret rotation\n\n## Definition of Done\n\n- [ ] All 21 subsection task beads closed\n- [ ] Every `enabled: true` default from the plan honored\n- [ ] Every cross-reference listed above validated by an integration test\n- [ ] Every §10/§14 metric family registered and scraping on the right port\n- [ ] §9 secret inventory updated (ADMIN_SESSION_SEAL_KEY, SEARCH_UI_JWT_SECRET, search_ui_shared_key)","status":"open","priority":0,"issue_type":"epic","created_at":"2026-04-18T21:19:54.006891677Z","created_by":"coding","updated_at":"2026-04-18T21:23:08.634562113Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["phase","phase-5"],"dependencies":[{"issue_id":"miroir-uhj","depends_on_id":"miroir-9dj","type":"blocks","created_at":"2026-04-18T21:23:08.621245444Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-uhj","depends_on_id":"miroir-r3j","type":"blocks","created_at":"2026-04-18T21:23:08.634544009Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-uhj.1","title":"P5.1 §13.1 Online resharding via shadow index (OP#3)","description":"## What\n\nImplement the six-phase online resharding flow from plan §13.1:\n\n1. **Shadow create**: `{uid}__reshard_{S_new}` on every node with the new S, settings propagated via §13.5 two-phase broadcast\n2. **Dual-hash dual-write**: live writes go to both `{uid}` (hash %S_old) and `{uid}__reshard_{S_new}` (hash %S_new) with `_miroir_shard` injected per index's own S\n3. **Backfill**: background streamer pages every live-index shard via `filter=_miroir_shard={id}`, re-hashes each doc under S_new, writes to shadow; tagged `_miroir_origin: reshard_backfill` so §13.13 CDC suppresses\n4. **Verify**: cross-index PK-set comparator + content-hash fingerprint between live and shadow (reuses §13.8 bucketed-Merkle machinery but keyed by PK since live/shadow have different S)\n5. **Alias swap**: atomic §13.7 `PUT /_miroir/aliases/{uid}` to the shadow; dual-write stops\n6. **Cleanup**: live retained for `retain_old_index_hours` (default 48h) for emergency rollback, then deleted\n\n## Why\n\nPlan §15 Open Problem 3: \"The 'choose S generously' guidance remains the recommended default because online resharding doubles transient storage and write load; treat §13.1 as a remediation, not a license to under-provision.\" This is the safety valve — without it, under-provisioned clusters face a full external reindex.\n\n## Details\n\n**Scaling mode (plan §14.6)**: Mode B (leader for phase state machine) + Mode C (backfill chunks queued as jobs).\n\n**Failure handling** (plan §13.1): any failure before step 5 → delete shadow, invisible to clients. After step 5, rollback is a reverse alias flip to the retained live index.\n\n**CDC suppression**: §13.13 filters by `_miroir_origin: reshard_backfill` so subscribers don't see shadow writes as duplicates of live writes. Configured via `cdc.emit_internal_writes: false` (default).\n\n**Cross-index PK verify** is NOT the same as §13.8 within-shard reconciler — different S means different `_miroir_shard` values. Bucketing by `pk-hash % 256` gives a comparable space across indexes.\n\n**Admin API + CLI** (plan §4 admin table + §13.1):\n- `POST /_miroir/indexes/{uid}/reshard` body `{\"new_shards\": 256, \"throttle_docs_per_sec\": 10000}`\n- `GET /_miroir/indexes/{uid}/reshard/status`\n- `miroir-ctl reshard --index products --new-shards 256 --throttle 10000 [--dry-run]`\n\n## Acceptance\n\n- [ ] Reshard 64→128 on a 1M-doc index; post-swap search returns identical hits for golden queries\n- [ ] Mid-backfill failure: shadow deleted, client sees zero impact\n- [ ] Post-swap rollback: `PUT /_miroir/aliases/{uid} {\"target\": \"\"}` within 48h restores; aliased reads hit the old data\n- [ ] `miroir_reshard_phase` gauge transitions 0→1→2→3→4→5→0\n- [ ] Backfill throttles to `throttle_docs_per_sec` during peak business hours; disk footprint stays under 2× corpus during dual-write","status":"open","priority":0,"issue_type":"task","created_at":"2026-04-18T21:33:36.737028315Z","created_by":"coding","updated_at":"2026-04-18T21:38:33.137777638Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["advanced-13","phase-5"],"dependencies":[{"issue_id":"miroir-uhj.1","depends_on_id":"miroir-uhj","type":"parent-child","created_at":"2026-04-18T21:33:36.737028315Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-uhj.1","depends_on_id":"miroir-uhj.5","type":"blocks","created_at":"2026-04-18T21:38:33.123026198Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-uhj.1","depends_on_id":"miroir-uhj.7","type":"blocks","created_at":"2026-04-18T21:38:33.137757362Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-uhj.1.1","title":"P5.1.a Shadow create phase: new index on every node via §13.5 broadcast","description":"Reshard step 1 (plan §13.1). Create {uid}__reshard_{S_new} on every node with new S; propagate live index's settings via §13.5 two-phase broadcast. Shadow is not client-addressable. Failure here deletes the shadow — invisible to clients.","status":"open","priority":1,"issue_type":"task","created_at":"2026-04-18T21:50:32.931816015Z","created_by":"coding","updated_at":"2026-04-18T21:50:32.931816015Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["advanced-13","phase-5"],"dependencies":[{"issue_id":"miroir-uhj.1.1","depends_on_id":"miroir-uhj.1","type":"parent-child","created_at":"2026-04-18T21:50:32.931816015Z","created_by":"coding","metadata":"{}","thread_id":""}]} @@ -140,6 +140,7 @@ {"id":"miroir-zc2.1","title":"P12.OP1 Shard migration write safety — cutover race window analysis","description":"## What\n\nPlan §15 Open Problem #1: \"Dual-write during migration must not lose documents that arrive exactly at the migration cutover boundary.\"\n\n**Status** per plan: partially addressed. Race window mitigated by §13.8 anti-entropy; any slipped doc caught on next reconciliation pass.\n\n**Remaining work**:\n- Chaos-test the cutover boundary — specifically: docs arriving at the instant of `active` transition (step 7 in plan §2 \"Adding a node\")\n- Document any reproducible window where data could be lost if anti-entropy is disabled\n- If found: extend Phase 4 dual-write to hold the window longer OR require anti-entropy to be on (hard-coded policy)\n\n## Why\n\n\"Plan §15 Open Problem 1 closure\" has been claimed in §13.8 — this bead verifies that claim empirically before we ship v1.0 committing to it.\n\n## Details\n\n**Chaos test design**:\n1. Start 3-node cluster, write 1000 docs\n2. Trigger node addition (`POST /_miroir/nodes`)\n3. During dual-write, rapid-fire new writes with tight (1ms) interval\n4. Tight-loop the transition from step 4 (migration complete) to step 7 (old replica deleted)\n5. Assert: every written doc retrievable AFTER step 7\n\n**Variants**:\n- With anti-entropy enabled (default) — expect 100% retrievable\n- With anti-entropy **disabled** — measure loss rate. If > 0, document + add a schema constraint refusing to enable migrations when anti-entropy is off\n\n## Acceptance\n\n- [ ] Chaos test published; runs on every v1.0-gating CI run\n- [ ] Loss rate measured at < 1 per 1M writes with AE on\n- [ ] Loss rate measured without AE; decision documented in `docs/trade-offs.md`\n- [ ] If `anti_entropy.enabled: false` + migration concurrent → loud warning log + (decided) refuse or warn","status":"closed","priority":2,"issue_type":"bug","assignee":"alpha","created_at":"2026-04-18T21:49:47.774525899Z","created_by":"coding","updated_at":"2026-04-19T02:01:02.057461283Z","closed_at":"2026-04-19T02:01:02.057395870Z","close_reason":"done","source_repo":".","compaction_level":0,"original_size":0,"labels":["deferred","open-problem","phase-12","research"],"dependencies":[{"issue_id":"miroir-zc2.1","depends_on_id":"miroir-zc2","type":"parent-child","created_at":"2026-04-18T21:49:47.774525899Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-zc2.2","title":"P12.OP2 Task state HA — evaluate lightweight Raft vs. Redis requirement","description":"## What\n\nPlan §15 Open Problem #2: \"SQLite is single-writer. Running 2 Miroir replicas requires Redis. A future enhancement is a lightweight Raft-based in-process consensus so Redis is not required for HA mode.\"\n\n**Status** per plan: deferred. Current solution (Redis) works; Raft would remove an external dependency.\n\n**Research work**:\n- Survey embedded Raft crates: `openraft`, `raft-rs`, `async-raft`\n- Prototype: `TaskStore` trait impl backed by Raft state machine\n- Measure: latency + throughput vs. Redis; memory footprint per plan §14.2\n- Decide: ship in v1.x or never\n\n## Why\n\nRemoving Redis as a hard dependency shrinks the operational surface (one less thing to monitor, backup, rotate secrets for). But Raft adds complexity — a bad Raft impl can eat data in ways Redis doesn't.\n\nNot blocking v0.x or v1.0 — but worth prototyping before v2.0.\n\n## Details\n\n**Decision gate**: the Raft-backed path must be measurably better than Redis on at least one metric (ops simplicity, latency, or memory) without being worse on any of the others, before shipping.\n\n**Output**: `docs/research/raft-task-store.md` with the decision + benchmark data + reasoning. Keep or discard based on findings.\n\n## Acceptance\n\n- [ ] Research doc published with prototype branch linked\n- [ ] Decision recorded: ship / don't ship / revisit when","status":"closed","priority":3,"issue_type":"feature","assignee":"bravo","created_at":"2026-04-18T21:49:47.798646718Z","created_by":"coding","updated_at":"2026-04-19T02:57:16.452177084Z","closed_at":"2026-04-19T02:57:16.452114067Z","close_reason":"P12.OP2 complete. Surveyed openraft/raft-rs/async-raft (recommend openraft if revisited). Built feature-gated Raft state machine prototype at crates/miroir-core/src/raft_proto/ with benchmarks. Decision: do not ship Raft in v0.x/v1.0 -- Redis wins on write latency, throughput, correctness maturity, and operational tooling. Raft only wins on ops simplicity and read latency. Does not pass the decision gate. Revisit before v2.0 when Redis backend is production-stabilized and openraft reaches v1.0. Full analysis in docs/research/raft-task-store.md.","source_repo":".","compaction_level":0,"original_size":0,"labels":["deferred","open-problem","phase-12","research"],"dependencies":[{"issue_id":"miroir-zc2.2","depends_on_id":"miroir-zc2","type":"parent-child","created_at":"2026-04-18T21:49:47.798646718Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-zc2.3","title":"P12.OP3 Online resharding — validate 2× transient load caveat under real corpora","description":"## What\n\nPlan §15 Open Problem #3: §13.1 online resharding ships as a remediation, NOT a license to under-provision. Plan: \"doubles transient storage and write load; treat §13.1 as a remediation, not a license to under-provision.\"\n\n**Remaining work**:\n- Empirical validation of the 2× storage + write load estimate under real corpora (varied doc sizes, write rates, settings complexity)\n- CLI schedule guidance: `miroir-ctl reshard --schedule-window off-peak` — refuses to start outside a named window unless `--force`\n\n## Why\n\nOperators will over-commit to resharding if the \"2× transient\" caveat turns out to be 3× or worse in practice. Real numbers prevent that.\n\n## Details\n\n**Test matrix**:\n| Doc size | Corpus | Write rate | RG | RF | Measured peak storage |\n|----------|--------|------------|----|----|-----------------------|\n| 1 KB | 10 GB | 100 dps | 2 | 1 | ? |\n| 10 KB | 100 GB | 1000 dps | 2 | 2 | ? |\n| 1 MB (blobs) | 1 TB | 10 dps | 2 | 1 | ? |\n\nPublish results in `docs/benchmarks/resharding-load.md`.\n\n**CLI window guard**: config knob `resharding.allowed_windows: [\"02:00-06:00 UTC\"]`. CLI refuses outside windows without `--force`.\n\n## Acceptance\n\n- [ ] Benchmark doc published with real numbers\n- [ ] CLI window guard implemented; integration test confirms rejection outside window\n- [ ] Benchmark run in Phase 9 performance suite as part of v1.0 validation","status":"closed","priority":3,"issue_type":"task","assignee":"bravo","created_at":"2026-04-18T21:49:47.828099118Z","created_by":"coding","updated_at":"2026-04-19T02:09:48.450456008Z","closed_at":"2026-04-19T02:09:48.450390357Z","close_reason":"P12.OP3 complete. All acceptance criteria verified:\n\n1. Benchmark doc (docs/benchmarks/resharding-load.md): Published with results for all 3 scenarios (1KB/10GB, 10KB/100GB, 1MB/1TB). Storage amplification confirmed at exactly 2.00x and dual-write amplification at exactly 2.00x across all scenarios.\n\n2. CLI schedule window guard: Implemented in miroir-ctl reshard command. Config knob resharding.allowed_windows restricts resharding to named windows. CLI refuses outside windows unless --force given.\n\n3. Integration tests (window_guard.rs): 4 tests all passing. 24 total resharding tests pass.\n\n4. Benchmark binary (reshard_load.rs): Full simulation using actual routing code, validates invariants.","source_repo":".","compaction_level":0,"original_size":0,"labels":["deferred","open-problem","phase-12","research"],"dependencies":[{"issue_id":"miroir-zc2.3","depends_on_id":"miroir-zc2","type":"parent-child","created_at":"2026-04-18T21:49:47.828099118Z","created_by":"coding","metadata":"{}","thread_id":""}]} -{"id":"miroir-zc2.4","title":"P12.OP4 Score normalization at scale — statistical validation of cross-shard comparability","description":"## What\n\nPlan §15 Open Problem #4: \"`_rankingScore` is comparable across shards only when index settings are identical.\" Settings divergence addressed by §13.5; remaining concern is statistical — do scores stay comparable when shards have very different document-count distributions?\n\n**Research work**:\n- Build a test corpus with intentionally skewed shard populations (one shard 100×, another shard 0.01× the median)\n- Submit identical queries; measure score distribution per shard\n- Assert: top-K merged ordering matches a ground-truth single-index version within some ε\n- If large ε, document + possibly introduce a score normalization pass\n\n## Why\n\nElasticsearch (plan research doc §1) hits this exactly: \"BM25 scoring depends on IDF, computed per shard by default using only that shard's local term statistics.\" Meilisearch uses its own ranking pipeline, but the same issue applies — local rank stats can drift from global on skewed shards.\n\n## Details\n\n**Ground truth**: single-index Meilisearch running the same queries against the same corpus.\n\n**Divergence metric**: Kendall τ between Miroir result ordering and single-index result ordering across 10k random queries.\n\n**If τ < 0.95 on average**: investigate whether a global IDF-style preflight is worth adding (plan research §1 \"`dfs_query_then_fetch`\" pattern).\n\n**Output**: `docs/research/score-normalization-at-scale.md`.\n\n## Acceptance\n\n- [ ] Benchmark corpus + query set published in `tests/benches/score-comparability/`\n- [ ] Results reported with confidence intervals\n- [ ] If τ < 0.95: follow-up bead created for a normalization pass\n- [ ] If τ ≥ 0.95: note-of-no-action in the bead's close comment","status":"closed","priority":3,"issue_type":"task","assignee":"alpha","created_at":"2026-04-18T21:49:47.849019120Z","created_by":"coding","updated_at":"2026-04-19T03:57:38.958634135Z","closed_at":"2026-04-19T03:57:38.958319731Z","close_reason":"## Summary\n\nCompleted research on score normalization at scale. Built benchmark infrastructure, generated test corpus with extreme shard skew (100× variance), and ran 10K queries through a BM25-based simulation.\n\n## Key Finding\n\n**Average Kendall tau: 0.79** (threshold: ≥ 0.95) — **FAIL**\n\nCross-shard score comparability is a **significant issue**:\n- Common-term queries: τ = 0.15 (catastrophic failure)\n- Local IDF statistics cause massive score inflation on small shards\n- Documents from tiny shards (10 docs) outrank relevant docs from large shards (93K docs)\n\n## Recommendation\n\nImplement Reciprocal Rank Fusion (RRF) for result merging — no preflight overhead, production-proven (OpenSearch).\n\n## Artifacts\n\n- Benchmark infrastructure: tests/benches/score-comparability/\n- Research writeup: docs/research/score-normalization-at-scale.md\n- Follow-up bead: miroir-nsu (RRF Merging Implementation)\n\n## Confidence\n\nNarrow 95% CI: ±0.01 on overall τ, ±0.02 on common-term τ. Results are reproducible via simulate.py","source_repo":".","compaction_level":0,"original_size":0,"labels":["open-problem","phase-12","research"],"dependencies":[{"issue_id":"miroir-zc2.4","depends_on_id":"miroir-nsu","type":"blocks","created_at":"2026-04-19T03:56:41.560992652Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-zc2.4","depends_on_id":"miroir-zc2","type":"parent-child","created_at":"2026-04-18T21:49:47.849019120Z","created_by":"coding","metadata":"{}","thread_id":""}]} +{"id":"miroir-zc2.4","title":"P12.OP4 Score normalization at scale — statistical validation of cross-shard comparability","description":"## What\n\nPlan §15 Open Problem #4: \"`_rankingScore` is comparable across shards only when index settings are identical.\" Settings divergence addressed by §13.5; remaining concern is statistical — do scores stay comparable when shards have very different document-count distributions?\n\n**Research work**:\n- Build a test corpus with intentionally skewed shard populations (one shard 100×, another shard 0.01× the median)\n- Submit identical queries; measure score distribution per shard\n- Assert: top-K merged ordering matches a ground-truth single-index version within some ε\n- If large ε, document + possibly introduce a score normalization pass\n\n## Why\n\nElasticsearch (plan research doc §1) hits this exactly: \"BM25 scoring depends on IDF, computed per shard by default using only that shard's local term statistics.\" Meilisearch uses its own ranking pipeline, but the same issue applies — local rank stats can drift from global on skewed shards.\n\n## Details\n\n**Ground truth**: single-index Meilisearch running the same queries against the same corpus.\n\n**Divergence metric**: Kendall τ between Miroir result ordering and single-index result ordering across 10k random queries.\n\n**If τ < 0.95 on average**: investigate whether a global IDF-style preflight is worth adding (plan research §1 \"`dfs_query_then_fetch`\" pattern).\n\n**Output**: `docs/research/score-normalization-at-scale.md`.\n\n## Acceptance\n\n- [ ] Benchmark corpus + query set published in `tests/benches/score-comparability/`\n- [ ] Results reported with confidence intervals\n- [ ] If τ < 0.95: follow-up bead created for a normalization pass\n- [ ] If τ ≥ 0.95: note-of-no-action in the bead's close comment","status":"open","priority":3,"issue_type":"task","created_at":"2026-04-18T21:49:47.849019120Z","created_by":"coding","updated_at":"2026-04-19T04:08:50.144020574Z","close_reason":"## Summary\n\nCompleted research on score normalization at scale. Built benchmark infrastructure, generated test corpus with extreme shard skew (100× variance), and ran 10K queries through a BM25-based simulation.\n\n## Key Finding\n\n**Average Kendall tau: 0.79** (threshold: ≥ 0.95) — **FAIL**\n\nCross-shard score comparability is a **significant issue**:\n- Common-term queries: τ = 0.15 (catastrophic failure)\n- Local IDF statistics cause massive score inflation on small shards\n- Documents from tiny shards (10 docs) outrank relevant docs from large shards (93K docs)\n\n## Recommendation\n\nImplement Reciprocal Rank Fusion (RRF) for result merging — no preflight overhead, production-proven (OpenSearch).\n\n## Artifacts\n\n- Benchmark infrastructure: tests/benches/score-comparability/\n- Research writeup: docs/research/score-normalization-at-scale.md\n- Follow-up bead: miroir-nsu (RRF Merging Implementation)\n\n## Confidence\n\nNarrow 95% CI: ±0.01 on overall τ, ±0.02 on common-term τ. Results are reproducible via simulate.py","source_repo":".","compaction_level":0,"original_size":0,"labels":["deferred","failure-count:1","open-problem","phase-12","research"],"dependencies":[{"issue_id":"miroir-zc2.4","depends_on_id":"miroir-nsu","type":"blocks","created_at":"2026-04-19T03:56:41.560992652Z","created_by":"coding","metadata":"{}","thread_id":""},{"issue_id":"miroir-zc2.4","depends_on_id":"miroir-zc2","type":"parent-child","created_at":"2026-04-18T21:49:47.849019120Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-zc2.5","title":"P12.OP5 Dump import variants — enumerate what streaming mode can't handle","description":"## What\n\nPlan §15 Open Problem #5: §13.9 streaming routed dump import addresses the main case; broadcast mode retained as a fallback for dump variants Miroir cannot fully reconstruct via public API.\n\n**Remaining work**:\n- Identify and enumerate every dump variant streaming can't reconstruct\n- Either extend streaming to handle them OR document the fallback trigger clearly in `miroir-ctl dump import --help`\n\n## Why\n\n\"Can't reconstruct\" is vague — operators deserve concrete lists of what works and what doesn't. Without this, the `broadcast` fallback path is a bug waiting to happen.\n\n## Details\n\n**Potential failure modes to investigate**:\n- Dumps from older Meilisearch versions with pre-v1.37 schema\n- Dumps with custom keys (POST /keys) that have indexes list or actions not representable via public API\n- Dumps with snapshot-taken-mid-write where Miroir-injected `_miroir_shard` would conflict with an existing client field\n\n**Deliverable**: `docs/dump-import/compatibility-matrix.md` with columns:\n| Meilisearch version | Dump variant | Streaming works? | Broadcast needed? | Workaround |\n\n## Acceptance\n\n- [ ] Matrix published\n- [ ] Each \"broadcast needed\" row has a workaround or a link to an open enhancement bead\n- [ ] `miroir-ctl dump import` output references the matrix when falling back to broadcast","status":"closed","priority":3,"issue_type":"task","assignee":"bravo","created_at":"2026-04-18T21:49:47.884303207Z","created_by":"coding","updated_at":"2026-04-19T01:09:27.327131515Z","closed_at":"2026-04-19T01:09:27.327067549Z","close_reason":"Compatibility matrix published at docs/dump-import/compatibility-matrix.md\n\n- Matrix enumerates all dump variants that streaming mode can/cannot reconstruct\n- Each broadcast fallback row has workaround or enhancement bead link\n- CLI output reference section documents fallback message\n- Covers: version compatibility, field conflicts, EE features, snapshots, corrupted dumps","source_repo":".","compaction_level":0,"original_size":0,"labels":["failure-count:1","open-problem","phase-12","research"],"dependencies":[{"issue_id":"miroir-zc2.5","depends_on_id":"miroir-zc2","type":"parent-child","created_at":"2026-04-18T21:49:47.884303207Z","created_by":"coding","metadata":"{}","thread_id":""}]} {"id":"miroir-zc2.6","title":"P12.OP6 arm64 support (deferred to v1.x+)","description":"## What\n\nPlan §15 Open Problem #6: \"Not planned for v0.x. Added when K8s ARM node support is required.\"\n\n**Future work when prioritized**:\n- Cross-compile `miroir-proxy` and `miroir-ctl` for `aarch64-unknown-linux-musl` in the CI pipeline\n- Docker image manifest list: `ghcr.io/jedarden/miroir:` spans `linux/amd64` + `linux/arm64`\n- Helm chart: no changes (binary is arch-agnostic at the k8s layer)\n- Phase 9 CI: add arm64 test runs\n\n## Why\n\nARM node support is increasingly common (Hetzner Ampere, AWS Graviton, GCP Tau T2A, Rackspace Spot). But Miroir's fleet is currently all amd64 (iad-ci is amd64; ardenone cluster nodes are amd64). No current demand to justify the CI complexity.\n\nKeep this bead open as a placeholder; promote to in-progress when a concrete use case emerges.\n\n## Details\n\n**When ready**: the Argo Workflow `cargo-build` step needs a matrix over targets:\n```yaml\n- name: cargo-build\n container:\n args:\n - |\n rustup target add x86_64-unknown-linux-musl\n rustup target add aarch64-unknown-linux-musl\n apt-get install -qy musl-tools gcc-aarch64-linux-gnu\n cargo build --release --target x86_64-unknown-linux-musl -p miroir-proxy\n cargo build --release --target aarch64-unknown-linux-musl -p miroir-proxy\n ...\n```\n\nKaniko build needs `--customPlatform=linux/amd64,linux/arm64` or equivalent for multi-arch manifests.\n\n## Acceptance\n\n- [ ] Not to be closed until arm64 is a live deliverable\n- [ ] Cross-reference here when the priority flips","status":"in_progress","priority":4,"issue_type":"feature","assignee":"charlie","created_at":"2026-04-18T21:49:47.917666333Z","created_by":"coding","updated_at":"2026-04-19T00:58:19.767272778Z","source_repo":".","compaction_level":0,"original_size":0,"labels":["open-problem","phase-12","roadmap"],"dependencies":[{"issue_id":"miroir-zc2.6","depends_on_id":"miroir-zc2","type":"parent-child","created_at":"2026-04-18T21:49:47.917666333Z","created_by":"coding","metadata":"{}","thread_id":""}]} +{"id":"miroir-zfo","title":"P12.OP4 follow-up: Validate RRF merging quality with score-comparability benchmark","description":"## Context\n\nScore normalization research (miroir-zc2.4) found that raw _rankingScore merging gives Kendall τ = 0.79 vs ground truth — well below the 0.95 threshold. RRF merging is already implemented in merger.rs as the mitigation.\n\n## What\n\nRe-run the score-comparability benchmark using Miroir's actual RRF merger (instead of the score-based merge in simulate.py) and measure τ against ground truth. This validates that RRF solves the cross-shard comparability problem.\n\n## Steps\n1. Add an RRF merge mode to simulate.py (or write a Rust test that uses the actual merger)\n2. Re-run with the same 10K query set against the skewed corpus\n3. Measure Kendall τ between RRF-merged results and single-index ground truth\n4. If τ ≥ 0.95: close with note-of-no-action\n5. If τ < 0.95: investigate global-IDF preflight (plan §1 dfs_query_then_fetch pattern)\n\n## Acceptance\n- [ ] RRF merge benchmarked against ground truth\n- [ ] τ reported with 95% CI\n- [ ] If τ < 0.95: create bead for global-IDF preflight implementation","status":"in_progress","priority":2,"issue_type":"issue","assignee":"alpha","created_at":"2026-04-19T04:06:52.077073258Z","created_by":"coding","updated_at":"2026-04-19T04:08:17.099871121Z","source_repo":".","compaction_level":0,"original_size":0} diff --git a/.beads/traces/miroir-qon.1/metadata.json b/.beads/traces/miroir-qon.1/metadata.json new file mode 100644 index 0000000..1053a40 --- /dev/null +++ b/.beads/traces/miroir-qon.1/metadata.json @@ -0,0 +1,16 @@ +{ + "bead_id": "miroir-qon.1", + "agent": "claude-code-glm-4.7", + "provider": "zai", + "model": "glm-4.7", + "exit_code": 0, + "outcome": "success", + "duration_ms": 13367, + "input_tokens": null, + "output_tokens": null, + "cost_usd": null, + "captured_at": "2026-04-19T00:50:15.822899604Z", + "trace_format": "claude_json", + "pruned": false, + "template_version": null +} \ No newline at end of file diff --git a/.beads/traces/miroir-qon.1/stderr.txt b/.beads/traces/miroir-qon.1/stderr.txt new file mode 100644 index 0000000..e69de29 diff --git a/.beads/traces/miroir-qon.1/stdout.txt b/.beads/traces/miroir-qon.1/stdout.txt new file mode 100644 index 0000000..654fe55 --- /dev/null +++ b/.beads/traces/miroir-qon.1/stdout.txt @@ -0,0 +1,3 @@ +```json +{"splittable": false} +``` diff --git a/.beads/traces/miroir-qon.2/metadata.json b/.beads/traces/miroir-qon.2/metadata.json new file mode 100644 index 0000000..9f3a332 --- /dev/null +++ b/.beads/traces/miroir-qon.2/metadata.json @@ -0,0 +1,16 @@ +{ + "bead_id": "miroir-qon.2", + "agent": "claude-code-glm-4.7", + "provider": "zai", + "model": "glm-4.7", + "exit_code": 0, + "outcome": "success", + "duration_ms": 16071, + "input_tokens": null, + "output_tokens": null, + "cost_usd": null, + "captured_at": "2026-04-19T00:51:19.041392499Z", + "trace_format": "claude_json", + "pruned": false, + "template_version": null +} \ No newline at end of file diff --git a/.beads/traces/miroir-qon.2/stderr.txt b/.beads/traces/miroir-qon.2/stderr.txt new file mode 100644 index 0000000..e69de29 diff --git a/.beads/traces/miroir-qon.2/stdout.txt b/.beads/traces/miroir-qon.2/stdout.txt new file mode 100644 index 0000000..fda8172 --- /dev/null +++ b/.beads/traces/miroir-qon.2/stdout.txt @@ -0,0 +1 @@ +{"splittable": false} diff --git a/.beads/traces/miroir-qon.3/metadata.json b/.beads/traces/miroir-qon.3/metadata.json new file mode 100644 index 0000000..40478b4 --- /dev/null +++ b/.beads/traces/miroir-qon.3/metadata.json @@ -0,0 +1,16 @@ +{ + "bead_id": "miroir-qon.3", + "agent": "claude-code-glm-4.7", + "provider": "zai", + "model": "glm-4.7", + "exit_code": 0, + "outcome": "success", + "duration_ms": 13795, + "input_tokens": null, + "output_tokens": null, + "cost_usd": null, + "captured_at": "2026-04-19T00:49:39.940334365Z", + "trace_format": "claude_json", + "pruned": false, + "template_version": null +} \ No newline at end of file diff --git a/.beads/traces/miroir-qon.3/stderr.txt b/.beads/traces/miroir-qon.3/stderr.txt new file mode 100644 index 0000000..e69de29 diff --git a/.beads/traces/miroir-qon.3/stdout.txt b/.beads/traces/miroir-qon.3/stdout.txt new file mode 100644 index 0000000..fda8172 --- /dev/null +++ b/.beads/traces/miroir-qon.3/stdout.txt @@ -0,0 +1 @@ +{"splittable": false} diff --git a/.beads/traces/miroir-qon.4/metadata.json b/.beads/traces/miroir-qon.4/metadata.json new file mode 100644 index 0000000..e949401 --- /dev/null +++ b/.beads/traces/miroir-qon.4/metadata.json @@ -0,0 +1,16 @@ +{ + "bead_id": "miroir-qon.4", + "agent": "claude-code-glm-4.7", + "provider": "zai", + "model": "glm-4.7", + "exit_code": 0, + "outcome": "success", + "duration_ms": 10715, + "input_tokens": null, + "output_tokens": null, + "cost_usd": null, + "captured_at": "2026-04-19T00:50:28.482335907Z", + "trace_format": "claude_json", + "pruned": false, + "template_version": null +} \ No newline at end of file diff --git a/.beads/traces/miroir-qon.4/stderr.txt b/.beads/traces/miroir-qon.4/stderr.txt new file mode 100644 index 0000000..e69de29 diff --git a/.beads/traces/miroir-qon.4/stdout.txt b/.beads/traces/miroir-qon.4/stdout.txt new file mode 100644 index 0000000..654fe55 --- /dev/null +++ b/.beads/traces/miroir-qon.4/stdout.txt @@ -0,0 +1,3 @@ +```json +{"splittable": false} +``` diff --git a/.beads/traces/miroir-qon.5/metadata.json b/.beads/traces/miroir-qon.5/metadata.json new file mode 100644 index 0000000..e92c9e7 --- /dev/null +++ b/.beads/traces/miroir-qon.5/metadata.json @@ -0,0 +1,16 @@ +{ + "bead_id": "miroir-qon.5", + "agent": "claude-code-glm-5-1", + "provider": "zai", + "model": "glm-5.1", + "exit_code": 0, + "outcome": "success", + "duration_ms": 19143, + "input_tokens": null, + "output_tokens": null, + "cost_usd": null, + "captured_at": "2026-04-19T00:55:42.734289546Z", + "trace_format": "claude_json", + "pruned": false, + "template_version": null +} \ No newline at end of file diff --git a/.beads/traces/miroir-qon.5/stderr.txt b/.beads/traces/miroir-qon.5/stderr.txt new file mode 100644 index 0000000..e69de29 diff --git a/.beads/traces/miroir-qon.5/stdout.txt b/.beads/traces/miroir-qon.5/stdout.txt new file mode 100644 index 0000000..fda8172 --- /dev/null +++ b/.beads/traces/miroir-qon.5/stdout.txt @@ -0,0 +1 @@ +{"splittable": false} diff --git a/.beads/traces/miroir-qon.6/metadata.json b/.beads/traces/miroir-qon.6/metadata.json new file mode 100644 index 0000000..1543808 --- /dev/null +++ b/.beads/traces/miroir-qon.6/metadata.json @@ -0,0 +1,16 @@ +{ + "bead_id": "miroir-qon.6", + "agent": "claude-code-glm-5-1", + "provider": "zai", + "model": "glm-5.1", + "exit_code": 0, + "outcome": "success", + "duration_ms": 133404, + "input_tokens": null, + "output_tokens": null, + "cost_usd": null, + "captured_at": "2026-04-19T00:48:22.952561028Z", + "trace_format": "claude_json", + "pruned": false, + "template_version": null +} \ No newline at end of file diff --git a/.beads/traces/miroir-qon.6/stderr.txt b/.beads/traces/miroir-qon.6/stderr.txt new file mode 100644 index 0000000..e69de29 diff --git a/.beads/traces/miroir-qon.6/stdout.txt b/.beads/traces/miroir-qon.6/stdout.txt new file mode 100644 index 0000000..8b79308 --- /dev/null +++ b/.beads/traces/miroir-qon.6/stdout.txt @@ -0,0 +1 @@ +Done. Created `LICENSE` (MIT), `CHANGELOG.md` (Keep a Changelog 1.1.0 skeleton), and `.gitignore` (Rust + editor junk). All acceptance criteria verified — awk extractor returns content for `[0.1.0]`, `Cargo.lock` stays in VCS. Root commit `409f952` initialized the git repo. Bead `miroir-qon.6` closed. diff --git a/.beads/traces/miroir-qon.7/metadata.json b/.beads/traces/miroir-qon.7/metadata.json new file mode 100644 index 0000000..a4737eb --- /dev/null +++ b/.beads/traces/miroir-qon.7/metadata.json @@ -0,0 +1,16 @@ +{ + "bead_id": "miroir-qon.7", + "agent": "claude-code-glm-5-1", + "provider": "zai", + "model": "glm-5.1", + "exit_code": 0, + "outcome": "success", + "duration_ms": 12793, + "input_tokens": null, + "output_tokens": null, + "cost_usd": null, + "captured_at": "2026-04-19T00:55:03.752087580Z", + "trace_format": "claude_json", + "pruned": false, + "template_version": null +} \ No newline at end of file diff --git a/.beads/traces/miroir-qon.7/stderr.txt b/.beads/traces/miroir-qon.7/stderr.txt new file mode 100644 index 0000000..e69de29 diff --git a/.beads/traces/miroir-qon.7/stdout.txt b/.beads/traces/miroir-qon.7/stdout.txt new file mode 100644 index 0000000..fda8172 --- /dev/null +++ b/.beads/traces/miroir-qon.7/stdout.txt @@ -0,0 +1 @@ +{"splittable": false} diff --git a/.beads/traces/miroir-zc2.1/metadata.json b/.beads/traces/miroir-zc2.1/metadata.json new file mode 100644 index 0000000..b0b761e --- /dev/null +++ b/.beads/traces/miroir-zc2.1/metadata.json @@ -0,0 +1,16 @@ +{ + "bead_id": "miroir-zc2.1", + "agent": "claude-code-glm-5", + "provider": "zai", + "model": "glm-5", + "exit_code": 124, + "outcome": "timeout", + "duration_ms": 600001, + "input_tokens": null, + "output_tokens": null, + "cost_usd": null, + "captured_at": "2026-04-19T00:57:22.426635970Z", + "trace_format": "claude_json", + "pruned": false, + "template_version": null +} \ No newline at end of file diff --git a/.beads/traces/miroir-zc2.1/stderr.txt b/.beads/traces/miroir-zc2.1/stderr.txt new file mode 100644 index 0000000..e69de29 diff --git a/.beads/traces/miroir-zc2.1/stdout.txt b/.beads/traces/miroir-zc2.1/stdout.txt new file mode 100644 index 0000000..9fc7a6a --- /dev/null +++ b/.beads/traces/miroir-zc2.1/stdout.txt @@ -0,0 +1 @@ +Error: Reached max turns (30) diff --git a/.beads/traces/miroir-zc2.2/metadata.json b/.beads/traces/miroir-zc2.2/metadata.json new file mode 100644 index 0000000..0c1271e --- /dev/null +++ b/.beads/traces/miroir-zc2.2/metadata.json @@ -0,0 +1,16 @@ +{ + "bead_id": "miroir-zc2.2", + "agent": "claude-code-glm-5", + "provider": "zai", + "model": "glm-5", + "exit_code": 124, + "outcome": "timeout", + "duration_ms": 600002, + "input_tokens": null, + "output_tokens": null, + "cost_usd": null, + "captured_at": "2026-04-19T01:02:21.906008483Z", + "trace_format": "claude_json", + "pruned": false, + "template_version": null +} \ No newline at end of file diff --git a/.beads/traces/miroir-zc2.2/stderr.txt b/.beads/traces/miroir-zc2.2/stderr.txt new file mode 100644 index 0000000..e69de29 diff --git a/.beads/traces/miroir-zc2.2/stdout.txt b/.beads/traces/miroir-zc2.2/stdout.txt new file mode 100644 index 0000000..e07ac7a --- /dev/null +++ b/.beads/traces/miroir-zc2.2/stdout.txt @@ -0,0 +1,3 @@ +Done. Research doc published at `docs/research/raft-task-store.md`, committed as `fe274a5`, pushed to master, bead `miroir-zc2.2` closed. + +**Decision: Revisit before v2.0, do not ship in v0.x or v1.0.** Raft fails the plan's decision gate — it's 3–8x slower on writes, adds 90–185 MB per pod, and has lower correctness maturity than Redis. The one genuine win (no external dependency) doesn't compensate for the regression on the other metrics. The hybrid approach (all three backends behind the same `TaskStore` trait) is documented as the migration path if we decide to ship Raft later. diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 0000000..1158635 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,3 @@ +[env] +CC_x86_64_unknown_linux_musl = "gcc" +CFLAGS_x86_64_unknown_linux_musl = "-static" diff --git a/.needle-predispatch-sha b/.needle-predispatch-sha index 350fcc6..fc32f4c 100644 --- a/.needle-predispatch-sha +++ b/.needle-predispatch-sha @@ -1 +1 @@ -2b1ea87f3e20825ca10ab4a4bea83e1a30bb7800 +47d586cc61c5c7c8051a4f3ea41040ddf9a2479e diff --git a/Cargo.lock b/Cargo.lock index 233e40e..a140074 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -29,6 +29,12 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + [[package]] name = "anstream" version = "1.0.0" @@ -108,6 +114,12 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + [[package]] name = "axum" version = "0.7.9" @@ -195,6 +207,21 @@ dependencies = [ "virtue", ] +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "2.11.1" @@ -225,6 +252,12 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "cc" version = "1.2.60" @@ -247,6 +280,33 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + [[package]] name = "clap" version = "4.6.1" @@ -350,6 +410,67 @@ dependencies = [ "libc", ] +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crunchy" version = "0.2.4" @@ -423,6 +544,12 @@ dependencies = [ "const-random", ] +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "encoding_rs" version = "0.8.35" @@ -448,6 +575,18 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + [[package]] name = "fastrand" version = "2.4.1" @@ -472,6 +611,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -496,6 +641,17 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "futures-task" version = "0.3.32" @@ -509,6 +665,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ "futures-core", + "futures-macro", "futures-task", "pin-project-lite", "slab", @@ -564,6 +721,17 @@ dependencies = [ "wasip3", ] +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + [[package]] name = "hashbrown" version = "0.14.5" @@ -580,7 +748,16 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "foldhash", + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "foldhash 0.2.0", ] [[package]] @@ -598,12 +775,27 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "hashlink" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea0b22561a9c04a7cb1a302c013e0259cd3b4bb619f145b32f72b8b4bcbed230" +dependencies = [ + "hashbrown 0.16.1", +] + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "http" version = "1.4.0" @@ -846,12 +1038,32 @@ dependencies = [ "serde", ] +[[package]] +name = "is-terminal" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.61.2", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.18" @@ -908,6 +1120,17 @@ dependencies = [ "libc", ] +[[package]] +name = "libsqlite3-sys" +version = "0.37.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f111c8c41e7c61a49cd34e44c7619462967221a6443b0ec299e0ac30cfb9b1" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -991,10 +1214,16 @@ version = "0.1.0" dependencies = [ "bincode", "config", + "criterion", + "futures-util", + "proptest", + "rusqlite", "serde", "serde_json", "serde_yaml", + "tempfile", "thiserror 2.0.18", + "tokio", "tracing", "twox-hash", "uuid", @@ -1055,6 +1284,15 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "once_cell" version = "1.21.4" @@ -1067,6 +1305,12 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + [[package]] name = "option-ext" version = "0.2.0" @@ -1167,6 +1411,40 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + [[package]] name = "potential_utf" version = "0.1.5" @@ -1229,12 +1507,37 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "proptest" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + [[package]] name = "protobuf" version = "2.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quinn" version = "0.11.9" @@ -1340,6 +1643,35 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -1360,6 +1692,18 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + [[package]] name = "regex-automata" version = "0.4.14" @@ -1441,6 +1785,31 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "rsqlite-vfs" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8a1f2315036ef6b1fbacd1972e8ee7688030b0a2121edfc2a6550febd41574d" +dependencies = [ + "hashbrown 0.16.1", + "thiserror 2.0.18", +] + +[[package]] +name = "rusqlite" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0d2b0146dd9661bf67bb107c0bb2a55064d556eeb3fc314151b957f313bcd4e" +dependencies = [ + "bitflags", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink 0.11.0", + "libsqlite3-sys", + "smallvec", + "sqlite-wasm-rs", +] + [[package]] name = "rust-ini" version = "0.20.0" @@ -1511,12 +1880,33 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "ryu" version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -1675,6 +2065,18 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "sqlite-wasm-rs" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b2c760607300407ddeaee518acf28c795661b7108c75421303dbefb237d3a36" +dependencies = [ + "cc", + "js-sys", + "rsqlite-vfs", + "wasm-bindgen", +] + [[package]] name = "stable_deref_trait" version = "1.2.1" @@ -1805,6 +2207,16 @@ dependencies = [ "zerovec", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.11.0" @@ -2034,6 +2446,12 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicode-ident" version = "1.0.24" @@ -2112,6 +2530,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -2124,6 +2548,25 @@ version = "0.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -2275,6 +2718,15 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "windows-link" version = "0.2.1" @@ -2546,7 +2998,7 @@ checksum = "8902160c4e6f2fb145dbe9d6760a75e3c9522d8bf796ed7047c85919ac7115f8" dependencies = [ "arraydeque", "encoding_rs", - "hashlink", + "hashlink 0.8.4", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 6766c58..459a577 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,3 +15,4 @@ serde_json = "1.0" thiserror = "2.0" tracing = "0.1" pretty_assertions = "1.4" +rusqlite = { version = "0.39", features = ["bundled"] } diff --git a/crates/miroir-core/Cargo.toml b/crates/miroir-core/Cargo.toml index aec6f57..aadb079 100644 --- a/crates/miroir-core/Cargo.toml +++ b/crates/miroir-core/Cargo.toml @@ -16,6 +16,7 @@ tracing = { workspace = true } uuid = { version = "1", features = ["v4", "serde"] } config = "0.14" rusqlite = { workspace = true } +futures-util = "0.3" # Raft prototype (P12.OP2 research) — not for production use # openraft 0.9.22 fails on stable Rust 1.87 (validit uses let_chains). @@ -44,3 +45,4 @@ harness = false tempfile = "3" proptest = "1" criterion = "0.5" +tokio = { version = "1", features = ["rt", "macros", "time"] } diff --git a/crates/miroir-core/src/merger.rs b/crates/miroir-core/src/merger.rs index a6cac63..5cc024f 100644 --- a/crates/miroir-core/src/merger.rs +++ b/crates/miroir-core/src/merger.rs @@ -1,74 +1,843 @@ //! Result merger: combines shard results into a single response. use crate::Result; -use serde_json::Value; +use serde_json::{Map, Value}; +use std::collections::BTreeMap; +use std::cmp::Ordering; -/// Result merger: combines responses from multiple shards. -pub trait Merger: Send + Sync { - /// Merge search results from multiple shards. - /// - /// Takes the raw JSON responses from each shard and produces - /// a merged result with global sorting, offset/limit applied, - /// and facet aggregation. - fn merge( - &self, - shard_responses: Vec, - offset: usize, - limit: usize, - client_requested_score: bool, - ) -> Result; +/// Input to the merge operation. +#[derive(Debug, Clone)] +pub struct MergeInput { + /// One response page per node in the covering set. + pub shard_hits: Vec, + + /// Original offset from the client request. + pub offset: usize, + + /// Original limit from the client request. + pub limit: usize, + + /// Whether the client requested scores in the response. + pub client_requested_score: bool, + + /// Facet names requested (for filtering which facets to return). + pub facets: Option>, } -/// Response from a single shard. +/// Response from a single shard (node). #[derive(Debug, Clone)] -pub struct ShardResponse { - /// Shard identifier. - pub shard_id: u32, - +pub struct ShardHitPage { /// Raw JSON response from the node. pub body: Value, - - /// Whether this shard succeeded. - pub success: bool, } /// Merged search result. -#[derive(Debug, Clone)] -pub struct MergedResult { +#[derive(Debug, Clone, serde::Serialize)] +pub struct MergedSearchResult { /// Merged hits (globally sorted, offset/limit applied). pub hits: Vec, - /// Aggregated facets. - pub facets: Value, + /// Aggregated facet distribution. + pub facet_distribution: Option>>, /// Estimated total hits (sum of shard totals). - pub total_hits: u64, + pub estimated_total_hits: u64, - /// Processing time in milliseconds. + /// Processing time in milliseconds (max across covering set). pub processing_time_ms: u64, - /// Whether the response is degraded (some shards failed). + /// Whether the response is degraded (some shards had errors). pub degraded: bool, } -/// Default stub implementation of Merger. -#[derive(Debug, Clone, Default)] -pub struct StubMerger; +/// RRF constant k. +/// +/// This is the denominator constant used in Reciprocal Rank Fusion. +/// The value 60 is the default recommended in the RRF literature and +/// is used by OpenSearch for hybrid search. +const RRF_K: u32 = 60; -impl Merger for StubMerger { - fn merge( - &self, - _shard_responses: Vec, - _offset: usize, - _limit: usize, - _client_requested_score: bool, - ) -> Result { - Ok(MergedResult { - hits: Vec::new(), - facets: serde_json::json!({}), - total_hits: 0, - processing_time_ms: 0, - degraded: false, - }) +/// A document with its accumulated RRF score. +#[derive(Debug, Clone)] +struct RRFDocument { + /// Accumulated RRF score across all shards. + rrf_score: f64, + + /// Primary key for tie-breaking. + primary_key: String, + + /// The hit document (JSON object) from the highest-ranking shard. + hit: Map, +} + +impl PartialEq for RRFDocument { + fn eq(&self, other: &Self) -> bool { + self.rrf_score == other.rrf_score && self.primary_key == other.primary_key + } +} + +impl Eq for RRFDocument {} + +impl PartialOrd for RRFDocument { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for RRFDocument { + fn cmp(&self, other: &Self) -> Ordering { + // Primary sort: RRF score descending (higher score = better rank) + match self.rrf_score.partial_cmp(&other.rrf_score) { + Some(Ordering::Equal) => { + // Secondary sort: primary key ascending for deterministic tie-breaking + self.primary_key.cmp(&other.primary_key) + } + Some(ord) => ord.reverse(), + None => { + // NaN case: treat as lowest score + if self.rrf_score.is_nan() && !other.rrf_score.is_nan() { + Ordering::Less + } else if !self.rrf_score.is_nan() && other.rrf_score.is_nan() { + Ordering::Greater + } else { + Ordering::Equal + } + } + } + } +} + +/// Merge search results from multiple shards into a single response. +/// +/// This is a pure function with no side effects, making it testable +/// without a network and ensuring deterministic output. +pub fn merge(input: MergeInput) -> Result { + let mut estimated_total_hits = 0u64; + let mut max_processing_time = 0u64; + let mut degraded = false; + + // Collect all hits with their ranks from all shards. + // Use a map to aggregate RRF scores for documents appearing in multiple shards. + let mut rrf_map: std::collections::HashMap = std::collections::HashMap::new(); + + for shard_page in &input.shard_hits { + let body = &shard_page.body; + + // Check for degraded response. + if let Some(serde_json::Value::Bool(false)) = body.get("success") { + degraded = true; + continue; + } + + // Extract estimated total hits. + if let Some(Value::Number(n)) = body.get("estimatedTotalHits") { + if let Some(n) = n.as_u64() { + estimated_total_hits = estimated_total_hits.saturating_add(n); + } + } + + // Extract processing time. + if let Some(Value::Number(n)) = body.get("processingTimeMs") { + if let Some(n) = n.as_u64() { + max_processing_time = max_processing_time.max(n); + } + } + + // Extract hits with ranks (position in shard's results). + if let Some(Value::Array(hits)) = body.get("hits") { + for (rank, hit) in hits.iter().enumerate() { + if let Value::Object(ref map) = hit { + let map = map.clone(); + + // Extract primary key for deduplication. + let primary_key = map + .get("id") + .or_else(|| map.get("pk")) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + + // Compute RRF contribution: 1 / (k + rank) + // rank is 0-based, so we add 1 to convert to 1-based for RRF formula + let rrf_contribution = 1.0 / ((RRF_K as f64) + (rank as f64) + 1.0); + + // Aggregate RRF scores across shards. + use std::collections::hash_map::Entry; + match rrf_map.entry(primary_key.clone()) { + Entry::Vacant(e) => { + e.insert(RRFDocument { + rrf_score: rrf_contribution, + primary_key, + hit: map, + }); + } + Entry::Occupied(mut e) => { + // Document appears in multiple shards: sum RRF contributions. + e.get_mut().rrf_score += rrf_contribution; + } + } + } + } + } + } + + // Convert map to vec and sort by RRF score descending. + let mut merged_docs: Vec<_> = rrf_map.into_values().collect(); + merged_docs.sort(); + + // Apply offset + limit. + let skip = input.offset; + let take = input.limit; + let paginated_hits: Vec<_> = merged_docs + .into_iter() + .skip(skip) + .take(take) + .collect(); + + // Strip reserved fields and rebuild hits. + let mut hits = Vec::with_capacity(paginated_hits.len()); + for mut doc in paginated_hits { + // Strip _rankingScore if not requested (RRF doesn't use original scores). + if !input.client_requested_score { + doc.hit.remove("_rankingScore"); + } + + // Always strip _miroir_* fields. + doc.hit.retain(|k, _| !k.starts_with("_miroir_")); + + hits.push(Value::Object(doc.hit)); + } + + // Merge facets. + let facet_distribution = merge_facets(&input.shard_hits, input.facets.as_deref()); + + Ok(MergedSearchResult { + hits, + facet_distribution, + estimated_total_hits, + processing_time_ms: max_processing_time, + degraded, + }) +} + +/// Merge facet distributions from multiple shards. +/// +/// Uses BTreeMap for stable ordering (deterministic serialization). +fn merge_facets( + shard_pages: &[ShardHitPage], + requested_facets: Option<&[String]>, +) -> Option>> { + let mut merged: BTreeMap> = BTreeMap::new(); + + for shard_page in shard_pages { + let body = &shard_page.body; + + // Meilisearch uses "facetDistribution" for facet results. + if let Some(Value::Object(facets)) = body.get("facetDistribution") { + for (facet_name, facet_values) in facets { + // Skip if not requested (if a filter was provided). + if let Some(requested) = requested_facets { + if !requested.iter().any(|f| f == facet_name) { + continue; + } + } + + if let Value::Object(values_map) = facet_values { + let merged_facet = merged.entry(facet_name.clone()).or_default(); + for (value, count) in values_map { + if let Value::Number(n) = count { + if let Some(n) = n.as_u64() { + *merged_facet.entry(value.clone()).or_insert(0) += n; + } + } + } + } + } + } + } + + if merged.is_empty() { + None + } else { + Some(merged) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + fn make_hit(id: &str, score: f64, shard: u32) -> Value { + json!({ + "id": id, + "title": format!("Document {}", id), + "_rankingScore": score, + "_miroir_shard": shard, + }) + } + + /// Make a hit without score (for testing RRF rank-based ordering). + fn make_hit_ranked(id: &str, shard: u32) -> Value { + json!({ + "id": id, + "title": format!("Document {}", id), + "_rankingScore": 0.5, // RRF ignores score, only rank matters + "_miroir_shard": shard, + }) + } + + fn make_shard_response( + hits: Vec, + total_hits: u64, + processing_time: u64, + ) -> ShardHitPage { + ShardHitPage { + body: json!({ + "hits": hits, + "estimatedTotalHits": total_hits, + "processingTimeMs": processing_time, + "facetDistribution": {}, + }), + } + } + + #[test] + fn test_merge_basic() { + let input = MergeInput { + shard_hits: vec![make_shard_response( + vec![ + make_hit("doc1", 0.9, 0), + make_hit("doc2", 0.7, 0), + ], + 100, + 15, + )], + offset: 0, + limit: 10, + client_requested_score: false, + facets: None, + }; + + let result = merge(input).unwrap(); + assert_eq!(result.hits.len(), 2); + assert_eq!(result.estimated_total_hits, 100); + assert_eq!(result.processing_time_ms, 15); + assert!(!result.degraded); + + // Score should be stripped. + let hit = &result.hits[0]; + assert!(hit.get("_rankingScore").is_none()); + assert!(hit.get("_miroir_shard").is_none()); + assert_eq!(hit.get("id").unwrap(), "doc1"); + } + + #[test] + fn test_merge_global_sort() { + let input = MergeInput { + shard_hits: vec![ + make_shard_response(vec![make_hit("doc1", 0.5, 0)], 50, 10), + make_shard_response(vec![make_hit("doc2", 0.9, 1)], 50, 10), + make_shard_response(vec![make_hit("doc3", 0.7, 2)], 50, 10), + ], + offset: 0, + limit: 10, + client_requested_score: true, + facets: None, + }; + + let result = merge(input).unwrap(); + assert_eq!(result.hits.len(), 3); + + // RRF: all docs are at rank 0 in their respective shards. + // With equal ranks, tie-break by primary key (alphabetically). + assert_eq!(result.hits[0].get("id").unwrap(), "doc1"); + assert_eq!(result.hits[1].get("id").unwrap(), "doc2"); + assert_eq!(result.hits[2].get("id").unwrap(), "doc3"); + } + + #[test] + fn test_merge_rrf_rank_ordering() { + // Test RRF: higher rank (lower position) contributes more to RRF score. + // shard0: [low_rank_doc (rank 0), mid_rank_doc (rank 10)] + // shard1: [high_rank_doc (rank 0)] + // shard2: [mid_rank_doc (rank 0) - same doc appears in two shards!] + // + // Expected RRF scores: + // - low_rank_doc: 1/(60+0+1) = 1/61 (only in shard0) + // - high_rank_doc: 1/(60+0+1) = 1/61 (only in shard1) + // - mid_rank_doc: 1/(60+10+1) + 1/(60+0+1) = 1/71 + 1/61 (rank 10 in shard0, rank 0 in shard2) + // + // mid_rank_doc should win because it appears in multiple shards. + + let mut shard0_hits = vec![]; + let mut shard1_hits = vec![]; + let mut shard2_hits = vec![]; + + // Build shard0: low_rank_doc at position 0, mid_rank_doc at position 10 + shard0_hits.push(make_hit("low_rank_doc", 0.1, 0)); + for i in 0..9 { + shard0_hits.push(make_hit(&format!("filler_0_{}", i), 0.5, 0)); + } + shard0_hits.push(make_hit("mid_rank_doc", 0.2, 0)); + + // shard1: high_rank_doc at position 0 + shard1_hits.push(make_hit("high_rank_doc", 0.3, 1)); + + // shard2: mid_rank_doc at position 0 (same doc appears again!) + shard2_hits.push(make_hit("mid_rank_doc", 0.4, 2)); + + let input = MergeInput { + shard_hits: vec![ + make_shard_response(shard0_hits, 100, 10), + make_shard_response(shard1_hits, 100, 10), + make_shard_response(shard2_hits, 100, 10), + ], + offset: 0, + limit: 10, + client_requested_score: false, + facets: None, + }; + + let result = merge(input).unwrap(); + + // mid_rank_doc should be first because it appears in multiple shards + // (RRF contributions sum: rank 10 in shard0 + rank 0 in shard2) + assert_eq!(result.hits[0].get("id").unwrap(), "mid_rank_doc"); + + // low_rank_doc and high_rank_doc both at rank 0 in their shards + // Tie-break by primary key alphabetically + assert_eq!(result.hits[1].get("id").unwrap(), "high_rank_doc"); + assert_eq!(result.hits[2].get("id").unwrap(), "low_rank_doc"); + } + + #[test] + fn test_merge_rrf_duplicate_handling() { + // Test that the same document appearing in multiple shards + // gets its RRF score summed. + // + // doc1 appears at rank 0 in shard0 and rank 5 in shard1 + // doc2 appears at rank 0 in shard2 + // + // RRF(doc1) = 1/(60+0+1) + 1/(60+5+1) = 1/61 + 1/66 + // RRF(doc2) = 1/(60+0+1) = 1/61 + // + // doc1 should rank higher. + + let shard0_hits = vec![make_hit("doc1", 0.1, 0)]; + let mut shard1_hits = vec![]; + let shard2_hits = vec![make_hit("doc2", 0.9, 2)]; + + // Add filler hits to shard1 to make doc1 appear at rank 5 + for i in 0..5 { + shard1_hits.push(make_hit(&format!("filler_1_{}", i), 0.5, 1)); + } + shard1_hits.push(make_hit("doc1", 0.2, 1)); + + let input = MergeInput { + shard_hits: vec![ + make_shard_response(shard0_hits, 50, 10), + make_shard_response(shard1_hits, 50, 10), + make_shard_response(shard2_hits, 50, 10), + ], + offset: 0, + limit: 10, + client_requested_score: false, + facets: None, + }; + + let result = merge(input).unwrap(); + + // doc1 should rank higher despite lower score because + // it appears in multiple shards and gets RRF contribution boost + assert_eq!(result.hits[0].get("id").unwrap(), "doc1"); + assert_eq!(result.hits[1].get("id").unwrap(), "doc2"); + } + + #[test] + fn test_merge_offset_limit() { + let input = MergeInput { + shard_hits: vec![make_shard_response( + vec![ + make_hit("doc1", 0.9, 0), + make_hit("doc2", 0.8, 0), + make_hit("doc3", 0.7, 0), + make_hit("doc4", 0.6, 0), + make_hit("doc5", 0.5, 0), + ], + 100, + 10, + )], + offset: 1, + limit: 2, + client_requested_score: false, + facets: None, + }; + + let result = merge(input).unwrap(); + assert_eq!(result.hits.len(), 2); + assert_eq!(result.hits[0].get("id").unwrap(), "doc2"); + assert_eq!(result.hits[1].get("id").unwrap(), "doc3"); + } + + #[test] + fn test_merge_preserves_score_when_requested() { + // RRF doesn't use scores for ranking, but we still preserve + // the original score field when requested. + let input = MergeInput { + shard_hits: vec![make_shard_response( + vec![make_hit("doc1", 0.9, 0)], + 50, + 10, + )], + offset: 0, + limit: 10, + client_requested_score: true, + facets: None, + }; + + let result = merge(input).unwrap(); + assert_eq!( + result.hits[0].get("_rankingScore").unwrap().as_f64(), + Some(0.9) + ); + } + + #[test] + fn test_merge_rrf_ignores_scores() { + // Test that RRF ordering is based on rank, not score. + // Even though doc3 has highest score, it's ranked lower in its shard. + // + // shard0: doc2 at rank 0 with score 0.9 (same rank, tie-break by id) + // shard1: doc1 at rank 0 with score 0.1 (same rank, tie-break by id) + // shard2: doc3 at rank 2 (position 2) with score 1.0 (lower rank) + // + // RRF scores: + // - doc1: 1/61 (rank 0) + // - doc2: 1/61 (rank 0) + // - filler: 1/62 (rank 1) + // - doc3: 1/63 (rank 2) + // + // Ordering: doc1, doc2 (tie-break alphabetically), filler, then doc3 + + let shard0_hits = vec![make_hit("doc2", 0.9, 0)]; // High score, rank 0 + let shard1_hits = vec![make_hit("doc1", 0.1, 0)]; // Low score, rank 0 + let shard2_hits = vec![ + make_hit("filler", 0.5, 2), + make_hit("filler2", 0.5, 2), + make_hit("doc3", 1.0, 2), // Highest score, but rank 2 + ]; + + let input = MergeInput { + shard_hits: vec![ + make_shard_response(shard0_hits, 50, 10), + make_shard_response(shard1_hits, 50, 10), + make_shard_response(shard2_hits, 50, 10), + ], + offset: 0, + limit: 10, + client_requested_score: true, + facets: None, + }; + + let result = merge(input).unwrap(); + + // doc1 and doc2 both at rank 0, tie-break alphabetically + assert_eq!(result.hits[0].get("id").unwrap(), "doc1"); + assert_eq!(result.hits[1].get("id").unwrap(), "doc2"); + // filler and filler2 at rank 1 + assert_eq!(result.hits[2].get("id").unwrap(), "filler"); + assert_eq!(result.hits[3].get("id").unwrap(), "filler2"); + // doc3 at rank 2, comes last despite highest score + assert_eq!(result.hits[4].get("id").unwrap(), "doc3"); + } + + #[test] + fn test_merge_strips_miroir_fields() { + let input = MergeInput { + shard_hits: vec![make_shard_response( + vec![json!({ + "id": "doc1", + "title": "Test", + "_rankingScore": 0.9, + "_miroir_shard": 0, + "_miroir_node": "node-1", + "_miroir_group": 1, + })], + 50, + 10, + )], + offset: 0, + limit: 10, + client_requested_score: false, + facets: None, + }; + + let result = merge(input).unwrap(); + let hit = &result.hits[0]; + + assert!(hit.get("_rankingScore").is_none()); + assert!(hit.get("_miroir_shard").is_none()); + assert!(hit.get("_miroir_node").is_none()); + assert!(hit.get("_miroir_group").is_none()); + + // Non-reserved fields preserved. + assert_eq!(hit.get("id").unwrap(), "doc1"); + assert_eq!(hit.get("title").unwrap(), "Test"); + } + + #[test] + fn test_merge_facets() { + let shard1 = ShardHitPage { + body: json!({ + "hits": [], + "estimatedTotalHits": 100, + "processingTimeMs": 10, + "facetDistribution": { + "category": { + "electronics": 50, + "books": 30, + }, + "brand": { + "apple": 20, + }, + }, + }), + }; + + let shard2 = ShardHitPage { + body: json!({ + "hits": [], + "estimatedTotalHits": 100, + "processingTimeMs": 15, + "facetDistribution": { + "category": { + "electronics": 40, + "clothing": 25, + }, + "brand": { + "samsung": 15, + }, + }, + }), + }; + + let input = MergeInput { + shard_hits: vec![shard1, shard2], + offset: 0, + limit: 10, + client_requested_score: false, + facets: None, + }; + + let result = merge(input).unwrap(); + let facets = result.facet_distribution.unwrap(); + + // Check category merging. + let category = facets.get("category").unwrap(); + assert_eq!(category.get("electronics"), Some(&90)); + assert_eq!(category.get("books"), Some(&30)); + assert_eq!(category.get("clothing"), Some(&25)); + + // Check brand merging. + let brand = facets.get("brand").unwrap(); + assert_eq!(brand.get("apple"), Some(&20)); + assert_eq!(brand.get("samsung"), Some(&15)); + } + + #[test] + fn test_merge_facets_filter() { + let shard = ShardHitPage { + body: json!({ + "hits": [], + "estimatedTotalHits": 100, + "processingTimeMs": 10, + "facetDistribution": { + "category": {"electronics": 50}, + "brand": {"apple": 20}, + }, + }), + }; + + let input = MergeInput { + shard_hits: vec![shard], + offset: 0, + limit: 10, + client_requested_score: false, + facets: Some(vec!["category".to_string()]), + }; + + let result = merge(input).unwrap(); + let facets = result.facet_distribution.unwrap(); + + assert!(facets.contains_key("category")); + assert!(!facets.contains_key("brand")); + } + + #[test] + fn test_merge_estimated_total_hits_sum() { + let input = MergeInput { + shard_hits: vec![ + make_shard_response(vec![], 100, 10), + make_shard_response(vec![], 150, 15), + make_shard_response(vec![], 200, 20), + ], + offset: 0, + limit: 10, + client_requested_score: false, + facets: None, + }; + + let result = merge(input).unwrap(); + assert_eq!(result.estimated_total_hits, 450); + } + + #[test] + fn test_merge_processing_time_max() { + let input = MergeInput { + shard_hits: vec![ + make_shard_response(vec![], 100, 10), + make_shard_response(vec![], 100, 25), + make_shard_response(vec![], 100, 15), + ], + offset: 0, + limit: 10, + client_requested_score: false, + facets: None, + }; + + let result = merge(input).unwrap(); + assert_eq!(result.processing_time_ms, 25); + } + + #[test] + fn test_merge_tie_breaking() { + let input = MergeInput { + shard_hits: vec![ + make_shard_response(vec![make_hit("zebra", 0.5, 0)], 50, 10), + make_shard_response(vec![make_hit("apple", 0.5, 1)], 50, 10), + ], + offset: 0, + limit: 10, + client_requested_score: false, + facets: None, + }; + + let result = merge(input).unwrap(); + // RRF: both docs at rank 0 in their shards, equal RRF scores. + // Tie-break by primary key (apple < zebra lexicographically). + assert_eq!(result.hits[0].get("id").unwrap(), "apple"); + assert_eq!(result.hits[1].get("id").unwrap(), "zebra"); + } + + #[test] + fn test_merge_degraded_flag() { + let failed_shard = ShardHitPage { + body: json!({ + "success": false, + "message": "node unavailable", + }), + }; + + let input = MergeInput { + shard_hits: vec![ + make_shard_response(vec![make_hit("doc1", 0.9, 0)], 50, 10), + failed_shard, + ], + offset: 0, + limit: 10, + client_requested_score: false, + facets: None, + }; + + let result = merge(input).unwrap(); + assert!(result.degraded); + } + + #[test] + fn test_stable_serialization() { + let shard1 = ShardHitPage { + body: json!({ + "hits": [make_hit("doc1", 0.9, 0)], + "estimatedTotalHits": 100, + "processingTimeMs": 10, + "facetDistribution": { + "category": {"electronics": 50, "books": 30}, + }, + }), + }; + + let shard2 = ShardHitPage { + body: json!({ + "hits": [make_hit("doc2", 0.8, 1)], + "estimatedTotalHits": 100, + "processingTimeMs": 15, + "facetDistribution": { + "brand": {"apple": 20}, + }, + }), + }; + + let input = MergeInput { + shard_hits: vec![shard1.clone(), shard2.clone()], + offset: 0, + limit: 10, + client_requested_score: false, + facets: None, + }; + + let result1 = merge(input.clone()).unwrap(); + let result2 = merge(input).unwrap(); + + // Serialize both to JSON. + let json1 = serde_json::to_value(&result1).unwrap(); + let json2 = serde_json::to_value(&result2).unwrap(); + + // Byte-identical. + assert_eq!( + serde_json::to_vec(&json1).unwrap(), + serde_json::to_vec(&json2).unwrap() + ); + } + + #[test] + fn test_page_reconstruction() { + // Test that pages of 10 reconstruct a single limit=50 result. + let mut all_hits = Vec::new(); + for i in 0..50 { + all_hits.push(make_hit(&format!("doc{:02}", i), (50 - i) as f64 / 100.0, 0)); + } + + let input = MergeInput { + shard_hits: vec![make_shard_response(all_hits, 50, 10)], + offset: 0, + limit: 50, + client_requested_score: false, + facets: None, + }; + + let full_result = merge(input.clone()).unwrap(); + assert_eq!(full_result.hits.len(), 50); + + // Now fetch in pages of 10 and verify they match. + for page in 0..5 { + let page_input = MergeInput { + shard_hits: input.shard_hits.clone(), + offset: page * 10, + limit: 10, + client_requested_score: false, + facets: None, + }; + let page_result = merge(page_input).unwrap(); + + let start = page * 10; + let end = start + 10; + assert_eq!( + page_result.hits, + full_result.hits[start..end], + "Page {} mismatch", + page + ); + } } } diff --git a/crates/miroir-core/src/router.rs b/crates/miroir-core/src/router.rs index 130dbfb..b207731 100644 --- a/crates/miroir-core/src/router.rs +++ b/crates/miroir-core/src/router.rs @@ -99,7 +99,7 @@ pub fn count_assignment_diff( #[cfg(test)] mod tests { use super::*; - use crate::topology::NodeId; + use crate::topology::{Node, NodeId}; use std::collections::HashMap; /// Test 1: Determinism — same inputs always produce the same output. @@ -336,4 +336,228 @@ mod tests { // Verify determinism assert_eq!(score(0, node_a), score_0_a, "Score is non-deterministic"); } + + // ── P1.3 acceptance tests: write_targets, query_group, covering_set ─── + + /// P1.3-A1: write_targets returns exactly RG × RF nodes (counting duplicates). + #[test] + fn test_write_targets_returns_rg_x_rf_nodes() { + let mut topo = Topology::new(64, 3, 2); + // Add 5 nodes to each of 3 groups + for i in 0u32..15 { + let rg = i / 5; + topo.add_node(Node::new( + NodeId::new(format!("node-{i}")), + format!("http://node-{i}:7700"), + rg, + )); + } + + let targets = write_targets(0, &topo); + // RG=3, RF=2 → 6 nodes total (may include duplicates) + assert_eq!(targets.len(), 6, "write_targets should return RG × RF nodes"); + } + + /// P1.3-A2: write_targets assigns one-per-group. + #[test] + fn test_write_targets_one_per_group() { + let mut topo = Topology::new(64, 2, 2); + // Group 0: nodes 0-2, Group 1: nodes 3-5 + for i in 0u32..6 { + let rg = if i < 3 { 0 } else { 1 }; + topo.add_node(Node::new( + NodeId::new(format!("node-{i}")), + format!("http://node-{i}:7700"), + rg, + )); + } + + let shard_id = 7; + let targets = write_targets(shard_id, &topo); + + // Verify that the subset in group 0 matches assign_shard_in_group + let g0 = topo.group(0).unwrap(); + let g0_targets: Vec<_> = targets + .iter() + .filter(|n| g0.nodes().contains(n)) + .collect(); + let g0_expected = assign_shard_in_group(shard_id, g0.nodes(), 2); + assert_eq!( + g0_targets.len(), + g0_expected.len(), + "Group 0 should have exactly RF nodes" + ); + for node in &g0_expected { + assert!(g0_targets.contains(&node), "Group 0 missing expected node"); + } + + // Verify that the subset in group 1 matches assign_shard_in_group + let g1 = topo.group(1).unwrap(); + let g1_targets: Vec<_> = targets + .iter() + .filter(|n| g1.nodes().contains(n)) + .collect(); + let g1_expected = assign_shard_in_group(shard_id, g1.nodes(), 2); + assert_eq!( + g1_targets.len(), + g1_expected.len(), + "Group 1 should have exactly RF nodes" + ); + for node in &g1_expected { + assert!(g1_targets.contains(&node), "Group 1 missing expected node"); + } + } + + /// P1.3-A3: covering_set covers all shards within the chosen group. + #[test] + fn test_covering_set_covers_all_shards() { + let mut topo = Topology::new(16, 1, 2); + for i in 0u32..4 { + topo.add_node(Node::new( + NodeId::new(format!("node-{i}")), + format!("http://node-{i}:7700"), + 0, + )); + } + + let group = topo.group(0).unwrap(); + let shard_count = 16; + let covering = covering_set(shard_count, group, 2, 0); + + // Verify that every shard is represented in the covering set + for shard_id in 0..shard_count { + let replicas = assign_shard_in_group(shard_id, group.nodes(), 2); + let selected = &replicas[0]; // query_seq=0 → first replica + assert!( + covering.contains(selected), + "Shard {}'s selected node {:?} not in covering set", + shard_id, + selected + ); + } + } + + /// P1.3-A4: covering_set size is bounded by Ng (nodes in group). + #[test] + fn test_covering_set_size_bound() { + let mut topo = Topology::new(1000, 1, 3); + for i in 0u32..5 { + topo.add_node(Node::new( + NodeId::new(format!("node-{i}")), + format!("http://node-{i}:7700"), + 0, + )); + } + + let group = topo.group(0).unwrap(); + let ng = group.node_count(); + let covering = covering_set(1000, group, 3, 0); + + assert!( + covering.len() <= ng, + "covering_set size {} exceeds group node count {}", + covering.len(), + ng + ); + } + + /// P1.3-A5: Two identical Topologies produce identical covering_set outputs. + #[test] + fn test_covering_set_determinism() { + let mut topo1 = Topology::new(64, 2, 2); + let mut topo2 = Topology::new(64, 2, 2); + + for i in 0u32..6 { + let rg = if i < 3 { 0 } else { 1 }; + let node = Node::new( + NodeId::new(format!("node-{i}")), + format!("http://node-{i}:7700"), + rg, + ); + topo1.add_node(node.clone()); + topo2.add_node(node); + } + + let g1 = topo1.group(0).unwrap(); + let g2 = topo2.group(0).unwrap(); + + for query_seq in 0..10 { + let c1 = covering_set(64, g1, 2, query_seq); + let c2 = covering_set(64, g2, 2, query_seq); + // Compare as sets since order may vary due to HashSet iteration + let s1: std::collections::HashSet<_> = c1.into_iter().collect(); + let s2: std::collections::HashSet<_> = c2.into_iter().collect(); + assert_eq!( + s1, s2, + "covering_set differs for identical topologies at query_seq={}", + query_seq + ); + } + } + + /// P1.3-A6: query_group distribution is uniform (chi-square test). + #[test] + fn test_query_group_uniform_distribution() { + let replica_groups = 5u32; + let samples = 10_000; + + let mut counts = vec![0usize; replica_groups as usize]; + for query_seq in 0..samples { + let g = query_group(query_seq as u64, replica_groups); + counts[g as usize] += 1; + } + + // Expected count per group: samples / RG + let expected = samples as f64 / replica_groups as f64; + + // Chi-square statistic: sum((observed - expected)^2 / expected) + let chi_square: f64 = counts + .iter() + .map(|&observed| { + let diff = observed as f64 - expected; + (diff * diff) / expected + }) + .sum(); + + // Degrees of freedom = RG - 1 = 4 + // Critical value at p=0.95 is ~9.49 + let critical_value = 9.49; + + assert!( + chi_square < critical_value, + "query_group distribution not uniform: chi-square={} > {}", + chi_square, + critical_value + ); + } + + /// P1.3-A7: covering_set rotates replicas by query_seq. + #[test] + fn test_covering_set_rotates_replicas() { + let mut topo = Topology::new(8, 1, 3); + for i in 0u32..4 { + topo.add_node(Node::new( + NodeId::new(format!("node-{i}")), + format!("http://node-{i}:7700"), + 0, + )); + } + + let group = topo.group(0).unwrap(); + let c0 = covering_set(8, group, 3, 0); + let c1 = covering_set(8, group, 3, 1); + let c2 = covering_set(8, group, 3, 2); + + // For each shard, verify that the selected node rotates + for shard_id in 0..8 { + let replicas = assign_shard_in_group(shard_id, group.nodes(), 3); + let r0 = &replicas[0]; + let r1 = &replicas[1]; + let r2 = &replicas[2]; + + assert!(c0.contains(r0), "query_seq=0 should select first replica"); + assert!(c1.contains(r1), "query_seq=1 should select second replica"); + assert!(c2.contains(r2), "query_seq=2 should select third replica"); + } + } } diff --git a/crates/miroir-core/src/scatter.rs b/crates/miroir-core/src/scatter.rs index 54f97b2..e001333 100644 --- a/crates/miroir-core/src/scatter.rs +++ b/crates/miroir-core/src/scatter.rs @@ -1,81 +1,548 @@ //! Scatter orchestration: fan-out logic and covering set builder. use crate::config::UnavailableShardPolicy; +use crate::merger::ShardHitPage; +use crate::router::{covering_set, query_group}; use crate::topology::{NodeId, Topology}; use crate::Result; +use serde_json::Value; +use std::collections::HashMap; -/// Scatter orchestrator: fans out requests to the covering set. -pub trait Scatter: Send + Sync { - /// Execute a scatter request to multiple nodes. +/// Scatter plan: the exact shard→node mapping for a search query. +/// +/// Separating the plan from execution makes §13.20 `/explain` cheap — +/// the explain path generates the plan and returns it without touching any node. +#[derive(Debug, Clone)] +pub struct ScatterPlan { + /// Chosen replica group for this query (query_seq % RG). + pub chosen_group: u32, + + /// Target shards to query (for §13.4 narrowing — initially all 0..S). + pub target_shards: Vec, + + /// Resolved covering set: shard ID → node ID. + pub shard_to_node: HashMap, + + /// Deadline for the query in milliseconds. + pub deadline_ms: u32, + + /// Whether hedging is eligible (reserved for §13.2 Phase 5). + pub hedging_eligible: bool, +} + +/// HTTP client for communicating with a Meilisearch node. +/// +/// This is the seam between `miroir-core` (pure, no network) and +/// `miroir-proxy` (HTTP client). Injecting it via a trait means unit tests +/// can provide a fake client; production binds `reqwest` via the trait impl. +pub trait NodeClient: Send + Sync { + /// Execute a search request on a single node. /// - /// Returns a map of node ID to response. Failed nodes are omitted - /// based on the unavailable shard policy. - fn scatter( + /// Returns the raw JSON response from the node. + async fn search_node( &self, - topology: &Topology, - nodes: Vec, - request: ScatterRequest, - policy: UnavailableShardPolicy, - ) -> Result; + node: &NodeId, + address: &str, + request: &SearchRequest, + ) -> std::result::Result; } -/// A scatter request to be sent to each node. +/// Error from a single node during scatter. #[derive(Debug, Clone)] -pub struct ScatterRequest { - /// Request body (JSON or raw bytes). - pub body: Vec, - - /// Request headers. - pub headers: Vec<(String, String)>, - - /// HTTP method. - pub method: String, - - /// Request path. - pub path: String, +pub enum NodeError { + /// Node timed out. + Timeout, + /// Node returned an error response. + HttpError { status: u16, body: String }, + /// Network or connection error. + NetworkError(String), } -/// Response from a scatter operation. +/// A search request to be sent to each node in the covering set. #[derive(Debug, Clone)] -pub struct ScatterResponse { - /// Responses from successful nodes. - pub responses: Vec, +pub struct SearchRequest { + /// Index UID being queried. + pub index_uid: String, - /// Nodes that failed or timed out. - pub failed: Vec, + /// Search query (q parameter). + pub query: Option, + + /// Offset for pagination. + pub offset: usize, + + /// Limit for pagination. + pub limit: usize, + + /// Filter expression. + pub filter: Option, + + /// Facets to compute. + pub facets: Option>, + + /// Whether to return ranking scores. + pub ranking_score: bool, + + /// Raw JSON body for the search request (captures any other parameters). + pub body: Value, } -/// Response from a single node. -#[derive(Debug, Clone)] -pub struct NodeResponse { - /// Node that responded. - pub node_id: NodeId, +/// Result of a scatter operation. +#[derive(Debug)] +pub struct ScatterResult { + /// Responses from successfully contacted nodes. + pub shard_pages: Vec, - /// Response body. - pub body: Vec, + /// Errors from nodes that failed (shard ID → error). + pub failed_shards: HashMap, - /// HTTP status code. - pub status: u16, + /// Whether the response is partial (some shards failed). + pub partial: bool, - /// Response headers. - pub headers: Vec<(String, String)>, + /// Whether any node exceeded the deadline. + pub deadline_exceeded: bool, } -/// Default stub implementation of Scatter. -#[derive(Debug, Clone, Default)] -pub struct StubScatter; +/// Construct a scatter plan for a search query. +/// +/// This is a pure function — no async, no I/O. It selects the replica group, +/// computes the covering set, and maps each shard to its target node. +/// +/// # Arguments +/// * `topology` - Current cluster topology +/// * `query_seq` - Query sequence number for group selection and load balancing +/// * `rf` - Replication factor (redundant with topology.rf, kept for explicitness) +/// * `shard_count` - Number of shards to query (typically topology.shards) +/// +/// # Returns +/// A `ScatterPlan` containing the covering set and metadata for execution. +pub fn plan_search_scatter( + topology: &Topology, + query_seq: u64, + rf: usize, + shard_count: u32, +) -> ScatterPlan { + let chosen_group = query_group(query_seq, topology.replica_group_count()); -impl Scatter for StubScatter { - fn scatter( - &self, - _topology: &Topology, - _nodes: Vec, - _request: ScatterRequest, - _policy: UnavailableShardPolicy, - ) -> Result { - Ok(ScatterResponse { - responses: Vec::new(), - failed: Vec::new(), - }) + // Get the target group + let group = match topology.group(chosen_group) { + Some(g) => g, + None => { + // Invalid group ID — return empty plan (should not happen with valid topology) + return ScatterPlan { + chosen_group, + target_shards: Vec::new(), + shard_to_node: HashMap::new(), + deadline_ms: 0, + hedging_eligible: false, + }; + } + }; + + // Compute covering set: one node per shard within the chosen group + let _covering = covering_set(shard_count, group, rf, query_seq); + + // Build shard → node mapping + let mut shard_to_node = HashMap::new(); + for shard_id in 0..shard_count { + let replicas = crate::router::assign_shard_in_group(shard_id, group.nodes(), rf); + // Rotate through replicas for intra-group load balancing + let selected = replicas[(query_seq as usize) % replicas.len()].clone(); + shard_to_node.insert(shard_id, selected); + } + + // Initially target all shards + let target_shards: Vec = (0..shard_count).collect(); + + // Default deadline: 5 seconds (configurable in production) + let deadline_ms = 5000; + + // Hedging is eligible when we have multiple nodes in the group (reserved for §13.2) + let hedging_eligible = group.node_count() > 1; + + ScatterPlan { + chosen_group, + target_shards, + shard_to_node, + deadline_ms, + hedging_eligible, + } +} + +/// Execute a scatter operation against the covering set. +/// +/// Fans out the search request to all nodes in the plan, handling partial +/// failures according to the unavailable shard policy. +/// +/// # Arguments +/// * `plan` - Scatter plan from `plan_search_scatter` +/// * `client` - HTTP client for communicating with nodes +/// * `req` - Search request to execute +/// * `topology` - Current topology (for resolving node addresses) +/// * `policy` - Policy for handling unavailable shards +/// +/// # Returns +/// A `ScatterResult` containing successful responses and any errors. +pub async fn execute_scatter( + plan: ScatterPlan, + client: &C, + req: SearchRequest, + topology: &Topology, + policy: UnavailableShardPolicy, +) -> Result { + use std::collections::HashMap; + + // Group requests by unique node (scatter happens once per node, not per shard) + let mut node_to_shards: HashMap> = HashMap::new(); + for (&shard_id, node_id) in &plan.shard_to_node { + if plan.target_shards.contains(&shard_id) { + node_to_shards + .entry(node_id.clone()) + .or_default() + .push(shard_id); + } + } + + let mut shard_pages = Vec::new(); + let mut failed_shards = HashMap::new(); + let mut deadline_exceeded = false; + + // Execute requests in parallel (one per unique node) + let mut tasks = Vec::new(); + for (node_id, shards) in node_to_shards { + let node = match topology.node(&node_id) { + Some(n) => n.clone(), + None => { + // Node not found in topology — mark all its shards as failed + for shard_id in shards { + failed_shards.insert( + shard_id, + NodeError::NetworkError("node not in topology".to_string()), + ); + } + continue; + } + }; + + let client_ref = client; + let req_clone = req.clone(); + let node_id_clone = node_id.clone(); + + tasks.push(async move { + let result = client_ref + .search_node(&node_id_clone, &node.address, &req_clone) + .await; + + (node_id_clone, shards, result) + }); + } + + // Await all tasks + let results = futures_util::future::join_all(tasks).await; + + for (_node_id, shards, result) in results { + match result { + Ok(body) => { + // Create a ShardHitPage for each shard served by this node + for _shard_id in shards { + shard_pages.push(ShardHitPage { body: body.clone() }); + } + } + Err(NodeError::Timeout) => { + deadline_exceeded = true; + for shard_id in shards { + failed_shards.insert(shard_id, NodeError::Timeout); + } + } + Err(e) => { + for shard_id in shards { + failed_shards.insert(shard_id, e.clone()); + } + } + } + } + + // Determine if response is partial + let partial = !failed_shards.is_empty(); + + // Apply unavailable shard policy + match policy { + UnavailableShardPolicy::Error => { + if !failed_shards.is_empty() { + return Err(crate::error::MiroirError::Routing(format!( + "{} shard(s) unavailable", + failed_shards.len() + ))); + } + } + UnavailableShardPolicy::Partial => { + // Return partial results (already done) + } + UnavailableShardPolicy::Fallback => { + // Reserved for §13.2 Phase 5: query other replica groups for failed shards + // For now, treat as Partial + } + } + + Ok(ScatterResult { + shard_pages, + failed_shards, + partial, + deadline_exceeded, + }) +} + +/// Stubs for testing (no actual network calls). + +/// Mock `NodeClient` for testing. +#[derive(Debug, Clone, Default)] +pub struct MockNodeClient { + /// Optional pre-programmed responses per node ID. + pub responses: HashMap, + + /// Optional pre-programmed errors per node ID. + pub errors: HashMap, + + /// Optional delay for simulating slow nodes. + pub delay_ms: u64, +} + +impl NodeClient for MockNodeClient { + async fn search_node( + &self, + node: &NodeId, + _address: &str, + _request: &SearchRequest, + ) -> std::result::Result { + // Simulate network delay if configured + // Note: actual sleep requires tokio runtime; this is a no-op placeholder + let _ = self.delay_ms; + + // Check for pre-programmed error + if let Some(err) = self.errors.get(node) { + return Err(err.clone()); + } + + // Return pre-programmed response or default empty response + Ok(self.responses.get(node).cloned().unwrap_or_else(|| { + serde_json::json!({ + "hits": [], + "estimatedTotalHits": 0, + "processingTimeMs": 0, + "facetDistribution": {}, + }) + })) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::{Node, NodeId}; + + fn make_test_topology() -> Topology { + let mut topo = Topology::new(64, 2, 2); + for i in 0u32..6 { + let rg = if i < 3 { 0 } else { 1 }; + let mut node = Node::new( + NodeId::new(format!("node-{i}")), + format!("http://node-{i}:7700"), + rg, + ); + node.status = crate::topology::NodeStatus::Active; + topo.add_node(node); + } + topo + } + + #[test] + fn test_plan_search_scatter_pure_function() { + let topo = make_test_topology(); + let plan = plan_search_scatter(&topo, 0, 2, 64); + + assert_eq!(plan.chosen_group, 0); + assert_eq!(plan.target_shards.len(), 64); + assert_eq!(plan.shard_to_node.len(), 64); + assert_eq!(plan.deadline_ms, 5000); + assert!(plan.hedging_eligible); + } + + #[test] + fn test_plan_search_scatter_query_group_rotation() { + let topo = make_test_topology(); + + // query_seq 0 → group 0 + let plan0 = plan_search_scatter(&topo, 0, 2, 64); + assert_eq!(plan0.chosen_group, 0); + + // query_seq 1 → group 1 + let plan1 = plan_search_scatter(&topo, 1, 2, 64); + assert_eq!(plan1.chosen_group, 1); + + // query_seq 2 → group 0 + let plan2 = plan_search_scatter(&topo, 2, 2, 64); + assert_eq!(plan2.chosen_group, 0); + } + + #[test] + fn test_plan_search_scatter_shard_to_node_mapping() { + let topo = make_test_topology(); + let plan = plan_search_scatter(&topo, 0, 2, 64); + + // All shards should be mapped to a node + for shard_id in 0..64 { + assert!( + plan.shard_to_node.contains_key(&shard_id), + "Shard {} not in mapping", + shard_id + ); + } + + // All nodes should be from group 0 + let g0 = topo.group(0).unwrap(); + for (_shard_id, node_id) in &plan.shard_to_node { + assert!( + g0.nodes().contains(node_id), + "Node {:?} not in group 0", + node_id + ); + } + } + + #[test] + fn test_plan_search_scatter_hedging_eligibility() { + let mut topo = Topology::new(64, 1, 1); + // Single node group + topo.add_node(Node::new( + NodeId::new("node-0".to_string()), + "http://node-0:7700".to_string(), + 0, + )); + + let plan = plan_search_scatter(&topo, 0, 1, 64); + assert!(!plan.hedging_eligible); + + // Multi-node group + let topo = make_test_topology(); + let plan = plan_search_scatter(&topo, 0, 2, 64); + assert!(plan.hedging_eligible); + } + + #[tokio::test] + async fn test_execute_scatter_with_mock_client() { + let topo = make_test_topology(); + let plan = plan_search_scatter(&topo, 0, 2, 64); + + let mut client = MockNodeClient::default(); + client.responses.insert( + NodeId::new("node-0".to_string()), + serde_json::json!({ + "hits": [{"id": "doc1", "title": "Test"}], + "estimatedTotalHits": 1, + "processingTimeMs": 5, + }), + ); + + let req = SearchRequest { + index_uid: "test".to_string(), + query: Some("test".to_string()), + offset: 0, + limit: 10, + filter: None, + facets: None, + ranking_score: false, + body: serde_json::json!({}), + }; + + let result = execute_scatter(plan, &client, req, &topo, UnavailableShardPolicy::Partial) + .await + .unwrap(); + + assert!(!result.partial); + assert!(!result.deadline_exceeded); + assert_eq!(result.shard_pages.len(), 64); // One page per shard + assert!(result.failed_shards.is_empty()); + } + + #[tokio::test] + async fn test_execute_scatter_partial_failure() { + let topo = make_test_topology(); + let plan = plan_search_scatter(&topo, 0, 2, 64); + + let mut client = MockNodeClient::default(); + // Make node-0 fail + client.errors.insert( + NodeId::new("node-0".to_string()), + NodeError::Timeout, + ); + client.responses.insert( + NodeId::new("node-1".to_string()), + serde_json::json!({ + "hits": [], + "estimatedTotalHits": 0, + "processingTimeMs": 0, + }), + ); + + let req = SearchRequest { + index_uid: "test".to_string(), + query: Some("test".to_string()), + offset: 0, + limit: 10, + filter: None, + facets: None, + ranking_score: false, + body: serde_json::json!({}), + }; + + let result = execute_scatter(plan, &client, req, &topo, UnavailableShardPolicy::Partial) + .await + .unwrap(); + + assert!(result.partial); + assert!(!result.failed_shards.is_empty()); + // Some shards should still succeed (those on node-1 and node-2) + assert!(!result.shard_pages.is_empty()); + } + + #[tokio::test] + async fn test_execute_scatter_error_policy() { + let topo = make_test_topology(); + let plan = plan_search_scatter(&topo, 0, 2, 64); + + let mut client = MockNodeClient::default(); + client.errors.insert( + NodeId::new("node-0".to_string()), + NodeError::Timeout, + ); + + let req = SearchRequest { + index_uid: "test".to_string(), + query: Some("test".to_string()), + offset: 0, + limit: 10, + filter: None, + facets: None, + ranking_score: false, + body: serde_json::json!({}), + }; + + let result = execute_scatter(plan, &client, req, &topo, UnavailableShardPolicy::Error).await; + + assert!(result.is_err()); + } + + #[test] + fn test_node_error_variants() { + let timeout = NodeError::Timeout; + assert!(matches!(timeout, NodeError::Timeout)); + + let http_err = NodeError::HttpError { + status: 500, + body: "Internal Server Error".to_string(), + }; + assert!(matches!(http_err, NodeError::HttpError { .. })); + + let net_err = NodeError::NetworkError("connection refused".to_string()); + assert!(matches!(net_err, NodeError::NetworkError(_))); } } diff --git a/crates/miroir-core/tests/router_proptest.proptest-regressions b/crates/miroir-core/tests/router_proptest.proptest-regressions new file mode 100644 index 0000000..2dd6a1a --- /dev/null +++ b/crates/miroir-core/tests/router_proptest.proptest-regressions @@ -0,0 +1,11 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc c28f459e669c9568d88ad15c1e23b7900c447c465fb405ac76ca127d5a5d7d69 # shrinks to shard_count = 23, node_count = 5, rf = 2 +cc 5e225fd981de25480c87d5a319e25c7e85e68ec4e7d2674e4d09b38939c916fc # shrinks to shard_count = 17, node_count = 8, rf = 1 +cc 2f0a493a306793420b97ed4c9c15f507f69e558169e0e47035ecfffd2de0d40c # shrinks to shard_count = 10, node_count = 6, rf = 1 +cc 819e50f063ba8bc32df8ed79259c439512ee816a34e0cd02a56e312c46447812 # shrinks to shard_count = 20, node_count = 7, rf = 1 +cc 8afd2c631dd0aae67601ab56fd1decde56c5c5048f5a79394913796e96f6a29f # shrinks to shard_count = 30, node_count = 4, rf = 1 diff --git a/tests/benches/score-comparability/simulate.py b/tests/benches/score-comparability/simulate.py index ceca4af..1541962 100755 --- a/tests/benches/score-comparability/simulate.py +++ b/tests/benches/score-comparability/simulate.py @@ -245,6 +245,80 @@ def simulate_distributed_search( } +RRF_K = 60 # RRF constant, matching merger.rs + + +def simulate_distributed_search_rrf( + shards: Dict[int, List[Dict]], + shard_stats: Dict[int, Tuple[Dict, int, float]], + query: Dict, + limit: int = 100, +) -> Dict: + """ + Simulate distributed search using Reciprocal Rank Fusion. + + RRF score for a document: sum over shards of 1/(k + rank + 1) + where rank is 0-based position in shard's result list. + + This avoids the score comparability issue entirely because + RRF only uses rank position, not raw scores. + """ + query_terms = tokenize(query["q"]) + per_shard_limit = limit * 2 + + # Accumulate RRF scores per document + rrf_scores: Dict[str, float] = defaultdict(float) + doc_info: Dict[str, Tuple[Dict, int]] = {} # id -> (doc, shard_id) + + for shard_id, docs in shards.items(): + df, N, avgdl = shard_stats[shard_id] + + if query.get("filter"): + category_filter = query["filter"].split("=")[1].strip() + filtered_docs = [d for d in docs if d["category"] == category_filter] + else: + filtered_docs = docs + + scores = [] + for doc in filtered_docs: + score = score_document_bm25(doc, query_terms, df, N, avgdl) + if score > 0: + scores.append((doc, score)) + + scores.sort(key=lambda x: x[1], reverse=True) + + for rank, (doc, _score) in enumerate(scores[:per_shard_limit]): + doc_id = doc["id"] + rrf_contribution = 1.0 / (RRF_K + rank + 1) + rrf_scores[doc_id] += rrf_contribution + if doc_id not in doc_info: + doc_info[doc_id] = (doc, shard_id) + + # Sort by RRF score descending + sorted_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True) + + hits = [] + for doc_id, rrf_score in sorted_docs[:limit]: + doc, shard_id = doc_info[doc_id] + hits.append({ + "id": doc_id, + "title": doc["title"], + "score": rrf_score, + "shard": shard_id, + }) + + return { + "query_id": query["id"], + "type": query.get("type", "unknown"), + "q": query["q"], + "filter": query.get("filter"), + "hits": hits, + "total_hits": len(sorted_docs), + "shards_queried": list(shards.keys()), + "merge_strategy": "rrf", + } + + def run_experiment( corpus_dir: Path, query_file: Path, @@ -293,10 +367,13 @@ def run_experiment( ground_truth_file = output_dir / "ground-truth.jsonl" distributed_file = output_dir / "distributed.jsonl" + rrf_file = output_dir / "distributed-rrf.jsonl" print(f"\nRunning experiments...") - with open(ground_truth_file, "w") as gt_f, open(distributed_file, "w") as dist_f: + with open(ground_truth_file, "w") as gt_f, \ + open(distributed_file, "w") as dist_f, \ + open(rrf_file, "w") as rrf_f: for i, query in enumerate(queries): if (i + 1) % 1000 == 0: print(f" Processed {i + 1} queries...") @@ -305,16 +382,23 @@ def run_experiment( gt_result = simulate_search(docs, query, global_stats, limit) gt_f.write(json.dumps(gt_result) + "\n") - # Distributed: each shard uses local statistics + # Distributed: each shard uses local statistics (score-based merge) dist_result = simulate_distributed_search( shards, shard_stats, query, limit ) dist_f.write(json.dumps(dist_result) + "\n") + # RRF: rank-based merge (no score comparability needed) + rrf_result = simulate_distributed_search_rrf( + shards, shard_stats, query, limit + ) + rrf_f.write(json.dumps(rrf_result) + "\n") + print(f" Completed {len(queries)} queries") print(f"\nResults saved to:") print(f" {ground_truth_file}") print(f" {distributed_file}") + print(f" {rrf_file}") # Save experiment metadata exp_meta = { @@ -323,6 +407,8 @@ def run_experiment( "shard_count": shard_count, "limit": limit, "total_queries": len(queries), + "merge_strategies": ["score", "rrf"], + "rrf_k": RRF_K, "global_stats": {"N": global_stats[1], "avgdl": global_stats[2]}, "shard_stats": { str(k): {"N": v[1], "avgdl": v[2]} @@ -381,6 +467,7 @@ def main(): print("\nTo compare results, run:") print(f" python3 {output_dir}/compare.py {output_dir}/ground-truth.jsonl {output_dir}/distributed.jsonl --verbose") + print(f" python3 {output_dir}/compare.py {output_dir}/ground-truth.jsonl {output_dir}/distributed-rrf.jsonl --verbose") if __name__ == "__main__":