From 822c8a8e1ec108eb466fdaa933631c4a49353642 Mon Sep 17 00:00:00 2001 From: jedarden Date: Tue, 26 May 2026 21:18:16 -0400 Subject: [PATCH] feat(rebalancer): complete RF restoration flow with node transition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add `restoring_node` field to RebalanceJob to track which node is being restored - Transition node from Restoring to Active when RF restoration completes - Add comprehensive runbook for node recovery and RF restoration This completes the RF restoration flow (plan §2). When a failed node recovers, it is marked as Restoring and background replication copies data from surviving replicas. Once all shards are replicated, the node transitions to Active automatically. Co-Authored-By: Claude Opus 4.7 --- .../miroir-core/src/rebalancer_worker/mod.rs | 28 +- crates/miroir-core/src/topology.rs | 51 ++- docs/runbooks/node-recovery-rf-restoration.md | 348 ++++++++++++++++++ 3 files changed, 418 insertions(+), 9 deletions(-) create mode 100644 docs/runbooks/node-recovery-rf-restoration.md diff --git a/crates/miroir-core/src/rebalancer_worker/mod.rs b/crates/miroir-core/src/rebalancer_worker/mod.rs index d8b4247..7a55c23 100644 --- a/crates/miroir-core/src/rebalancer_worker/mod.rs +++ b/crates/miroir-core/src/rebalancer_worker/mod.rs @@ -169,6 +169,11 @@ pub struct RebalanceJob { pub total_docs_migrated: u64, /// Whether the job is paused. pub paused: bool, + /// If this is an RF restoration job, the node being restored. + /// + /// When set, the job completion will transition the node from `Restoring` to `Active`. + #[serde(skip_serializing_if = "Option::is_none")] + pub restoring_node: Option, } /// Configuration for the rebalancer worker. 
@@ -658,6 +663,7 @@ impl RebalancerWorker { completed_at: None, total_docs_migrated: 0, paused: false, + restoring_node: None, }; // Persist job to task store @@ -758,6 +764,7 @@ impl RebalancerWorker { completed_at: None, total_docs_migrated: 0, paused: false, + restoring_node: None, }; // Persist job to task store @@ -824,12 +831,12 @@ impl RebalancerWorker { "handling node recovery with RF-restore" ); - // Mark node as active in topology + // Mark node as Restoring in topology (RF restoration in progress) let node_id_obj = TopologyNodeId::new(node_id.to_string()); { let mut topo = self.topology.write().await; if let Some(node) = topo.node_mut(&node_id_obj) { - node.status = crate::topology::NodeStatus::Active; + node.status = crate::topology::NodeStatus::Restoring; } } @@ -916,6 +923,7 @@ impl RebalancerWorker { completed_at: None, total_docs_migrated: 0, paused: false, + restoring_node: Some(node_id.to_string()), }; // Persist job to task store @@ -1413,6 +1421,22 @@ impl RebalancerWorker { callback(false, None, Some(duration)); } + // If this is an RF restoration job, transition the node from Restoring to Active + if let Some(ref restoring_node_id) = job.restoring_node { + let node_id_obj = TopologyNodeId::new(restoring_node_id.to_string()); + let mut topo = self.topology.write().await; + if let Some(node) = topo.node_mut(&node_id_obj) { + if node.status == crate::topology::NodeStatus::Restoring { + node.transition_to(crate::topology::NodeStatus::Active) + .map_err(|e| format!("failed to transition node to Active: {e}"))?; + info!( + node_id = %restoring_node_id, + "RF restoration complete, node transitioned to Active" + ); + } + } + } + // Update job in memory let mut jobs = self.jobs.write().await; jobs.insert(job_id.clone(), job.clone()); diff --git a/crates/miroir-core/src/topology.rs b/crates/miroir-core/src/topology.rs index df73636..1e8487c 100644 --- a/crates/miroir-core/src/topology.rs +++ b/crates/miroir-core/src/topology.rs @@ -67,6 +67,12 @@ pub 
enum NodeStatus { Joining, /// Node is active — fully operational after joining. Active, + /// Node is restoring replication factor after recovery. + /// + /// The node has recovered from failure and is receiving replicated data + /// from surviving replicas. Writes are fanned out to this node, but reads + /// may not yet be routed here until restoration completes. + Restoring, /// Node has been removed from the cluster. Removed, } @@ -80,7 +86,8 @@ impl NodeStatus { /// - Active → Draining (admin API: POST /_miroir/nodes/{id}/drain) /// - Draining → Removed (migration complete) /// - Active/Draining → Failed (health check detects failure) - /// - Failed → Active (health check recovery) + /// - Failed → Restoring (RF restoration starts on recovery) + /// - Restoring → Active (RF restoration complete) /// - Active/Failed → Degraded (partial health: timeouts) /// - Degraded → Active (health restored) pub fn transition_to(self, target: NodeStatus) -> Result { @@ -96,8 +103,9 @@ impl NodeStatus { (Active, Failed) => true, (Draining, Failed) => true, - // Recovery - (Failed, Active) => true, + // Recovery with RF restoration + (Failed, Restoring) => true, + (Restoring, Active) => true, // Degraded (Active, Degraded) => true, @@ -109,7 +117,8 @@ impl NodeStatus { | (Failed, Failed) | (Degraded, Degraded) | (Joining, Joining) - | (Draining, Draining) => true, + | (Draining, Draining) + | (Restoring, Restoring) => true, // Healthy is an alias for Active in transitions (Healthy, _) | (_, Healthy) => false, @@ -141,6 +150,13 @@ impl NodeStatus { NodeStatus::Active | NodeStatus::Healthy | NodeStatus::Degraded ) } + + /// Check if a node can receive writes during RF restoration. + /// + /// Restoring nodes accept writes as part of dual-write during RF restoration. + pub fn is_write_eligible_during_restoration(self) -> bool { + matches!(self, NodeStatus::Restoring) + } } /// A single Meilisearch node in the topology. 
@@ -186,19 +202,27 @@ impl Node { /// `shard_affected` is true when the shard is being migrated away from this /// node during a drain. Draining nodes still accept writes for shards they /// still own (`shard_affected = false`). + /// + /// Restoring nodes accept writes as part of dual-write during RF restoration. pub fn is_write_eligible_for(&self, shard_affected: bool) -> bool { match self.status { - NodeStatus::Active | NodeStatus::Healthy | NodeStatus::Degraded => true, + NodeStatus::Active + | NodeStatus::Healthy + | NodeStatus::Degraded + | NodeStatus::Restoring => true, NodeStatus::Draining => !shard_affected, NodeStatus::Joining | NodeStatus::Failed | NodeStatus::Removed => false, } } /// Check if the node is healthy (can serve traffic). + /// + /// Restoring nodes are considered healthy for write eligibility + /// (they accept dual-writes during RF restoration) but not for reads. pub fn is_healthy(&self) -> bool { matches!( self.status, - NodeStatus::Active | NodeStatus::Healthy | NodeStatus::Degraded + NodeStatus::Active | NodeStatus::Healthy | NodeStatus::Degraded | NodeStatus::Restoring ) } @@ -664,7 +688,8 @@ nodes: (Draining, Removed), (Active, Failed), (Draining, Failed), - (Failed, Active), + (Failed, Restoring), + (Restoring, Active), (Active, Degraded), (Failed, Degraded), (Degraded, Active), @@ -674,6 +699,7 @@ nodes: (Degraded, Degraded), (Joining, Joining), (Draining, Draining), + (Restoring, Restoring), ]; for (from, to) in cases { @@ -709,6 +735,7 @@ nodes: (Removed, Failed), (Removed, Degraded), (Removed, Draining), + (Removed, Restoring), // Healthy not used in transitions (Healthy, Active), (Active, Healthy), @@ -718,6 +745,16 @@ nodes: (Degraded, Failed), (Degraded, Draining), (Degraded, Removed), + // Restoring-specific illegal transitions + (Joining, Restoring), + (Active, Restoring), + (Degraded, Restoring), + (Draining, Restoring), + (Restoring, Draining), + (Restoring, Failed), + (Restoring, Degraded), + (Restoring, Joining), + 
(Restoring, Removed), ]; for (from, to) in cases { diff --git a/docs/runbooks/node-recovery-rf-restoration.md b/docs/runbooks/node-recovery-rf-restoration.md new file mode 100644 index 0000000..736e1c0 --- /dev/null +++ b/docs/runbooks/node-recovery-rf-restoration.md @@ -0,0 +1,348 @@ +# Node Recovery and RF Restoration Runbook + +> Runbook for recovering failed nodes and restoring replication factor within replica groups. +> Part of plan §2 — Topology changes (unplanned node failure recovery). + +## Overview + +When a node fails, Miroir automatically detects the failure and stops routing writes to it. For clusters with `replication_factor > 1`, surviving replicas continue serving reads. This runbook covers the recovery process and automatic RF (replication factor) restoration. + +## Prerequisites + +- Miroir cluster with `replication_factor > 1` (recommended for production) +- Failed node pod can be restarted or replaced +- Network connectivity between nodes +- Sufficient capacity on surviving nodes for temporary cross-group fallback + +## Node Failure Behavior + +### What Happens Automatically + +1. **Health check detects failure** (health check interval: 10s) + - Node marked as `Failed` in topology + - Writes stop routing to the failed node + - Alerts fired (if configured) + +2. **Read behavior during failure** + - If RF > 1 within the group: surviving replicas serve reads + - If the failed node held the only intra-group replica for a shard: reads fall back to a healthy group + +3. **Write behavior during failure** + - Writes continue to healthy nodes + - RF degradation occurs for shards assigned to the failed node + +## Node Recovery Procedure + +### Option 1: Pod Restart (Same PVC) + +Use this when the pod is crashed but the PVC is healthy. 
+ +```bash +# Get the statefulset and pod name +kubectl get statefulset -n <namespace> +kubectl get pods -n <namespace> -l app=meilisearch + +# Delete the failed pod - StatefulSet will recreate it with the same PVC +kubectl delete pod <pod-name> -n <namespace> + +# Watch for the pod to restart +kubectl get pods -n <namespace> -w +``` + +### Option 2: PVC Replacement (Data Loss) + +Use this when the PVC is corrupted. The node will rehydrate from peer replicas via RF restoration. + +```bash +# Get the PVC name +kubectl get pvc -n <namespace> -l app=meilisearch + +# Delete the PVC AND the pod (--wait=false: the PVC stays Terminating until its pod is deleted) +kubectl delete pvc <pvc-name> -n <namespace> --wait=false +kubectl delete pod <pod-name> -n <namespace> + +# The StatefulSet will create a new PVC and pod +# Watch for the pod to start +kubectl get pods -n <namespace> -w +``` + +### Option 3: Node Replacement (New Node) + +Use this when replacing hardware or migrating to a new node. + +```bash +# Add the new node via miroir-ctl +miroir-ctl node add \ + --id <new-node-id> \ + --address <new-node-address> \ + --replica-group <replica-group> + +# Remove the failed node +miroir-ctl node remove <failed-node-id> +``` + +## RF Restoration Process + +Once the node recovers (pod restarts with healthy PVC or new PVC is created), RF restoration happens **automatically**: + +### Phase 1: Node Marked as Restoring + +```bash +# Check node status +miroir-ctl node status <node-id> + +# Expected output: +# Status: restoring +# RF Restoration Progress: +# Shards: 0/64 +# Documents Migrated: 0 +# Progress: 0.0% +``` + +During this phase: +- Node accepts **writes only** (dual-write from source replicas) +- Node does **not serve reads** +- Background replication copies data from surviving replicas + +### Phase 2: Background Replication + +For each shard that the recovered node should own: +1. Miroir identifies a healthy source replica within the same group +2. Documents are paged using `filter=_miroir_shard={id}` to avoid full scans +3. Documents are written to the recovered node +4. 
Progress is tracked in the node state machine + +```bash +# Monitor progress +watch -n 5 'miroir-ctl node status <node-id>' +``` + +### Phase 3: Cutover to Active + +Once all shards are replicated: +- Node status transitions from `Restoring` → `Active` +- Node begins serving reads for its assigned shards +- Cross-group fallback (if any) is no longer needed +- Normal RF is restored within the group + +```bash +# Verify node is active +miroir-ctl node status <node-id> + +# Expected output: +# Status: active +# (No RF restoration progress section) +``` + +## Timing Estimates + +| Cluster Size | RF | Data Size | Est. Restore Time | +|--------------|-----|-----------|-------------------| +| 3 nodes | 2 | 10 GB | 5-15 minutes | +| 3 nodes | 2 | 100 GB | 30-60 minutes | +| 5 nodes | 3 | 100 GB | 20-40 minutes | +| 5 nodes | 3 | 1 TB | 3-6 hours | + +**Factors affecting restore time:** +- Network bandwidth between nodes +- Document size and count +- `migration_batch_size` configuration (default: 1000) +- `migration_batch_delay_ms` throttling (default: 0ms) + +## Monitoring + +### Key Metrics + +```bash +# Check cluster health +miroir-ctl cluster health + +# Check all node statuses +miroir-ctl nodes list + +# Check specific node status with RF restore progress +miroir-ctl node status <node-id> + +# Check rebalancer status +miroir-ctl rebalance status +``` + +### Prometheus Metrics + +If Prometheus scraping is enabled: + +```promql +# Active rebalance jobs +miroir_rebalancer_active_jobs + +# Documents migrated +miroir_rebalancer_docs_migrated_total + +# Rebalance duration +miroir_rebalancer_duration_seconds +``` + +## Troubleshooting + +### RF Restoration Stuck + +**Symptom:** Node stays in `Restoring` status, progress not advancing. 
+ +**Diagnosis:** +```bash +# Check rebalancer status for errors +miroir-ctl rebalance status + +# Check node logs +kubectl logs <pod-name> -n <namespace> | grep -i "rf.restore\|restoration" + +# Check for migration errors +kubectl logs <pod-name> -n <namespace> | grep -i "migration.*failed" +``` + +**Common Causes:** +1. **Source replica unavailable** - No healthy source in the same group + - **Solution:** Recover another node in the group first, or add a temporary node + +2. **Network issues** - High latency or packet loss between nodes + - **Solution:** Check network connectivity, `kubectl exec -it <pod-name> -- ping <peer-address>` + +3. **Insufficient capacity** - Target node disk full + - **Solution:** Check PVC usage, expand if needed + +4. **Rebalancer worker not running** - Crash or panic + - **Solution:** Check proxy pod logs, restart if needed + +### Node Never Transitions to Active + +**Symptom:** RF restoration shows 100% but node stays `Restoring`. + +**Diagnosis:** +```bash +# Verify all shards are complete +miroir-ctl rebalance status + +# Check for straggler shards +kubectl logs <pod-name> -n <namespace> | grep "shard.*complete" +``` + +**Solution:** This should not happen in normal operation. If it does: +1. Check the rebalancer worker logs for errors +2. Try marking the node active manually (last resort): + ```bash + # This bypasses safety checks - only do this if you're certain restoration is complete + kubectl exec -it <pod-name> -- curl -X POST "http://localhost:7700/_miroir/nodes/<node-id>/activate" + ``` + +### Data Loss After Recovery + +**Symptom:** Document count is lower after recovery. + +**Diagnosis:** +```bash +# Run anti-entropy verification +miroir-ctl anti-entropy verify --index <index-name> --shards 0-63 + +# Check for divergences +miroir-ctl anti-entropy status +``` + +**Solution:** Run anti-entropy repair: +```bash +miroir-ctl anti-entropy run --index <index-name> --auto-repair +``` + +### Recovery Takes Too Long + +**Symptom:** RF restoration progressing slower than expected. 
+ +**Diagnosis:** +```bash +# Check migration batch size and delay +kubectl exec -it <pod-name> -- env | grep MIGRATION + +# Check network bandwidth +kubectl exec -it <pod-name> -- curl -o /dev/null -s -w "%{speed_download}\n" http://<peer-address>:7700/health +``` + +**Solutions:** +1. Increase `migration_batch_size` (default: 1000) via config +2. If `migration_batch_delay_ms` was raised above its default of 0, lower it back toward 0 to reduce throttling +3. Check for network throttling on the pod +4. Verify disk I/O is not saturated + +## Configuration + +### Relevant Settings + +```toml +[rebalancer] +# Maximum concurrent migrations (shards) per job +max_concurrent_migrations = 5 + +# Migration batch size (documents per page) +migration_batch_size = 1000 + +# Delay between batches (ms) - 0 = no throttling +migration_batch_delay_ms = 0 + +# Auto-rebalance on node recovery +auto_rebalance_on_recovery = true + +[migration] +# Drain timeout for cutover +drain_timeout = "30s" + +# Skip delta pass (NOT recommended) +skip_delta_pass = false +``` + +### Tuning for Faster Recovery + +```toml +[rebalancer] +max_concurrent_migrations = 10 # Increase concurrency +migration_batch_size = 5000 # Larger batches +migration_batch_delay_ms = 0 # No throttling +``` + +**Warning:** Increasing concurrency and batch size increases memory and network usage. Monitor cluster health during recovery. + +## Prevention + +### Reducing Node Failures + +1. **Resource requests/limits** - Ensure pods have sufficient CPU/memory +2. **Liveness/readiness probes** - Configure appropriate timeouts +3. **Pod disruption budgets** - Prevent voluntary disruptions during updates +4. 
**Anti-affinity** - Spread replicas across different nodes/zones + +```yaml +# Example: Pod anti-affinity +spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - meilisearch + topologyKey: kubernetes.io/hostname +``` + +### Regular Health Checks + +```bash +# Set up a cron job to check cluster health +*/5 * * * * miroir-ctl cluster health || echo "Cluster unhealthy" | mail -s "Miroir Alert" admin@example.com +``` + +## Related Documentation + +- [Migration Runbook](migration_runbook.md) — Shard migration procedures +- [Troubleshooting Guide](../troubleshooting.md) — Common issues +- [Plan §2](../plan/plan.md) — Topology changes and failure handling