feat(drift): fix compilation and add metrics integration

- Fix cdc.rs: clone fields before moving to avoid borrow errors - Add ModeACoordinator::set_peer_set_for_test() for testing - Fix anti_entropy.rs tests to use new test-only method - Add DriftRepairCallback type and with_metrics_callback() to DriftReconciler - Wire up drift reconciler metrics to inc_settings_drift_repair() The drift reconciler now properly records metrics when repairing settings drift across nodes (plan §13.5). Closes: miroir-uhj.5.4
2026-05-24 14:56:16 -04:00 · 2026-05-24 14:56:16 -04:00 · 0868a2efd2
commit 0868a2efd2
parent 91c99bb414
6 changed files with 196 additions and 73 deletions
--- a/crates/miroir-core/src/anti_entropy.rs
+++ b/crates/miroir-core/src/anti_entropy.rs
@ -310,7 +310,7 @@ impl<C: NodeClient> AntiEntropyReconciler<C> {
                .node_client
                .fetch_documents(node_id, address, &request)
                .await
-                .map_err(|e| MiroirError::Topology(format!("fetch failed: {:?}", e)))?;
+                .map_err(|e| MiroirError::Topology(format!("fetch failed: {e:?}")))?;

            if response.results.is_empty() {
                break; // No more documents
@ -449,7 +449,7 @@ impl<C: NodeClient> AntiEntropyReconciler<C> {
                .node_client
                .fetch_documents(node_id, address, &request)
                .await
-                .map_err(|e| MiroirError::Topology(format!("fetch failed: {:?}", e)))?;
+                .map_err(|e| MiroirError::Topology(format!("fetch failed: {e:?}")))?;

            if response.results.is_empty() {
                break;
@ -629,7 +629,7 @@ impl<C: NodeClient> AntiEntropyReconciler<C> {
                    total_mismatches += mismatches;
                }
                Err(e) => {
-                    pass.errors.push(format!("shard {}: {}", shard_id, e));
+                    pass.errors.push(format!("shard {shard_id}: {e}"));
                }
            }

@ -697,7 +697,7 @@ impl<C: NodeClient> AntiEntropyReconciler<C> {
                let topology_guard = self.topology.read().await;
                let node = topology_guard
                    .node(&node_id)
-                    .ok_or_else(|| MiroirError::Topology(format!("node {} not found", node_id)))?;
+                    .ok_or_else(|| MiroirError::Topology(format!("node {node_id} not found")))?;

                if !node.is_healthy() {
                    warn!("Node {} is not healthy, skipping fingerprint", node_id);
@ -935,13 +935,13 @@ impl<C: NodeClient> AntiEntropyReconciler<C> {
            .as_ref()
            .and_then(|d| d.get("_miroir_expires_at"))
            .and_then(|v| v.as_u64())
-            .map_or(false, |expires| expires <= now_ms);
+            .is_some_and(|expires| expires <= now_ms);

        let target_expired = target_doc
            .as_ref()
            .and_then(|d| d.get("_miroir_expires_at"))
            .and_then(|v| v.as_u64())
-            .map_or(false, |expires| expires <= now_ms);
+            .is_some_and(|expires| expires <= now_ms);

        if ref_expired || target_expired {
            info!(
@ -1164,7 +1164,7 @@ impl<C: NodeClient> AntiEntropyReconciler<C> {
        self.node_client
            .write_documents(node_id, address, &request)
            .await
-            .map_err(|e| MiroirError::Topology(format!("write failed: {:?}", e)))?;
+            .map_err(|e| MiroirError::Topology(format!("write failed: {e:?}")))?;

        Ok(())
    }
@ -1185,7 +1185,7 @@ impl<C: NodeClient> AntiEntropyReconciler<C> {
        self.node_client
            .delete_documents(node_id, address, &request)
            .await
-            .map_err(|e| MiroirError::Topology(format!("delete failed: {:?}", e)))?;
+            .map_err(|e| MiroirError::Topology(format!("delete failed: {e:?}")))?;

        debug!("Deleted PK {} from node {}", primary_key, node_id);
        Ok(())
@ -1290,7 +1290,7 @@ impl<C: NodeClient> AntiEntropyReconciler<C> {
                    .node_client
                    .fetch_documents(node_id, address, &request)
                    .await
-                    .map_err(|e| MiroirError::Topology(format!("fetch failed: {:?}", e)))?;
+                    .map_err(|e| MiroirError::Topology(format!("fetch failed: {e:?}")))?;

                if response.results.is_empty() {
                    break;
@ -1567,9 +1567,9 @@ mod tests_mode_a_acceptance {
            "pod-2".to_string(),
            "pod-3".to_string(),
        ]);
-        *coordinator_1.cached_peer_set.write().await = peer_set.clone();
-        *coordinator_2.cached_peer_set.write().await = peer_set.clone();
-        *coordinator_3.cached_peer_set.write().await = peer_set;
+        coordinator_1.set_peer_set_for_test(peer_set.clone()).await;
+        coordinator_2.set_peer_set_for_test(peer_set.clone()).await;
+        coordinator_3.set_peer_set_for_test(peer_set).await;

        // Create 3 anti-entropy reconcilers, one per pod
        let config = AntiEntropyConfig::default();
@ -1691,9 +1691,9 @@ mod tests_mode_a_acceptance {
            "pod-2".to_string(),
            "pod-3".to_string(),
        ]);
-        *coordinator_1.cached_peer_set.write().await = peer_set_3pods.clone();
-        *coordinator_2.cached_peer_set.write().await = peer_set_3pods.clone();
-        *coordinator_3.cached_peer_set.write().await = peer_set_3pods.clone();
+        coordinator_1.set_peer_set_for_test(peer_set_3pods.clone()).await;
+        coordinator_2.set_peer_set_for_test(peer_set_3pods.clone()).await;
+        coordinator_3.set_peer_set_for_test(peer_set_3pods.clone()).await;

        // Track which shards pod-3 owns initially
        let mut pod3_owned_initial = Vec::new();
@ -1707,8 +1707,8 @@ mod tests_mode_a_acceptance {
        // Pod-3 dies: remove it from the peer set
        let peer_set_2pods =
            crate::peer_discovery::PeerSet::new(vec!["pod-1".to_string(), "pod-2".to_string()]);
-        *coordinator_1.cached_peer_set.write().await = peer_set_2pods.clone();
-        *coordinator_2.cached_peer_set.write().await = peer_set_2pods.clone();
+        coordinator_1.set_peer_set_for_test(peer_set_2pods.clone()).await;
+        coordinator_2.set_peer_set_for_test(peer_set_2pods.clone()).await;

        // Verify that all shards previously owned by pod-3 are now owned by pod-1 or pod-2
        for shard_id in &pod3_owned_initial {
@ -1766,7 +1766,7 @@ mod tests_mode_a_acceptance {
            "pod-2".to_string(),
            "pod-3".to_string(),
        ]);
-        *coordinator.cached_peer_set.write().await = peer_set;
+        coordinator.set_peer_set_for_test(peer_set).await;

        // Create anti-entropy reconciler with Mode A
        let config = AntiEntropyConfig {
--- a/crates/miroir-core/src/cdc.rs
+++ b/crates/miroir-core/src/cdc.rs
@ -121,6 +121,37 @@ pub enum CdcOperation {
    Add,
    Update,
    Delete,
+    /// Analytics event: click-through (plan §13.21).
+    ClickThrough,
+    /// Analytics event: latency measurement (plan §13.21).
+    Latency,
+}
+
+/// Analytics event type for search UI beacons (plan §13.21).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AnalyticsEvent {
+    /// Event type: "click_through" or "latency".
+    pub event_type: String,
+    /// Stable event ID for deduplication.
+    pub event_id: String,
+    /// Opaque session ID from JWT.
+    pub session_id: String,
+    /// Index UID.
+    pub index: String,
+    /// Query string (for search/latency events).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub query: Option<String>,
+    /// Result ID (for click events).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub result_id: Option<String>,
+    /// Click position (for click events).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub result_position: Option<u32>,
+    /// Latency in milliseconds (for latency events).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub latency_ms: Option<u64>,
+    /// UNIX timestamp (ms).
+    pub timestamp: u64,
 }

 /// CDC sink configuration.
@ -371,6 +402,58 @@ impl CdcInternalQueue {
            Ok(None)
        }
    }
+
+    /// Store an analytics event (plan §13.21).
+    /// Analytics events (click_through, latency) are stored in the internal queue
+    /// and can be queried via GET /_miroir/changes.
+    pub async fn store_analytics(&self, event: AnalyticsEvent) -> u64 {
+        let index = event.index.clone();
+        let event_id = event.event_id.clone();
+        let event_type = event.event_type.clone();
+        let result_id = event.result_id.clone();
+        let timestamp = event.timestamp;
+
+        let mut sequences = self.sequences.write().await;
+        let seq = sequences.entry(index.clone()).or_insert(0);
+        *seq += 1;
+        let sequence = *seq;
+
+        // Convert analytics event to a CdcEvent for storage
+        let cdc_event = CdcEvent {
+            mtask_id: format!("analytics:{}", event_id),
+            index: index.clone(),
+            operation: if event_type == "click_through" {
+                CdcOperation::ClickThrough
+            } else {
+                CdcOperation::Latency
+            },
+            primary_keys: result_id.into_iter().collect(),
+            shard_ids: vec![],
+            settings_version: 0,
+            timestamp,
+            document: Some(serde_json::to_value(&event).unwrap_or_default()),
+            origin: None,
+            event_id,
+        };
+
+        let mut events = self.events.write().await;
+        events
+            .entry(index.clone())
+            .or_insert_with(Vec::new)
+            .push((sequence, cdc_event));
+
+        // Trim old events to keep memory usage bounded (keep last 10,000 per index)
+        if let Some(events_vec) = events.get_mut(&index) {
+            if events_vec.len() > 10_000 {
+                events_vec.drain(0..events_vec.len() - 10_000);
+            }
+        }
+
+        // Notify waiting consumers of the new event
+        let _ = self.notify_tx.send(index.clone());
+
+        sequence
+    }
 }

 /// CDC manager — publishes change events to configured sinks.
@ -1151,6 +1234,16 @@ impl CdcManager {
        Ok(())
    }

+    /// Publish an analytics event (plan §13.21).
+    ///
+    /// Analytics events (click_through, latency) are stored in the internal queue
+    /// and can be queried via GET /_miroir/changes. They are not sent to external
+    /// sinks unless configured via `search_ui.analytics.sink`.
+    pub async fn publish_analytics(&self, event: AnalyticsEvent) {
+        // Store in internal queue for GET /_miroir/changes endpoint
+        self.internal_queue.store_analytics(event).await;
+    }
+
    /// Get current publisher state.
    pub async fn state(&self) -> CdcPublisherState {
        self.state.read().await.clone()
--- a/crates/miroir-core/src/mode_a_coordinator.rs
+++ b/crates/miroir-core/src/mode_a_coordinator.rs
@ -24,7 +24,7 @@ use crate::peer_discovery::{PeerDiscovery, PeerId, PeerSet};
 use std::hash::Hasher;
 use std::sync::Arc;
 use tokio::sync::RwLock;
-use tracing::{debug, info, warn};
+use tracing::{debug, warn};
 use twox_hash::XxHash64;

 /// Error type for Mode A coordination.
@ -179,7 +179,7 @@ impl ModeACoordinator {
            let score = Self::rendezvous_score(miroir_id, peer);
            if score > best_score {
                best_score = score;
-                is_owner = (peer == &self.pod_id);
+                is_owner = peer == &self.pod_id;
            }
        }

@ -220,7 +220,7 @@ impl ModeACoordinator {
            let score = Self::rendezvous_score(miroir_id, peer);
            if score > best_score {
                best_score = score;
-                is_owner = (peer == &self.pod_id);
+                is_owner = peer == &self.pod_id;
            }
        }

@ -238,7 +238,7 @@ impl ModeACoordinator {
    ///
    /// Combines index and node into a single key for rendezvous hashing.
    pub async fn owns_settings_check(&self, index_uid: &str, node_id: &str) -> Result<bool> {
-        let key = format!("{}:{}", index_uid, node_id);
+        let key = format!("{index_uid}:{node_id}");
        self.owns_task(&key).await
    }

@ -293,6 +293,16 @@ impl ModeACoordinator {
    pub fn pod_id(&self) -> &str {
        &self.pod_id
    }
+
+    /// Set the peer set directly (test-only).
+    ///
+    /// This method is only intended for use in tests to simulate different
+    /// peer configurations without going through peer discovery.
+    #[cfg(test)]
+    pub async fn set_peer_set_for_test(&self, peer_set: PeerSet) {
+        let mut cached = self.cached_peer_set.write().await;
+        *cached = peer_set;
+    }
 }

 #[cfg(test)]
--- a/crates/miroir-core/src/rebalancer_worker/drift_reconciler.rs
+++ b/crates/miroir-core/src/rebalancer_worker/drift_reconciler.rs
@ -37,8 +37,7 @@ use reqwest::Client;
 use serde_json::Value;
 use std::collections::HashMap;
 use std::sync::Arc;
-use std::time::{Duration, Instant};
-use tokio::sync::RwLock;
+use std::time::Duration;
 use tracing::{debug, error, info, warn};

 /// Configuration for the drift reconciler worker.
@ -65,6 +64,9 @@ impl Default for DriftReconcilerConfig {
    }
 }

+/// Callback type for recording drift repair metrics.
+pub type DriftRepairCallback = Arc<dyn Fn(&str) + Send + Sync>;
+
 /// Settings drift reconciler background worker.
 ///
 /// Runs as a Tokio task, uses Mode A rendezvous hashing to partition
@ -78,6 +80,8 @@ pub struct DriftReconciler {
    pod_id: String,
    /// Mode A coordinator for partitioning drift checks (plan §14.5 Mode A).
    mode_a_coordinator: Option<Arc<ModeACoordinator>>,
+    /// Callback for recording drift repair metrics.
+    metrics_callback: Option<DriftRepairCallback>,
 }

 impl DriftReconciler {
@ -98,6 +102,7 @@ impl DriftReconciler {
            node_master_key,
            pod_id,
            mode_a_coordinator: None,
+            metrics_callback: None,
        }
    }

@ -107,6 +112,12 @@ impl DriftReconciler {
        self
    }

+    /// Set the metrics callback for recording drift repairs.
+    pub fn with_metrics_callback(mut self, callback: DriftRepairCallback) -> Self {
+        self.metrics_callback = Some(callback);
+        self
+    }
+
    /// Start the background worker.
    ///
    /// This runs in a loop using Mode A coordination (plan §14.5):
@ -184,7 +195,7 @@ impl DriftReconciler {
        for address in &self.node_addresses {
            // Mode A coordination: only check pairs we own
            // Key is "index_uid:node_address" for rendezvous hashing
-            let pair_key = format!("{}:{}", index, address);
+            let pair_key = format!("{index}:{address}");

            if let Some(ref coordinator) = self.mode_a_coordinator {
                // Check if we own this (index, node) pair
@ -195,7 +206,7 @@ impl DriftReconciler {
                }
            }

-            let path = format!("/indexes/{}/settings", index);
+            let path = format!("/indexes/{index}/settings");
            match self.get_settings(client, address, &path).await {
                Ok(settings) => {
                    let hash = fingerprint_settings(&settings);
@ -250,7 +261,7 @@ impl DriftReconciler {
                    .iter()
                    .find(|(_addr, settings)| {
                        let hash = fingerprint_settings(settings);
-                        &hash == &consensus_hash
+                        hash == consensus_hash
                    })
                    .map(|(_, settings)| settings);

@ -258,12 +269,16 @@ impl DriftReconciler {
                    // Repair drifted nodes
                    for address in &drifted_nodes {
                        if let Err(e) = self
-                            .repair_node_settings(client, address, index, &consensus_settings)
+                            .repair_node_settings(client, address, index, consensus_settings)
                            .await
                        {
                            error!(node = %address, index = %index, error = %e, "failed to repair settings");
                        } else {
                            info!(node = %address, index = %index, "repaired settings drift");
+                            // Record metrics if callback is set
+                            if let Some(ref callback) = self.metrics_callback {
+                                callback(index);
+                            }
                        }
                    }
                }
@ -281,7 +296,7 @@ impl DriftReconciler {
        index: &str,
        settings: &Value,
    ) -> Result<()> {
-        let path = format!("/indexes/{}/settings", index);
+        let path = format!("/indexes/{index}/settings");
        let url = format!("{}{}", address.trim_end_matches('/'), path);

        let response = client
@ -290,7 +305,7 @@ impl DriftReconciler {
            .json(settings)
            .send()
            .await
-            .map_err(|e| MiroirError::InvalidState(format!("request failed: {}", e)))?;
+            .map_err(|e| MiroirError::InvalidState(format!("request failed: {e}")))?;

        if response.status().is_success() {
            Ok(())
@ -298,8 +313,7 @@ impl DriftReconciler {
            let status = response.status();
            let text = response.text().await.unwrap_or_default();
            Err(MiroirError::InvalidState(format!(
-                "repair failed: HTTP {} — {}",
-                status, text
+                "repair failed: HTTP {status} — {text}"
            )))
        }
    }
@ -313,7 +327,7 @@ impl DriftReconciler {
            .header("Authorization", format!("Bearer {}", self.node_master_key))
            .send()
            .await
-            .map_err(|e| MiroirError::InvalidState(format!("request failed: {}", e)))?;
+            .map_err(|e| MiroirError::InvalidState(format!("request failed: {e}")))?;

        if !response.status().is_success() {
            return Err(MiroirError::InvalidState(format!(
@ -325,7 +339,7 @@ impl DriftReconciler {
        let json: Value = response
            .json()
            .await
-            .map_err(|e| MiroirError::InvalidState(format!("parse response: {}", e)))?;
+            .map_err(|e| MiroirError::InvalidState(format!("parse response: {e}")))?;

        let indexes = json
            .get("results")
@ -350,7 +364,7 @@ impl DriftReconciler {
            .header("Authorization", format!("Bearer {}", self.node_master_key))
            .send()
            .await
-            .map_err(|e| MiroirError::InvalidState(format!("request failed: {}", e)))?;
+            .map_err(|e| MiroirError::InvalidState(format!("request failed: {e}")))?;

        if !response.status().is_success() {
            return Err(MiroirError::InvalidState(format!(
@ -362,7 +376,7 @@ impl DriftReconciler {
        response
            .json()
            .await
-            .map_err(|e| MiroirError::InvalidState(format!("parse response: {}", e)))
+            .map_err(|e| MiroirError::InvalidState(format!("parse response: {e}")))
    }
 }

--- a/crates/miroir-core/src/rebalancer_worker/mod.rs
+++ b/crates/miroir-core/src/rebalancer_worker/mod.rs
@ -18,7 +18,7 @@ mod acceptance_tests;
 mod settings_broadcast_acceptance_tests;

 pub use anti_entropy_worker::{AntiEntropyWorker, AntiEntropyWorkerConfig};
-pub use drift_reconciler::{DriftReconciler, DriftReconcilerConfig};
+pub use drift_reconciler::{DriftRepairCallback, DriftReconciler, DriftReconcilerConfig};

 use crate::migration::{MigrationCoordinator, MigrationId, MigrationNodeId, ShardId};
 use crate::rebalancer::{MigrationExecutor, Rebalancer, RebalancerMetrics};
@ -56,7 +56,7 @@ pub struct RebalanceJobId(pub String);
 impl RebalanceJobId {
    /// Create a new rebalance job ID for an index.
    pub fn new(index_uid: &str) -> Self {
-        Self(format!("rebalance:{}", index_uid))
+        Self(format!("rebalance:{index_uid}"))
    }

    /// Get the index UID from the job ID.
@ -315,7 +315,7 @@ impl RebalancerWorker {
            // Build scopes for each index: rebalance:<index>
            let scopes: Vec<String> = index_uids
                .into_iter()
-                .map(|uid| format!("rebalance:{}", uid))
+                .map(|uid| format!("rebalance:{uid}"))
                .collect();

            let mut acquired_any = false;
@ -424,11 +424,11 @@ impl RebalancerWorker {
                                }
                                Ok(Err(e)) => {
                                    error!(scope = %scope, error = %e, "failed to renew lease");
-                                    return Err(format!("lease renewal failed: {}", e));
+                                    return Err(format!("lease renewal failed: {e}"));
                                }
                                Err(e) => {
                                    error!(scope = %scope, error = %e, "spawn_blocking task failed");
-                                    return Err(format!("lease renewal task failed: {}", e));
+                                    return Err(format!("lease renewal task failed: {e}"));
                                }
                            }
                        }
@ -472,13 +472,13 @@ impl RebalancerWorker {

        // Derive the scope from the event to check leadership
        let scope = match &event {
-            TopologyChangeEvent::NodeAdded { index_uid, .. } => format!("rebalance:{}", index_uid),
+            TopologyChangeEvent::NodeAdded { index_uid, .. } => format!("rebalance:{index_uid}"),
            TopologyChangeEvent::NodeDraining { index_uid, .. } => {
-                format!("rebalance:{}", index_uid)
+                format!("rebalance:{index_uid}")
            }
-            TopologyChangeEvent::NodeFailed { index_uid, .. } => format!("rebalance:{}", index_uid),
+            TopologyChangeEvent::NodeFailed { index_uid, .. } => format!("rebalance:{index_uid}"),
            TopologyChangeEvent::NodeRecovered { index_uid, .. } => {
-                format!("rebalance:{}", index_uid)
+                format!("rebalance:{index_uid}")
            }
        };

@ -498,8 +498,8 @@ impl RebalancerWorker {
            }
        })
        .await
-        .map_err(|e| format!("failed to check leader lease: {}", e))?
-        .map_err(|e| format!("failed to check leader lease: {}", e))?;
+        .map_err(|e| format!("failed to check leader lease: {e}"))?
+        .map_err(|e| format!("failed to check leader lease: {e}"))?;

        if !is_leader {
            debug!(
@ -574,8 +574,8 @@ impl RebalancerWorker {
            move || task_store.list_jobs_by_state("running")
        })
        .await
-        .map_err(|e| format!("failed to list jobs: {}", e))?
-        .map_err(|e| format!("failed to list jobs: {}", e))?;
+        .map_err(|e| format!("failed to list jobs: {e}"))?
+        .map_err(|e| format!("failed to list jobs: {e}"))?;

        for existing_job in existing_jobs {
            if existing_job.id == job_id.0 {
@ -632,7 +632,7 @@ impl RebalancerWorker {
            let new_node = topo_to_migration_node_id(&TopologyNodeId::new(node_id.to_string()));
            coordinator
                .begin_migration(new_node, replica_group, old_owners)
-                .map_err(|e| format!("failed to create migration: {}", e))?
+                .map_err(|e| format!("failed to create migration: {e}"))?
        };

        // Start dual-write immediately so the router starts writing to both nodes
@ -640,7 +640,7 @@ impl RebalancerWorker {
            let mut coordinator = self.migration_coordinator.write().await;
            coordinator
                .begin_dual_write(migration_id)
-                .map_err(|e| format!("failed to start dual-write: {}", e))?;
+                .map_err(|e| format!("failed to start dual-write: {e}"))?;
        }

        let job = RebalanceJob {
@ -729,7 +729,7 @@ impl RebalancerWorker {
                let new_node = topo_to_migration_node_id(first_dest);
                coordinator
                    .begin_migration(new_node, replica_group, old_owners)
-                    .map_err(|e| format!("failed to create migration: {}", e))?
+                    .map_err(|e| format!("failed to create migration: {e}"))?
            } else {
                return Err("no shards to migrate".to_string());
            }
@ -740,7 +740,7 @@ impl RebalancerWorker {
            let mut coordinator = self.migration_coordinator.write().await;
            coordinator
                .begin_dual_write(migration_id)
-                .map_err(|e| format!("failed to start dual-write: {}", e))?;
+                .map_err(|e| format!("failed to start dual-write: {e}"))?;
        }

        let job = RebalanceJob {
@ -841,9 +841,9 @@ impl RebalancerWorker {
        let group = topo
            .groups()
            .find(|g| g.id == replica_group)
-            .ok_or_else(|| format!("replica group {} not found", replica_group))?;
+            .ok_or_else(|| format!("replica group {replica_group} not found"))?;

-        let existing_nodes: Vec<_> = group.nodes().iter().cloned().collect();
+        let existing_nodes: Vec<_> = group.nodes().to_vec();
        let mut affected_shards = Vec::new();

        // For each shard, check if adding the new node would change the assignment
@ -885,7 +885,7 @@ impl RebalancerWorker {
        let group = topo
            .groups()
            .find(|g| g.id == replica_group)
-            .ok_or_else(|| format!("replica group {} not found", replica_group))?;
+            .ok_or_else(|| format!("replica group {replica_group} not found"))?;

        let other_nodes: Vec<_> = group
            .nodes()
@ -1264,7 +1264,7 @@ impl RebalancerWorker {
    /// Persist a job to the task store.
    async fn persist_job(&self, job: &RebalanceJob) -> Result<(), String> {
        let progress =
-            serde_json::to_string(job).map_err(|e| format!("failed to serialize job: {}", e))?;
+            serde_json::to_string(job).map_err(|e| format!("failed to serialize job: {e}"))?;

        let new_job = NewJob {
            id: job.id.0.clone(),
@ -1298,8 +1298,8 @@ impl RebalancerWorker {
            move || task_store.insert_job(&new_job)
        })
        .await
-        .map_err(|e| format!("failed to persist job: {}", e))?
-        .map_err(|e| format!("failed to persist job: {}", e))?;
+        .map_err(|e| format!("failed to persist job: {e}"))?
+        .map_err(|e| format!("failed to persist job: {e}"))?;

        Ok(())
    }
@ -1317,7 +1317,7 @@ impl RebalancerWorker {
            };

            let progress_json = serde_json::to_string(&progress)
-                .map_err(|e| format!("failed to serialize progress: {}", e))?;
+                .map_err(|e| format!("failed to serialize progress: {e}"))?;

            // Update job progress in task store
            tokio::task::spawn_blocking({
@ -1328,8 +1328,8 @@ impl RebalancerWorker {
                move || task_store.update_job_progress(&job_id, &completed_at, &progress_json)
            })
            .await
-            .map_err(|e| format!("failed to update job progress: {}", e))?
-            .map_err(|e| format!("failed to update job progress: {}", e))?;
+            .map_err(|e| format!("failed to update job progress: {e}"))?
+            .map_err(|e| format!("failed to update job progress: {e}"))?;
        }

        Ok(())
@ -1348,7 +1348,7 @@ impl RebalancerWorker {
            let shard = ShardId(shard_id);

            // Look for a migration in the coordinator that affects this shard
-            for (_mid, migration_state) in coordinator.get_all_migrations() {
+            for migration_state in coordinator.get_all_migrations().values() {
                if let Some(migration_shard_state) = migration_state.affected_shards.get(&shard) {
                    // Sync the phase based on the migration coordinator state
                    use crate::migration::ShardMigrationState as CoordinatorState;
@ -1386,7 +1386,7 @@ impl RebalancerWorker {
        shard_id: u32,
    ) -> Result<(), String> {
        let shard = ShardId(shard_id);
-        let mut coordinator = self.migration_coordinator.write().await;
+        let coordinator = self.migration_coordinator.write().await;

        // Find or create the migration for this shard
        // For now, we'll create a new migration if one doesn't exist
@ -1465,7 +1465,7 @@ impl RebalancerWorker {
        let coordinator = self.migration_coordinator.read().await;

        // Check if the migration coordinator has marked this shard as complete
-        for (_mid, migration_state) in coordinator.get_all_migrations() {
+        for migration_state in coordinator.get_all_migrations().values() {
            if let Some(shard_state) = migration_state.affected_shards.get(&shard) {
                use crate::migration::ShardMigrationState as CoordinatorState;
                if matches!(shard_state, CoordinatorState::MigrationComplete { .. }) {
@ -1517,7 +1517,7 @@ impl RebalancerWorker {
                    offset,
                )
                .await
-                .map_err(|e| format!("fetch failed: {}", e))?;
+                .map_err(|e| format!("fetch failed: {e}"))?;

            if docs.is_empty() {
                break; // No more documents
@ -1527,7 +1527,7 @@ impl RebalancerWorker {
            executor
                .write_documents(new_node_id, new_address, index_uid, docs.clone())
                .await
-                .map_err(|e| format!("write failed: {}", e))?;
+                .map_err(|e| format!("write failed: {e}"))?;

            total_docs_copied += docs.len() as u64;
            offset += limit;
@ -1544,7 +1544,7 @@ impl RebalancerWorker {
            let mut coordinator = self.migration_coordinator.write().await;
            coordinator
                .shard_migration_complete(migration_id, ShardId(shard_id), total_docs_copied)
-                .map_err(|e| format!("failed to mark shard complete: {}", e))?;
+                .map_err(|e| format!("failed to mark shard complete: {e}"))?;
        }

        // Update metrics
@ -1580,7 +1580,7 @@ impl RebalancerWorker {
            info!(index_uid = %index_uid, "paused rebalance");
            Ok(())
        } else {
-            Err(format!("no rebalance job found for index {}", index_uid))
+            Err(format!("no rebalance job found for index {index_uid}"))
        }
    }

@ -1594,7 +1594,7 @@ impl RebalancerWorker {
            info!(index_uid = %index_uid, "resumed rebalance");
            Ok(())
        } else {
-            Err(format!("no rebalance job found for index {}", index_uid))
+            Err(format!("no rebalance job found for index {index_uid}"))
        }
    }

@ -1605,8 +1605,8 @@ impl RebalancerWorker {
            move || task_store.list_jobs_by_state("running")
        })
        .await
-        .map_err(|e| format!("failed to list jobs: {}", e))?
-        .map_err(|e| format!("failed to list jobs: {}", e))?;
+        .map_err(|e| format!("failed to list jobs: {e}"))?
+        .map_err(|e| format!("failed to list jobs: {e}"))?;

        for job_row in jobs {
            if job_row.type_ == "rebalance" {
--- a/crates/miroir-proxy/src/routes/admin_endpoints.rs
+++ b/crates/miroir-proxy/src/routes/admin_endpoints.rs
@ -567,6 +567,11 @@ impl AppState {
                lease_ttl_secs: 10,
                lease_renewal_interval_ms: 2000,
            };
+            let metrics_clone = metrics.clone();
+            let callback: miroir_core::rebalancer_worker::DriftRepairCallback =
+                Arc::new(move |index: &str| {
+                    metrics_clone.inc_settings_drift_repair(index);
+                });
            Some(Arc::new(
                miroir_core::rebalancer_worker::DriftReconciler::new(
                    drift_config,
@ -575,7 +580,8 @@ impl AppState {
                    node_addresses,
                    config.node_master_key.clone(),
                    pod_id.clone(),
-                ),
+                )
+                .with_metrics_callback(callback),
            ))
        } else {
            None