From 5ff160e80f54294ead9f284b41d4691bb3bfc84a Mon Sep 17 00:00:00 2001 From: jedarden Date: Fri, 24 Apr 2026 13:27:38 -0400 Subject: [PATCH] =?UTF-8?q?P7:=20readiness=20probe=20=E2=86=92=20/=5Fmiroi?= =?UTF-8?q?r/ready,=20fix=20PeerDiscoveryGap=20alert?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Wire readinessProbe to /_miroir/ready (returns 503 until covering quorum reachable) instead of /health (always 200) - Fix MiroirPeerDiscoveryGap alert to use miroir_peer_pod_count metric instead of non-existent miroir_peer_known - Align MiroirHighSearchLatency, MiroirSettingsDivergence, and MiroirAntientropyMismatch alert expressions with registered metric names per plan §10 Co-Authored-By: Claude Opus 4.7 --- .../miroir/templates/miroir-deployment.yaml | 2 +- .../templates/miroir-prometheusrule.yaml | 23 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/charts/miroir/templates/miroir-deployment.yaml b/charts/miroir/templates/miroir-deployment.yaml index 6baca95..4e97c8f 100644 --- a/charts/miroir/templates/miroir-deployment.yaml +++ b/charts/miroir/templates/miroir-deployment.yaml @@ -137,7 +137,7 @@ spec: failureThreshold: 3 readinessProbe: httpGet: - path: /health + path: /_miroir/ready port: http initialDelaySeconds: 5 periodSeconds: 5 diff --git a/charts/miroir/templates/miroir-prometheusrule.yaml b/charts/miroir/templates/miroir-prometheusrule.yaml index 7357ed5..3b45272 100644 --- a/charts/miroir/templates/miroir-prometheusrule.yaml +++ b/charts/miroir/templates/miroir-prometheusrule.yaml @@ -40,12 +40,12 @@ spec: {{ "{{ $labels.replica_group }}" }} has been unhealthy for 5 minutes. 
- alert: MiroirHighSearchLatency - expr: histogram_quantile(0.95, sum(rate(miroir_search_duration_seconds_bucket[5m])) by (le)) > 2 + expr: histogram_quantile(0.95, sum(rate(miroir_request_duration_seconds_bucket{path_template="/indexes/{uid}/search"}[5m])) by (le)) > 2.0 for: 5m labels: severity: warning annotations: - summary: "Miroir search latency is high" + summary: "Miroir p95 search latency exceeds 2s" description: >- p95 search latency is {{ "{{ $value | humanizeDuration }}" }}, exceeding the 2s threshold. @@ -73,26 +73,27 @@ spec: This usually indicates a stuck migration. - alert: MiroirSettingsDivergence - expr: count(count by (setting) (miroir_node_setting_value)) by (setting) > 1 - for: 15m + expr: increase(miroir_settings_hash_mismatch_total[10m]) > 0 and increase(miroir_settings_drift_repair_total[10m]) == 0 + for: 10m labels: severity: warning annotations: summary: "Miroir settings divergence detected" description: >- - Setting {{ "{{ $labels.setting }}" }} has divergent values across nodes. - Self-healing (§13.5) should auto-repair; alert fires when it hasn't. + Settings divergence was observed but the auto-repair counter did not + advance, suggesting repair is disabled or failing. + Cross-reference §13.5 two-phase settings broadcast. - alert: MiroirAntientropyMismatch - expr: increase(miroir_antientropy_mismatch_total[6h]) >= 3 - for: 0m + expr: increase(miroir_antientropy_mismatches_found_total[18h]) > 0 + for: 18h labels: severity: warning annotations: summary: "Miroir anti-entropy found persistent mismatches" description: >- - Anti-entropy repair has detected mismatches on - {{ "{{ $value }}" }} consecutive passes (≈18h default schedule). + Anti-entropy reconciler found replica divergence persisting across + 3 consecutive passes at default every-6h schedule (≈18h). Self-healing (§13.8) failed to close the gap. - name: miroir.resource_pressure @@ -130,7 +131,7 @@ spec: exceeding the 100 threshold for 10 minutes. 
- alert: MiroirPeerDiscoveryGap - expr: count(miroir_peer_known) by (namespace, job) != count(miroir_node_healthy == 1) by (namespace, job) + expr: miroir_peer_pod_count < on(namespace) group_left() kube_deployment_status_replicas_ready{deployment="{{ include "miroir.fullname" . }}-miroir"} for: 2m labels: severity: warning