P7.4: Add ServiceMonitor and PrometheusRule manifests (plan §10 + §14.9)

ServiceMonitor scrapes the metrics port (9090) at 30s intervals. PrometheusRule ships all 12 alerts: 7 availability (degraded shards, node down, high latency, stuck tasks, stuck rebalance, settings divergence, anti-entropy mismatch) + 5 resource pressure (memory, request queue, background queue, peer discovery, no leader). Both gated behind serviceMonitor.enabled / prometheusRule.enabled (defaults: false — requires prometheus-operator in cluster). Also adds metrics port to the miroir Service so ServiceMonitor can select it. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-19 11:42:35 -04:00 · 2026-04-19 11:42:35 -04:00 · ea6be6a339
commit ea6be6a339
parent 13d4430d2a
5 changed files with 501 additions and 2 deletions
--- a/charts/miroir/templates/miroir-prometheusrule.yaml
+++ b/charts/miroir/templates/miroir-prometheusrule.yaml
@ -0,0 +1,152 @@
+{{/*
+Miroir PrometheusRule — alerting rules (plan §10 + §14.9)
+*/}}
+{{- if .Values.prometheusRule.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ include "miroir.fullname" . }}
+  labels:
+    {{- include "miroir.labels" . | nindent 4 }}
+    app.kubernetes.io/component: metrics
+  {{- with .Values.prometheusRule.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  groups:
+    - name: miroir.availability
+      rules:
+        - alert: MiroirDegradedShards
+          expr: miroir_degraded_shards_total > 0
+          for: 2m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Miroir has degraded shards"
+            description: >-
+              {{ include "miroir.fullname" . }} has {{ "{{ $value }}" }} degraded shard(s).
+              Self-healing should resolve this; alert fires when healing failed.
+
+        - alert: MiroirNodeDown
+          expr: miroir_node_healthy == 0
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Miroir node is down"
+            description: >-
+              Node {{ "{{ $labels.node_id }}" }} in replica group
+              {{ "{{ $labels.replica_group }}" }} has been unhealthy for 5 minutes.
+
+        - alert: MiroirHighSearchLatency
+          expr: histogram_quantile(0.95, sum(rate(miroir_search_duration_seconds_bucket[5m])) by (le)) > 2
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Miroir search latency is high"
+            description: >-
+              p95 search latency is {{ "{{ $value | humanizeDuration }}" }},
+              exceeding the 2s threshold.
+
+        - alert: MiroirTaskStuck
+          expr: miroir_task_processing_age_seconds > 3600
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Miroir task stuck in processing"
+            description: >-
+              Task {{ "{{ $labels.task_type }}" }} has been processing for
+              {{ "{{ $value | humanizeDuration }}" }}.
+
+        - alert: MiroirRebalanceStuck
+          # The expr is an equality match, so $value is always 1; the elapsed
+          # time is conveyed by "for", not by the sample value.
+          expr: miroir_rebalance_in_progress == 1
+          for: 2h
+          labels:
+            severity: warning
+          annotations:
+            summary: "Miroir rebalance has been running for over 2 hours"
+            description: >-
+              A shard rebalance has been in progress for more than 2 hours.
+              This usually indicates a stuck migration.
+
+        - alert: MiroirSettingsDivergence
+          # Count the distinct sample values reported per setting. Nesting two
+          # plain count-by-(setting) aggregations always yields 1, so it could
+          # never exceed the threshold.
+          # NOTE(review): assumes the sample value encodes the setting (e.g. a
+          # hash). If the value lives in a label instead, group the inner
+          # aggregation by that label.
+          expr: count by (setting) (count_values by (setting) ("value", miroir_node_setting_value)) > 1
+          for: 15m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Miroir settings divergence detected"
+            description: >-
+              Setting {{ "{{ $labels.setting }}" }} has divergent values across nodes.
+              Self-healing (§13.5) should auto-repair; alert fires when it hasn't.
+
+        - alert: MiroirAntientropyMismatch
+          # Fires without a hold period (for: 0m) once the mismatch counter has
+          # grown by at least 3 within a 6h look-back window.
+          # NOTE(review): the description speaks of consecutive passes over
+          # roughly 18h, but the query only covers 6h - confirm which matches
+          # the plan. increase() extrapolates, so $value may be fractional.
+          expr: increase(miroir_antientropy_mismatch_total[6h]) >= 3
+          for: 0m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Miroir anti-entropy found persistent mismatches"
+            description: >-
+              Anti-entropy repair has detected mismatches on
+              {{ "{{ $value }}" }} consecutive passes (≈18h default schedule).
+              Self-healing (§13.8) failed to close the gap.
+
+    - name: miroir.resource_pressure
+      rules:
+        - alert: MiroirMemoryPressure
+          expr: miroir_memory_pressure >= 2
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Miroir memory pressure is elevated"
+            description: >-
+              Memory pressure level {{ "{{ $value }}" }} (>=2 means moderate or higher)
+              sustained for 5 minutes.
+
+        - alert: MiroirRequestQueueBacklog
+          expr: miroir_request_queue_depth > 500
+          for: 2m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Miroir request queue backlog is high"
+            description: >-
+              Request queue depth is {{ "{{ $value }}" }}, exceeding the 500 threshold.
+
+        - alert: MiroirBackgroundJobBacklog
+          expr: miroir_background_queue_depth > 100
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Miroir background job queue backlog is high"
+            description: >-
+              Background job queue depth is {{ "{{ $value }}" }},
+              exceeding the 100 threshold for 10 minutes.
+
+        - alert: MiroirPeerDiscoveryGap
+          expr: count(miroir_peer_known) by (namespace, job) != count(miroir_node_healthy == 1) by (namespace, job)
+          for: 2m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Miroir peer discovery is out of sync"
+            description: >-
+              Number of discovered peers does not match number of healthy nodes.
+
+        - alert: MiroirNoLeader
+          # Aggregate per scrape job so a leader in one release cannot mask a
+          # leaderless release watched by the same Prometheus (same grouping
+          # as MiroirPeerDiscoveryGap).
+          expr: sum by (namespace, job) (miroir_leader) == 0
+          for: 1m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Miroir has no elected leader"
+            description: >-
+              No miroir instance holds the leader lease.
+              Cluster coordination is stalled.
+{{- end }}
--- a/charts/miroir/templates/miroir-service.yaml
+++ b/charts/miroir/templates/miroir-service.yaml
@ -0,0 +1,30 @@
+{{/*
+Miroir Service
+Exposes two named ports, "http" and "metrics", for pods matching the miroir
+selector labels plus app.kubernetes.io/component=miroir. targetPort refers to
+port names, so the pod spec must declare container ports with those names
+(NOTE(review): the pod template is not visible here - confirm).
+The "metrics" port name is what the ServiceMonitor endpoint selects.
+Rendered only when .Values.miroir.replicas is truthy (non-zero).
+*/}}
+{{- if .Values.miroir.replicas }}
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "miroir.fullname" . }}
+  labels:
+    {{- include "miroir.labels" . | nindent 4 }}
+    app.kubernetes.io/component: miroir
+  {{- with .Values.service.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  type: {{ .Values.service.type | default "ClusterIP" }}
+  ports:
+    - port: {{ .Values.service.ports.http | default 7700 }}
+      targetPort: http
+      protocol: TCP
+      name: http
+    - port: {{ .Values.service.ports.metrics | default 9090 }}
+      targetPort: metrics
+      protocol: TCP
+      name: metrics
+  selector:
+    {{- include "miroir.selectorLabels" . | nindent 4 }}
+    app.kubernetes.io/component: miroir
+{{- end }}
--- a/charts/miroir/templates/miroir-servicemonitor.yaml
+++ b/charts/miroir/templates/miroir-servicemonitor.yaml
@ -0,0 +1,28 @@
+{{/*
+Miroir ServiceMonitor (requires prometheus-operator)
+Scrapes /metrics on the "metrics" port of Services in the release namespace
+that carry the miroir selector labels and app.kubernetes.io/component=miroir.
+String-typed fields are piped through `quote` so values that look numeric
+(for example an all-digit namespace name) still render as YAML strings.
+*/}}
+{{- if .Values.serviceMonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "miroir.fullname" . }}
+  labels:
+    {{- include "miroir.labels" . | nindent 4 }}
+    app.kubernetes.io/component: metrics
+  {{- with .Values.serviceMonitor.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "miroir.selectorLabels" . | nindent 6 }}
+      app.kubernetes.io/component: miroir
+  endpoints:
+    - port: metrics
+      interval: {{ .Values.serviceMonitor.interval | default "30s" | quote }}
+      path: /metrics
+  namespaceSelector:
+    matchNames:
+      - {{ .Release.Namespace | quote }}
+{{- end }}
--- a/charts/miroir/values.schema.json
+++ b/charts/miroir/values.schema.json
@ -10,7 +10,8 @@
          "type": "object",
          "properties": {
            "repository": { "type": "string" },
-            "tag": { "type": "string" }
+            "tag": { "type": "string" },
+            "pullPolicy": { "type": "string", "enum": ["Always", "IfNotPresent", "Never"] }
          }
        },
        "replicas": {
@ -29,7 +30,19 @@
          "type": "integer",
          "minimum": 1
        },
-        "existingSecret": { "type": "string" }
+        "existingSecret": { "type": "string" },
+        "podAnnotations": { "type": "object" },
+        "podLabels": { "type": "object" },
+        "resources": {
+          "type": "object",
+          "properties": {
+            "limits": { "type": "object" },
+            "requests": { "type": "object" }
+          }
+        },
+        "nodeSelector": { "type": "object" },
+        "tolerations": { "type": "array" },
+        "affinity": { "type": "object" }
      }
    },
    "taskStore": {
@ -43,6 +56,145 @@
        "url": { "type": "string" }
      },
      "required": ["backend"]
+    },
+    "hpa": {
+      "type": "object",
+      "properties": {
+        "enabled": { "type": "boolean" },
+        "minReplicas": { "type": "integer", "minimum": 1 },
+        "maxReplicas": { "type": "integer", "minimum": 1 },
+        "targetCPUUtilizationPercentage": { "type": "integer", "minimum": 1, "maximum": 100 },
+        "targetMemoryUtilizationPercentage": { "type": "integer", "minimum": 1, "maximum": 100 },
+        "behavior": { "type": "object" }
+      }
+    },
+    "tracing": {
+      "type": "object",
+      "properties": {
+        "enabled": { "type": "boolean" },
+        "endpoint": { "type": "string" },
+        "serviceName": { "type": "string" },
+        "sampleRate": { "type": "number", "minimum": 0, "maximum": 1 }
+      }
+    },
+    "serviceAccount": {
+      "type": "object",
+      "properties": {
+        "create": { "type": "boolean" },
+        "name": { "type": "string" },
+        "annotations": { "type": "object" }
+      }
+    },
+    "service": {
+      "type": "object",
+      "properties": {
+        "type": { "type": "string" },
+        "annotations": { "type": "object" },
+        "ports": {
+          "type": "object",
+          "properties": {
+            "http": { "type": "integer", "minimum": 1, "maximum": 65535 },
+            "metrics": { "type": "integer", "minimum": 1, "maximum": 65535 }
+          }
+        }
+      }
+    },
+    "serviceMonitor": {
+      "type": "object",
+      "properties": {
+        "enabled": { "type": "boolean" },
+        "interval": { "type": "string" },
+        "annotations": { "type": "object" }
+      }
+    },
+    "prometheusRule": {
+      "type": "object",
+      "properties": {
+        "enabled": { "type": "boolean" },
+        "annotations": { "type": "object" }
+      }
+    },
+    "headless": {
+      "type": "object",
+      "properties": {
+        "annotations": { "type": "object" }
+      }
+    },
+    "meilisearch": {
+      "type": "object",
+      "properties": {
+        "enabled": { "type": "boolean" },
+        "image": {
+          "type": "object",
+          "properties": {
+            "repository": { "type": "string" },
+            "tag": { "type": "string" },
+            "pullPolicy": { "type": "string" }
+          }
+        },
+        "replicas": { "type": "integer", "minimum": 1 },
+        "nodesPerGroup": { "type": "integer", "minimum": 1 },
+        "podAnnotations": { "type": "object" },
+        "podLabels": { "type": "object" },
+        "resources": { "type": "object" },
+        "nodeSelector": { "type": "object" },
+        "tolerations": { "type": "array" },
+        "affinity": { "type": "object" },
+        "persistence": {
+          "type": "object",
+          "properties": {
+            "enabled": { "type": "boolean" },
+            "size": { "type": "string" },
+            "storageClass": { "type": "string" }
+          }
+        },
+        "env": { "type": "array" },
+        "masterKey": { "type": "string" }
+      }
+    },
+    "redis": {
+      "type": "object",
+      "properties": {
+        "enabled": { "type": "boolean" },
+        "image": {
+          "type": "object",
+          "properties": {
+            "repository": { "type": "string" },
+            "tag": { "type": "string" },
+            "pullPolicy": { "type": "string" }
+          }
+        },
+        "replicas": { "type": "integer", "minimum": 1 },
+        "podAnnotations": { "type": "object" },
+        "podLabels": { "type": "object" },
+        "resources": { "type": "object" },
+        "nodeSelector": { "type": "object" },
+        "tolerations": { "type": "array" },
+        "affinity": { "type": "object" },
+        "persistence": {
+          "type": "object",
+          "properties": {
+            "enabled": { "type": "boolean" },
+            "size": { "type": "string" },
+            "storageClass": { "type": "string" }
+          }
+        },
+        "service": {
+          "type": "object",
+          "properties": {
+            "type": { "type": "string" },
+            "port": { "type": "integer", "minimum": 1, "maximum": 65535 }
+          }
+        }
+      }
+    },
+    "cdcPvc": {
+      "type": "object",
+      "properties": {
+        "enabled": { "type": "boolean" },
+        "size": { "type": "string" },
+        "storageClass": { "type": "string" }
+      }
    }
  },
  "if": {
--- a/charts/miroir/values.yaml
+++ b/charts/miroir/values.yaml
@ -1,16 +1,153 @@
 # Miroir Helm Chart Values
+# These defaults boot a working single-pod install for evaluation and CI.
+# For production, override to: replicas=2+, replicationFactor=2, replicaGroups=2,
+# taskStore.backend=redis, redis.enabled=true, hpa.enabled=true

 miroir:
  image:
    repository: ghcr.io/jedarden/miroir
    tag: ""  # defaults to Chart.appVersion
+    pullPolicy: IfNotPresent
  replicas: 1  # dev default: override to 2+ in production (requires taskStore.backend=redis)
  shards: 64
  replicationFactor: 1  # dev default: override to 2 in production
  replicaGroups: 1  # dev default: override to 2 in production
  existingSecret: ""  # name of K8s Secret with masterKey, nodeMasterKey, adminApiKey
+  podAnnotations: {}
+  podLabels: {}
+  resources:
+    limits:
+      cpu: 1000m
+      memory: 1Gi
+    requests:
+      cpu: 250m
+      memory: 512Mi
+  nodeSelector: {}
+  tolerations: []
+  affinity: {}

 taskStore:
  backend: sqlite  # sqlite | redis
  path: /data/miroir-tasks.db
  url: ""  # for redis: redis://host:6379
+
+# Horizontal Pod Autoscaler (disabled by default for dev)
+hpa:
+  enabled: false
+  minReplicas: 2
+  maxReplicas: 10
+  targetCPUUtilizationPercentage: 70
+  targetMemoryUtilizationPercentage: 80
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 300
+      policies:
+        - type: Percent
+          value: 50
+          periodSeconds: 60
+    scaleUp:
+      stabilizationWindowSeconds: 0
+      policies:
+        - type: Percent
+          value: 100
+          periodSeconds: 30
+        - type: Pods
+          value: 2
+          periodSeconds: 60
+      selectPolicy: Max
+
+# ServiceAccount
+serviceAccount:
+  create: true
+  name: ""  # defaults to release name
+  annotations: {}
+
+# Services
+service:
+  type: ClusterIP
+  annotations: {}
+  ports:
+    http: 7700
+    metrics: 9090
+
+headless:
+  annotations: {}
+
+# Meilisearch StatefulSet
+meilisearch:
+  enabled: true
+  image:
+    repository: getmeilisearch/meilisearch
+    tag: v1.12
+    pullPolicy: IfNotPresent
+  replicas: 2  # 1 group × 2 nodes (dev default)
+  nodesPerGroup: 2  # nodes per replica group
+  podAnnotations: {}
+  podLabels: {}
+  resources:
+    limits:
+      cpu: 2000m
+      memory: 2Gi
+    requests:
+      cpu: 500m
+      memory: 1Gi
+  nodeSelector: {}
+  tolerations: []
+  affinity: {}
+  persistence:
+    enabled: true
+    size: 10Gi
+    storageClass: ""  # uses default storage class
+  env: []
+  masterKey: ""  # defaults to auto-generated
+
+# Redis deployment (only when taskStore.backend=redis)
+redis:
+  enabled: false  # dev default: enable for production
+  image:
+    repository: redis
+    tag: 7-alpine
+    pullPolicy: IfNotPresent
+  replicas: 1
+  podAnnotations: {}
+  podLabels: {}
+  resources:
+    limits:
+      cpu: 500m
+      memory: 512Mi
+    requests:
+      cpu: 100m
+      memory: 128Mi
+  nodeSelector: {}
+  tolerations: []
+  affinity: {}
+  persistence:
+    enabled: true
+    size: 1Gi
+    storageClass: ""
+  service:
+    type: ClusterIP
+    port: 6379
+
+# PVC for CDC buffer (when cdc.buffer.primary=pvc or overflow=pvc)
+cdcPvc:
+  enabled: false
+  size: 5Gi
+  storageClass: ""
+
+# Prometheus Operator integration (plan §10 + §14.9)
+serviceMonitor:
+  enabled: false  # requires prometheus-operator in cluster
+  interval: 30s
+  annotations: {}
+
+prometheusRule:
+  enabled: false  # requires prometheus-operator in cluster
+  annotations: {}
+
+# OpenTelemetry tracing (plan §10)
+tracing:
+  enabled: false  # disabled by default for zero overhead
+  endpoint: "http://tempo.monitoring.svc:4317"  # OTLP gRPC endpoint
+  serviceName: "miroir"  # service name for trace identification
+  sampleRate: 0.1  # head-based sampling: 0.1 = ~10% of requests traced