P7.4: Add ServiceMonitor and PrometheusRule manifests (plan §10 + §14.9)

ServiceMonitor scrapes the metrics port (9090) at 30s intervals.
PrometheusRule ships all 12 alerts: 7 availability (degraded shards,
node down, high latency, stuck tasks, stuck rebalance, settings
divergence, anti-entropy mismatch) + 5 resource pressure (memory,
request queue, background queue, peer discovery, no leader).

Each manifest is gated behind its own flag: serviceMonitor.enabled and
prometheusRule.enabled respectively (both default to false, because they
require prometheus-operator in the cluster).

Also adds metrics port to the miroir Service so ServiceMonitor can
select it.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-04-19 11:42:35 -04:00
parent 13d4430d2a
commit ea6be6a339
5 changed files with 501 additions and 2 deletions

View file

@ -0,0 +1,152 @@
{{/*
Miroir PrometheusRule — alerting rules (plan §10 + §14.9).

Rendered only when .Values.prometheusRule.enabled is true. Two rule groups are
shipped: miroir.availability (7 alerts) and miroir.resource_pressure (5 alerts).

Prometheus-side template expressions ($value, $labels.*) are wrapped in Helm
string literals so that Helm emits them verbatim instead of evaluating them.
*/}}
{{- if .Values.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "miroir.fullname" . }}
  labels:
    {{- include "miroir.labels" . | nindent 4 }}
    app.kubernetes.io/component: metrics
  {{- with .Values.prometheusRule.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    - name: miroir.availability
      rules:
        - alert: MiroirDegradedShards
          expr: miroir_degraded_shards_total > 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "Miroir has degraded shards"
            description: >-
              {{ include "miroir.fullname" . }} has {{ "{{ $value }}" }} degraded shard(s).
              Self-healing should resolve this; alert fires when healing failed.
        - alert: MiroirNodeDown
          expr: miroir_node_healthy == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Miroir node is down"
            description: >-
              Node {{ "{{ $labels.node_id }}" }} in replica group
              {{ "{{ $labels.replica_group }}" }} has been unhealthy for 5 minutes.
        - alert: MiroirHighSearchLatency
          expr: histogram_quantile(0.95, sum(rate(miroir_search_duration_seconds_bucket[5m])) by (le)) > 2
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Miroir search latency is high"
            description: >-
              p95 search latency is {{ "{{ $value | humanizeDuration }}" }},
              exceeding the 2s threshold.
        - alert: MiroirTaskStuck
          expr: miroir_task_processing_age_seconds > 3600
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Miroir task stuck in processing"
            description: >-
              Task {{ "{{ $labels.task_type }}" }} has been processing for
              {{ "{{ $value | humanizeDuration }}" }}.
        {{- /*
        The expression compares the gauge with 1, so $value is always 1 when the
        alert fires. It is not a duration and is therefore kept out of the
        description; the elapsed time comes from the "for" clause.
        */}}
        - alert: MiroirRebalanceStuck
          expr: miroir_rebalance_in_progress == 1
          for: 2h
          labels:
            severity: warning
          annotations:
            summary: "Miroir rebalance has been running for over 2 hours"
            description: >-
              A shard rebalance has been continuously in progress for more than 2 hours.
              This usually indicates a stuck migration.
        {{- /*
        Counts the number of DISTINCT sample values reported per setting:
        count_values emits one series per (setting, value) pair, and the outer
        count collapses those to a per-setting total. More than one distinct
        value means the nodes disagree.
        NOTE(review): assumes the sample value of miroir_node_setting_value
        encodes the setting (e.g. a number or hash). If the value is carried in
        a label instead, group the inner aggregation by that label.
        */}}
        - alert: MiroirSettingsDivergence
          expr: 'count by (setting) (count_values by (setting) ("value", miroir_node_setting_value)) > 1'
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "Miroir settings divergence detected"
            description: >-
              Setting {{ "{{ $labels.setting }}" }} has divergent values across nodes.
              Self-healing (§13.5) should auto-repair; alert fires when it hasn't.
        {{- /*
        NOTE(review): the range window is 6h while the description speaks of
        consecutive passes over roughly 18h, and increase() extrapolates, so
        $value may be fractional. Confirm the window against the anti-entropy
        schedule in the plan.
        */}}
        - alert: MiroirAntientropyMismatch
          expr: increase(miroir_antientropy_mismatch_total[6h]) >= 3
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "Miroir anti-entropy found persistent mismatches"
            description: >-
              Anti-entropy repair has detected mismatches on
              {{ "{{ $value }}" }} consecutive passes (≈18h default schedule).
              Self-healing (§13.8) failed to close the gap.
    - name: miroir.resource_pressure
      rules:
        - alert: MiroirMemoryPressure
          expr: miroir_memory_pressure >= 2
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Miroir memory pressure is elevated"
            description: >-
              Memory pressure level {{ "{{ $value }}" }} (>=2 means moderate or higher)
              sustained for 5 minutes.
        - alert: MiroirRequestQueueBacklog
          expr: miroir_request_queue_depth > 500
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "Miroir request queue backlog is high"
            description: >-
              Request queue depth is {{ "{{ $value }}" }}, exceeding the 500 threshold.
        - alert: MiroirBackgroundJobBacklog
          expr: miroir_background_queue_depth > 100
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Miroir background job queue backlog is high"
            description: >-
              Background job queue depth is {{ "{{ $value }}" }},
              exceeding the 100 threshold for 10 minutes.
        - alert: MiroirPeerDiscoveryGap
          expr: count(miroir_peer_known) by (namespace, job) != count(miroir_node_healthy == 1) by (namespace, job)
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "Miroir peer discovery is out of sync"
            description: >-
              Number of discovered peers does not match number of healthy nodes.
        {{- /*
        Aggregated per (namespace, job), like MiroirPeerDiscoveryGap, so that a
        leaderless release is not masked by the leader of another release
        scraped by the same Prometheus.
        NOTE(review): when no target exports miroir_leader at all, the
        expression returns no series and the alert stays silent; consider
        pairing it with an absent() check.
        */}}
        - alert: MiroirNoLeader
          expr: sum by (namespace, job) (miroir_leader) == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Miroir has no elected leader"
            description: >-
              No miroir instance holds the leader lease.
              Cluster coordination is stalled.
{{- end }}

View file

@ -0,0 +1,30 @@
{{/*
Miroir Service.

Publishes two named ports for the miroir pods: "http" (service port defaults
to 7700) and "metrics" (service port defaults to 9090). Both target the
container ports by name. The manifest is emitted only while
.Values.miroir.replicas is truthy.
*/}}
{{- if .Values.miroir.replicas }}
apiVersion: v1
kind: Service
metadata:
  name: {{ include "miroir.fullname" . }}
  labels:
    {{- include "miroir.labels" . | nindent 4 }}
    app.kubernetes.io/component: miroir
  {{- if .Values.service.annotations }}
  annotations:
    {{- toYaml .Values.service.annotations | nindent 4 }}
  {{- end }}
spec:
  type: {{ default "ClusterIP" .Values.service.type }}
  ports:
    - name: http
      protocol: TCP
      port: {{ default 7700 .Values.service.ports.http }}
      targetPort: http
    - name: metrics
      protocol: TCP
      port: {{ default 9090 .Values.service.ports.metrics }}
      targetPort: metrics
  selector:
    {{- include "miroir.selectorLabels" . | nindent 4 }}
    app.kubernetes.io/component: miroir
{{- end }}

View file

@ -0,0 +1,28 @@
{{/*
Miroir ServiceMonitor (requires prometheus-operator).

Emitted only when .Values.serviceMonitor.enabled is true. Selects Services in
the release namespace that carry the chart's selector labels plus
app.kubernetes.io/component=miroir, and scrapes /metrics on the port named
"metrics" at the configured interval (30s when unset).
*/}}
{{- if .Values.serviceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ include "miroir.fullname" . }}
  labels:
    {{- include "miroir.labels" . | nindent 4 }}
    app.kubernetes.io/component: metrics
  {{- if .Values.serviceMonitor.annotations }}
  annotations:
    {{- toYaml .Values.serviceMonitor.annotations | nindent 4 }}
  {{- end }}
spec:
  namespaceSelector:
    matchNames:
      - {{ .Release.Namespace }}
  selector:
    matchLabels:
      {{- include "miroir.selectorLabels" . | nindent 6 }}
      app.kubernetes.io/component: miroir
  endpoints:
    - port: metrics
      path: /metrics
      interval: {{ default "30s" .Values.serviceMonitor.interval }}
{{- end }}

View file

@ -10,7 +10,8 @@
"type": "object",
"properties": {
"repository": { "type": "string" },
"tag": { "type": "string" }
"tag": { "type": "string" },
"pullPolicy": { "type": "string", "enum": ["Always", "IfNotPresent", "Never"] }
}
},
"replicas": {
@ -29,7 +30,19 @@
"type": "integer",
"minimum": 1
},
"existingSecret": { "type": "string" }
"existingSecret": { "type": "string" },
"podAnnotations": { "type": "object" },
"podLabels": { "type": "object" },
"resources": {
"type": "object",
"properties": {
"limits": { "type": "object" },
"requests": { "type": "object" }
}
},
"nodeSelector": { "type": "object" },
"tolerations": { "type": "array" },
"affinity": { "type": "object" }
}
},
"taskStore": {
@ -43,6 +56,145 @@
"url": { "type": "string" }
},
"required": ["backend"]
},
"hpa": {
"type": "object",
"properties": {
"enabled": { "type": "boolean" },
"minReplicas": { "type": "integer", "minimum": 1 },
"maxReplicas": { "type": "integer", "minimum": 1 },
"targetCPUUtilizationPercentage": { "type": "integer", "minimum": 1, "maximum": 100 },
"targetMemoryUtilizationPercentage": { "type": "integer", "minimum": 1, "maximum": 100 },
"behavior": { "type": "object" }
}
},
"tracing": {
"type": "object",
"properties": {
"enabled": { "type": "boolean" },
"endpoint": { "type": "string" },
"serviceName": { "type": "string" },
"sampleRate": { "type": "number", "minimum": 0, "maximum": 1 }
}
},
"serviceAccount": {
"type": "object",
"properties": {
"create": { "type": "boolean" },
"name": { "type": "string" },
"annotations": { "type": "object" }
}
},
"service": {
"type": "object",
"properties": {
"type": { "type": "string" },
"annotations": { "type": "object" },
"ports": {
"type": "object",
"properties": {
"http": { "type": "integer", "minimum": 1, "maximum": 65535 },
"metrics": { "type": "integer", "minimum": 1, "maximum": 65535 }
}
}
}
},
"serviceMonitor": {
"type": "object",
"properties": {
"enabled": { "type": "boolean" },
"interval": { "type": "string" },
"annotations": { "type": "object" }
}
},
"prometheusRule": {
"type": "object",
"properties": {
"enabled": { "type": "boolean" },
"annotations": { "type": "object" }
}
},
"headless": {
"type": "object",
"properties": {
"annotations": { "type": "object" }
}
},
"meilisearch": {
"type": "object",
"properties": {
"enabled": { "type": "boolean" },
"image": {
"type": "object",
"properties": {
"repository": { "type": "string" },
"tag": { "type": "string" },
"pullPolicy": { "type": "string" }
}
},
"replicas": { "type": "integer", "minimum": 1 },
"nodesPerGroup": { "type": "integer", "minimum": 1 },
"podAnnotations": { "type": "object" },
"podLabels": { "type": "object" },
"resources": { "type": "object" },
"nodeSelector": { "type": "object" },
"tolerations": { "type": "array" },
"affinity": { "type": "object" },
"persistence": {
"type": "object",
"properties": {
"enabled": { "type": "boolean" },
"size": { "type": "string" },
"storageClass": { "type": "string" }
}
},
"env": { "type": "array" },
"masterKey": { "type": "string" }
}
},
"redis": {
"type": "object",
"properties": {
"enabled": { "type": "boolean" },
"image": {
"type": "object",
"properties": {
"repository": { "type": "string" },
"tag": { "type": "string" },
"pullPolicy": { "type": "string" }
}
},
"replicas": { "type": "integer", "minimum": 1 },
"podAnnotations": { "type": "object" },
"podLabels": { "type": "object" },
"resources": { "type": "object" },
"nodeSelector": { "type": "object" },
"tolerations": { "type": "array" },
"affinity": { "type": "object" },
"persistence": {
"type": "object",
"properties": {
"enabled": { "type": "boolean" },
"size": { "type": "string" },
"storageClass": { "type": "string" }
}
},
"service": {
"type": "object",
"properties": {
"type": { "type": "string" },
"port": { "type": "integer", "minimum": 1, "maximum": 65535 }
}
}
}
},
"cdcPvc": {
"type": "object",
"properties": {
"enabled": { "type": "boolean" },
"size": { "type": "string" },
"storageClass": { "type": "string" }
}
}
},
"if": {

View file

@ -1,16 +1,153 @@
# Miroir Helm chart — default values.
#
# Out of the box these settings start a single-pod install suitable for
# evaluation and CI. A production install should override at least:
#   miroir.replicas=2+, miroir.replicationFactor=2, miroir.replicaGroups=2,
#   taskStore.backend=redis, redis.enabled=true, hpa.enabled=true

# Miroir workload
miroir:
  image:
    repository: ghcr.io/jedarden/miroir
    tag: ""  # empty: falls back to Chart.appVersion
    pullPolicy: IfNotPresent
  replicas: 1  # dev default; production wants 2+ (needs taskStore.backend=redis)
  shards: 64
  replicationFactor: 1  # dev default; use 2 in production
  replicaGroups: 1  # dev default; use 2 in production
  existingSecret: ""  # K8s Secret holding masterKey, nodeMasterKey, adminApiKey
  podAnnotations: {}
  podLabels: {}
  resources:
    limits:
      cpu: 1000m
      memory: 1Gi
    requests:
      cpu: 250m
      memory: 512Mi
  nodeSelector: {}
  tolerations: []
  affinity: {}

# Task store backend
taskStore:
  backend: sqlite  # one of: sqlite, redis
  path: /data/miroir-tasks.db
  url: ""  # redis backend only, e.g. redis://host:6379

# Horizontal Pod Autoscaler (off by default for dev installs)
hpa:
  enabled: false
  minReplicas: 2
  maxReplicas: 10
  targetCPUUtilizationPercentage: 70
  targetMemoryUtilizationPercentage: 80
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 50
          periodSeconds: 60
    scaleUp:
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 100
          periodSeconds: 30
        - type: Pods
          value: 2
          periodSeconds: 60
      selectPolicy: Max

# ServiceAccount
serviceAccount:
  create: true
  name: ""  # empty: falls back to the release name
  annotations: {}

# Services
service:
  type: ClusterIP
  annotations: {}
  ports:
    http: 7700
    metrics: 9090
headless:
  annotations: {}

# Meilisearch StatefulSet
meilisearch:
  enabled: true
  image:
    repository: getmeilisearch/meilisearch
    tag: "v1.12"
    pullPolicy: IfNotPresent
  replicas: 2  # dev default: 1 group × 2 nodes
  nodesPerGroup: 2  # number of nodes in each replica group
  podAnnotations: {}
  podLabels: {}
  resources:
    limits:
      cpu: 2000m
      memory: 2Gi
    requests:
      cpu: 500m
      memory: 1Gi
  nodeSelector: {}
  tolerations: []
  affinity: {}
  persistence:
    enabled: true
    size: 10Gi
    storageClass: ""  # empty: cluster default storage class
  env: []
  masterKey: ""  # empty: auto-generated

# Redis deployment (only relevant when taskStore.backend=redis)
redis:
  enabled: false  # dev default; turn on for production
  image:
    repository: redis
    tag: "7-alpine"
    pullPolicy: IfNotPresent
  replicas: 1
  podAnnotations: {}
  podLabels: {}
  resources:
    limits:
      cpu: 500m
      memory: 512Mi
    requests:
      cpu: 100m
      memory: 128Mi
  nodeSelector: {}
  tolerations: []
  affinity: {}
  persistence:
    enabled: true
    size: 1Gi
    storageClass: ""
  service:
    type: ClusterIP
    port: 6379

# PVC backing the CDC buffer (for cdc.buffer.primary=pvc or overflow=pvc)
cdcPvc:
  enabled: false
  size: 5Gi
  storageClass: ""

# Prometheus Operator integration (plan §10 + §14.9)
serviceMonitor:
  enabled: false  # needs prometheus-operator in the cluster
  interval: 30s
  annotations: {}
prometheusRule:
  enabled: false  # needs prometheus-operator in the cluster
  annotations: {}

# OpenTelemetry tracing (plan §10)
tracing:
  enabled: false  # off by default for zero overhead
  endpoint: "http://tempo.monitoring.svc:4317"  # OTLP gRPC endpoint
  serviceName: "miroir"  # service name used to identify traces
  sampleRate: 0.1  # head-based sampling: 0.1 = ~10% of requests traced