P7.4: Add ServiceMonitor and PrometheusRule manifests (plan §10 + §14.9)
ServiceMonitor scrapes the metrics port (9090) at 30s intervals. PrometheusRule ships all 12 alerts: 7 availability (degraded shards, node down, high latency, stuck tasks, stuck rebalance, settings divergence, anti-entropy mismatch) + 5 resource pressure (memory, request queue, background queue, peer discovery, no leader). Both gated behind serviceMonitor.enabled / prometheusRule.enabled (defaults: false — requires prometheus-operator in cluster). Also adds metrics port to the miroir Service so ServiceMonitor can select it. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
13d4430d2a
commit
ea6be6a339
5 changed files with 501 additions and 2 deletions
152
charts/miroir/templates/miroir-prometheusrule.yaml
Normal file
152
charts/miroir/templates/miroir-prometheusrule.yaml
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
{{/*
Miroir PrometheusRule — alerting rules (plan §10 + §14.9).

Rendered only when .Values.prometheusRule.enabled is true; the resource is a
prometheus-operator CRD (monitoring.coreos.com/v1).

Two rule groups are shipped:
  - miroir.availability      (7 alerts)
  - miroir.resource_pressure (5 alerts)

Alert-time placeholders ($value, $labels.*) are written as Go string literals
so that Helm emits them verbatim and Prometheus expands them when the alert
fires; only "miroir.fullname" is resolved at chart render time.
*/}}
{{- if .Values.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "miroir.fullname" . }}
  labels:
    {{- include "miroir.labels" . | nindent 4 }}
    app.kubernetes.io/component: metrics
  {{- with .Values.prometheusRule.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    - name: miroir.availability
      rules:
        - alert: MiroirDegradedShards
          expr: miroir_degraded_shards_total > 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "Miroir has degraded shards"
            description: >-
              {{ include "miroir.fullname" . }} has {{ "{{ $value }}" }} degraded shard(s).
              Self-healing should resolve this; alert fires when healing failed.

        - alert: MiroirNodeDown
          expr: miroir_node_healthy == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Miroir node is down"
            description: >-
              Node {{ "{{ $labels.node_id }}" }} in replica group
              {{ "{{ $labels.replica_group }}" }} has been unhealthy for 5 minutes.

        - alert: MiroirHighSearchLatency
          expr: histogram_quantile(0.95, sum(rate(miroir_search_duration_seconds_bucket[5m])) by (le)) > 2
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Miroir search latency is high"
            description: >-
              p95 search latency is {{ "{{ $value | humanizeDuration }}" }},
              exceeding the 2s threshold.

        - alert: MiroirTaskStuck
          expr: miroir_task_processing_age_seconds > 3600
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Miroir task stuck in processing"
            description: >-
              Task {{ "{{ $labels.task_type }}" }} has been processing for
              {{ "{{ $value | humanizeDuration }}" }}.

        # The expression value is the gauge itself (always 1 while firing), so
        # the description states the "for" duration instead of printing $value.
        - alert: MiroirRebalanceStuck
          expr: miroir_rebalance_in_progress == 1
          for: 2h
          labels:
            severity: warning
          annotations:
            summary: "Miroir rebalance has been running for over 2 hours"
            description: >-
              A shard rebalance has been in progress for more than 2 hours.
              This usually indicates a stuck migration.

        # Counts the number of DISTINCT sample values reported per setting.
        # A nested "count by (setting)" would always yield exactly 1 per
        # setting and could never exceed the threshold.
        # NOTE(review): assumes the sample value of miroir_node_setting_value
        # encodes the setting (number or hash) — confirm against the exporter.
        - alert: MiroirSettingsDivergence
          expr: count by (setting) (count_values by (setting) ("value", miroir_node_setting_value)) > 1
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "Miroir settings divergence detected"
            description: >-
              Setting {{ "{{ $labels.setting }}" }} has divergent values across nodes.
              Self-healing (§13.5) should auto-repair; alert fires when it hasn't.

        # NOTE(review): the expression looks at a 6h window while the text
        # speaks of consecutive passes over ~18h; increase() also extrapolates,
        # so $value may be fractional. Confirm the intended window and wording.
        - alert: MiroirAntientropyMismatch
          expr: increase(miroir_antientropy_mismatch_total[6h]) >= 3
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "Miroir anti-entropy found persistent mismatches"
            description: >-
              Anti-entropy repair has detected mismatches on
              {{ "{{ $value }}" }} consecutive passes (≈18h default schedule).
              Self-healing (§13.8) failed to close the gap.

    - name: miroir.resource_pressure
      rules:
        - alert: MiroirMemoryPressure
          expr: miroir_memory_pressure >= 2
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Miroir memory pressure is elevated"
            description: >-
              Memory pressure level {{ "{{ $value }}" }} (>=2 means moderate or higher)
              sustained for 5 minutes.

        - alert: MiroirRequestQueueBacklog
          expr: miroir_request_queue_depth > 500
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "Miroir request queue backlog is high"
            description: >-
              Request queue depth is {{ "{{ $value }}" }}, exceeding the 500 threshold.

        - alert: MiroirBackgroundJobBacklog
          expr: miroir_background_queue_depth > 100
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Miroir background job queue backlog is high"
            description: >-
              Background job queue depth is {{ "{{ $value }}" }},
              exceeding the 100 threshold for 10 minutes.

        - alert: MiroirPeerDiscoveryGap
          expr: count(miroir_peer_known) by (namespace, job) != count(miroir_node_healthy == 1) by (namespace, job)
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "Miroir peer discovery is out of sync"
            description: >-
              Number of discovered peers does not match number of healthy nodes.

        # sum() over an empty vector returns no sample, so "== 0" alone stays
        # silent when every instance is down or unscraped; absent() covers
        # that case.
        - alert: MiroirNoLeader
          expr: sum(miroir_leader) == 0 or absent(miroir_leader)
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Miroir has no elected leader"
            description: >-
              No miroir instance reports holding the leader lease
              (or the leader metric is missing entirely).
              Cluster coordination may be stalled.
{{- end }}
|
||||
30
charts/miroir/templates/miroir-service.yaml
Normal file
30
charts/miroir/templates/miroir-service.yaml
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
{{/*
Service in front of the miroir pods.

Publishes two named ports, "http" and "metrics", each forwarding to the
container port of the same name. Emitted only when .Values.miroir.replicas
is truthy.
*/}}
{{- if .Values.miroir.replicas }}
apiVersion: v1
kind: Service
metadata:
  name: {{ include "miroir.fullname" . }}
  labels:
    {{- include "miroir.labels" . | nindent 4 }}
    app.kubernetes.io/component: miroir
  {{- if .Values.service.annotations }}
  annotations:
    {{- toYaml .Values.service.annotations | nindent 4 }}
  {{- end }}
spec:
  type: {{ default "ClusterIP" .Values.service.type }}
  selector:
    {{- include "miroir.selectorLabels" . | nindent 4 }}
    app.kubernetes.io/component: miroir
  ports:
    - name: http
      protocol: TCP
      port: {{ default 7700 .Values.service.ports.http }}
      targetPort: http
    - name: metrics
      protocol: TCP
      port: {{ default 9090 .Values.service.ports.metrics }}
      targetPort: metrics
{{- end }}
|
||||
28
charts/miroir/templates/miroir-servicemonitor.yaml
Normal file
28
charts/miroir/templates/miroir-servicemonitor.yaml
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
{{/*
ServiceMonitor for miroir (prometheus-operator CRD, monitoring.coreos.com/v1).

Emitted only when .Values.serviceMonitor.enabled is true. Selects Services in
the release namespace carrying the chart's selector labels plus
component=miroir, and scrapes /metrics on the port named "metrics".
*/}}
{{- if .Values.serviceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ include "miroir.fullname" . }}
  labels:
    {{- include "miroir.labels" . | nindent 4 }}
    app.kubernetes.io/component: metrics
  {{- if .Values.serviceMonitor.annotations }}
  annotations:
    {{- toYaml .Values.serviceMonitor.annotations | nindent 4 }}
  {{- end }}
spec:
  namespaceSelector:
    matchNames:
      - {{ .Release.Namespace }}
  selector:
    matchLabels:
      {{- include "miroir.selectorLabels" . | nindent 6 }}
      app.kubernetes.io/component: miroir
  endpoints:
    - path: /metrics
      port: metrics
      # Falls back to 30s when serviceMonitor.interval is unset or empty.
      interval: {{ default "30s" .Values.serviceMonitor.interval }}
{{- end }}
|
||||
|
|
@ -10,7 +10,8 @@
|
|||
"type": "object",
|
||||
"properties": {
|
||||
"repository": { "type": "string" },
|
||||
"tag": { "type": "string" }
|
||||
"tag": { "type": "string" },
|
||||
"pullPolicy": { "type": "string", "enum": ["Always", "IfNotPresent", "Never"] }
|
||||
}
|
||||
},
|
||||
"replicas": {
|
||||
|
|
@ -29,7 +30,19 @@
|
|||
"type": "integer",
|
||||
"minimum": 1
|
||||
},
|
||||
"existingSecret": { "type": "string" }
|
||||
"existingSecret": { "type": "string" },
|
||||
"podAnnotations": { "type": "object" },
|
||||
"podLabels": { "type": "object" },
|
||||
"resources": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"limits": { "type": "object" },
|
||||
"requests": { "type": "object" }
|
||||
}
|
||||
},
|
||||
"nodeSelector": { "type": "object" },
|
||||
"tolerations": { "type": "array" },
|
||||
"affinity": { "type": "object" }
|
||||
}
|
||||
},
|
||||
"taskStore": {
|
||||
|
|
@ -43,6 +56,145 @@
|
|||
"url": { "type": "string" }
|
||||
},
|
||||
"required": ["backend"]
|
||||
},
|
||||
"hpa": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": { "type": "boolean" },
|
||||
"minReplicas": { "type": "integer", "minimum": 1 },
|
||||
"maxReplicas": { "type": "integer", "minimum": 1 },
|
||||
"targetCPUUtilizationPercentage": { "type": "integer", "minimum": 1, "maximum": 100 },
|
||||
"targetMemoryUtilizationPercentage": { "type": "integer", "minimum": 1, "maximum": 100 },
|
||||
"behavior": { "type": "object" }
|
||||
}
|
||||
},
|
||||
"tracing": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": { "type": "boolean" },
|
||||
"endpoint": { "type": "string" },
|
||||
"serviceName": { "type": "string" },
|
||||
"sampleRate": { "type": "number", "minimum": 0, "maximum": 1 }
|
||||
}
|
||||
},
|
||||
"serviceAccount": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"create": { "type": "boolean" },
|
||||
"name": { "type": "string" },
|
||||
"annotations": { "type": "object" }
|
||||
}
|
||||
},
|
||||
"service": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"type": { "type": "string" },
|
||||
"annotations": { "type": "object" },
|
||||
"ports": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"http": { "type": "integer", "minimum": 1, "maximum": 65535 },
|
||||
"metrics": { "type": "integer", "minimum": 1, "maximum": 65535 }
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"serviceMonitor": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": { "type": "boolean" },
|
||||
"interval": { "type": "string" },
|
||||
"annotations": { "type": "object" }
|
||||
}
|
||||
},
|
||||
"prometheusRule": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": { "type": "boolean" },
|
||||
"annotations": { "type": "object" }
|
||||
}
|
||||
},
|
||||
"headless": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"annotations": { "type": "object" }
|
||||
}
|
||||
},
|
||||
"meilisearch": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": { "type": "boolean" },
|
||||
"image": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"repository": { "type": "string" },
|
||||
"tag": { "type": "string" },
|
||||
"pullPolicy": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"replicas": { "type": "integer", "minimum": 1 },
|
||||
"nodesPerGroup": { "type": "integer", "minimum": 1 },
|
||||
"podAnnotations": { "type": "object" },
|
||||
"podLabels": { "type": "object" },
|
||||
"resources": { "type": "object" },
|
||||
"nodeSelector": { "type": "object" },
|
||||
"tolerations": { "type": "array" },
|
||||
"affinity": { "type": "object" },
|
||||
"persistence": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": { "type": "boolean" },
|
||||
"size": { "type": "string" },
|
||||
"storageClass": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"env": { "type": "array" },
|
||||
"masterKey": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"redis": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": { "type": "boolean" },
|
||||
"image": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"repository": { "type": "string" },
|
||||
"tag": { "type": "string" },
|
||||
"pullPolicy": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"replicas": { "type": "integer", "minimum": 1 },
|
||||
"podAnnotations": { "type": "object" },
|
||||
"podLabels": { "type": "object" },
|
||||
"resources": { "type": "object" },
|
||||
"nodeSelector": { "type": "object" },
|
||||
"tolerations": { "type": "array" },
|
||||
"affinity": { "type": "object" },
|
||||
"persistence": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": { "type": "boolean" },
|
||||
"size": { "type": "string" },
|
||||
"storageClass": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"service": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"type": { "type": "string" },
|
||||
"port": { "type": "integer", "minimum": 1, "maximum": 65535 }
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"cdcPvc": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": { "type": "boolean" },
|
||||
"size": { "type": "string" },
|
||||
"storageClass": { "type": "string" }
|
||||
}
|
||||
}
|
||||
},
|
||||
"if": {
|
||||
|
|
|
|||
|
|
@ -1,16 +1,153 @@
|
|||
# Miroir Helm Chart Values
#
# The defaults below start a working single-pod install intended for
# evaluation and CI. Production deployments should override:
#   replicas=2+, replicationFactor=2, replicaGroups=2,
#   taskStore.backend=redis, redis.enabled=true, hpa.enabled=true

miroir:
  image:
    repository: ghcr.io/jedarden/miroir
    # Empty tag falls back to Chart.appVersion.
    tag: ''
    pullPolicy: IfNotPresent
  # Dev default; use 2+ in production (requires taskStore.backend=redis).
  replicas: 1
  shards: 64
  # Dev default; use 2 in production.
  replicationFactor: 1
  # Dev default; use 2 in production.
  replicaGroups: 1
  # Name of the K8s Secret holding masterKey, nodeMasterKey, adminApiKey.
  existingSecret: ''
  podAnnotations: {}
  podLabels: {}
  resources:
    limits:
      cpu: 1000m
      memory: 1Gi
    requests:
      cpu: 250m
      memory: 512Mi
  nodeSelector: {}
  tolerations: []
  affinity: {}

taskStore:
  # One of: sqlite | redis
  backend: sqlite
  path: /data/miroir-tasks.db
  # Redis only, e.g. redis://host:6379
  url: ''

# Horizontal Pod Autoscaler (off by default for dev)
hpa:
  enabled: false
  minReplicas: 2
  maxReplicas: 10
  targetCPUUtilizationPercentage: 70
  targetMemoryUtilizationPercentage: 80
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 50
          periodSeconds: 60
    scaleUp:
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 100
          periodSeconds: 30
        - type: Pods
          value: 2
          periodSeconds: 60
      selectPolicy: Max

# ServiceAccount
serviceAccount:
  create: true
  # Empty name falls back to the release name.
  name: ''
  annotations: {}

# Services
service:
  type: ClusterIP
  annotations: {}
  ports:
    http: 7700
    metrics: 9090

headless:
  annotations: {}

# Meilisearch StatefulSet
meilisearch:
  enabled: true
  image:
    repository: getmeilisearch/meilisearch
    tag: v1.12
    pullPolicy: IfNotPresent
  # Dev default: 1 group × 2 nodes.
  replicas: 2
  # Number of nodes in each replica group.
  nodesPerGroup: 2
  podAnnotations: {}
  podLabels: {}
  resources:
    limits:
      cpu: 2000m
      memory: 2Gi
    requests:
      cpu: 500m
      memory: 1Gi
  nodeSelector: {}
  tolerations: []
  affinity: {}
  persistence:
    enabled: true
    size: 10Gi
    # Empty selects the cluster's default storage class.
    storageClass: ''
  env: []
  # Empty means a key is auto-generated.
  masterKey: ''

# Redis deployment (only when taskStore.backend=redis)
redis:
  # Dev default; enable for production.
  enabled: false
  image:
    repository: redis
    tag: 7-alpine
    pullPolicy: IfNotPresent
  replicas: 1
  podAnnotations: {}
  podLabels: {}
  resources:
    limits:
      cpu: 500m
      memory: 512Mi
    requests:
      cpu: 100m
      memory: 128Mi
  nodeSelector: {}
  tolerations: []
  affinity: {}
  persistence:
    enabled: true
    size: 1Gi
    storageClass: ''
  service:
    type: ClusterIP
    port: 6379

# PVC for the CDC buffer (when cdc.buffer.primary=pvc or overflow=pvc)
cdcPvc:
  enabled: false
  size: 5Gi
  storageClass: ''

# Prometheus Operator integration (plan §10 + §14.9)
serviceMonitor:
  # Needs prometheus-operator installed in the cluster.
  enabled: false
  interval: 30s
  annotations: {}

prometheusRule:
  # Needs prometheus-operator installed in the cluster.
  enabled: false
  annotations: {}

# OpenTelemetry tracing (plan §10)
tracing:
  # Off by default for zero overhead.
  enabled: false
  # OTLP gRPC endpoint.
  endpoint: 'http://tempo.monitoring.svc:4317'
  # Service name used to identify traces.
  serviceName: 'miroir'
  # Head-based sampling: 0.1 = ~10% of requests traced.
  sampleRate: 0.1
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue