diff --git a/charts/miroir/templates/miroir-prometheusrule.yaml b/charts/miroir/templates/miroir-prometheusrule.yaml new file mode 100644 index 0000000..7357ed5 --- /dev/null +++ b/charts/miroir/templates/miroir-prometheusrule.yaml @@ -0,0 +1,152 @@ +{{/* +Miroir PrometheusRule — alerting rules (plan §10 + §14.9). Rendered only when prometheusRule.enabled is set; the resource kind is provided by prometheus-operator (monitoring.coreos.com/v1). +*/}} +{{- if .Values.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "miroir.fullname" . }} + labels: + {{- include "miroir.labels" . | nindent 4 }} + app.kubernetes.io/component: metrics + {{- with .Values.prometheusRule.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + - name: miroir.availability + rules: + - alert: MiroirDegradedShards + expr: miroir_degraded_shards_total > 0 # NOTE(review): the _total suffix suggests a counter; if it is one, this stays true forever after the first degraded shard. Confirm it is a gauge. + for: 2m + labels: + severity: warning + annotations: # the quoted-brace form below makes Helm emit the inner braces literally, so Prometheus expands $value and $labels at alert time + summary: "Miroir has degraded shards" + description: >- + {{ include "miroir.fullname" . }} has {{ "{{ $value }}" }} degraded shard(s). + Self-healing should resolve this; alert fires when healing failed. + + - alert: MiroirNodeDown + expr: miroir_node_healthy == 0 + for: 5m # keep in sync with the "5 minutes" wording in the description + labels: + severity: critical + annotations: + summary: "Miroir node is down" + description: >- + Node {{ "{{ $labels.node_id }}" }} in replica group + {{ "{{ $labels.replica_group }}" }} has been unhealthy for 5 minutes. + + - alert: MiroirHighSearchLatency + expr: histogram_quantile(0.95, sum(rate(miroir_search_duration_seconds_bucket[5m])) by (le)) > 2 # p95 from the 5m bucket rate, summed over every label except le; threshold is 2 seconds + for: 5m + labels: + severity: warning + annotations: + summary: "Miroir search latency is high" + description: >- + p95 search latency is {{ "{{ $value | humanizeDuration }}" }}, + exceeding the 2s threshold.
+ + - alert: MiroirTaskStuck + expr: miroir_task_processing_age_seconds > 3600 # processing age above one hour, held for the 10m below + for: 10m + labels: + severity: warning + annotations: + summary: "Miroir task stuck in processing" + description: >- + Task {{ "{{ $labels.task_type }}" }} has been processing for + {{ "{{ $value | humanizeDuration }}" }}. + + - alert: MiroirRebalanceStuck + expr: miroir_rebalance_in_progress == 1 + for: 2h # the description states this duration in words because $value is the matched sample (always 1), not an elapsed time + labels: + severity: warning + annotations: + summary: "Miroir rebalance has been running for over 2 hours" + description: >- + A shard rebalance has been in progress for more than 2 hours. + This usually indicates a stuck migration. + + - alert: MiroirSettingsDivergence + expr: count by (setting) (count_values by (setting) ("value", miroir_node_setting_value)) > 1 # number of distinct sample values per setting; assumes the sample value encodes the setting (number or hash) — TODO confirm against the exporter + for: 15m + labels: + severity: warning + annotations: + summary: "Miroir settings divergence detected" + description: >- + Setting {{ "{{ $labels.setting }}" }} has divergent values across nodes. + Self-healing (§13.5) should auto-repair; alert fires when it hasn't. + + - alert: MiroirAntientropyMismatch + expr: increase(miroir_antientropy_mismatch_total[6h]) >= 3 # NOTE(review): the description speaks of consecutive passes over ≈18h, but this window is 6h and increase() can return fractions — confirm window and wording + for: 0m + labels: + severity: warning + annotations: + summary: "Miroir anti-entropy found persistent mismatches" + description: >- + Anti-entropy repair has detected mismatches on + {{ "{{ $value }}" }} consecutive passes (≈18h default schedule). + Self-healing (§13.8) failed to close the gap. + + - name: miroir.resource_pressure + rules: + - alert: MiroirMemoryPressure + expr: miroir_memory_pressure >= 2 # the level scale is not defined in this file; the description reads 2 as "moderate" + for: 5m + labels: + severity: warning + annotations: + summary: "Miroir memory pressure is elevated" + description: >- + Memory pressure level {{ "{{ $value }}" }} (>=2 means moderate or higher) + sustained for 5 minutes.
+ + - alert: MiroirRequestQueueBacklog + expr: miroir_request_queue_depth > 500 + for: 2m + labels: + severity: warning + annotations: + summary: "Miroir request queue backlog is high" + description: >- + Request queue depth is {{ "{{ $value }}" }}, exceeding the 500 threshold. + + - alert: MiroirBackgroundJobBacklog + expr: miroir_background_queue_depth > 100 + for: 10m # keep in sync with the "10 minutes" wording in the description + labels: + severity: warning + annotations: + summary: "Miroir background job queue backlog is high" + description: >- + Background job queue depth is {{ "{{ $value }}" }}, + exceeding the 100 threshold for 10 minutes. + + - alert: MiroirPeerDiscoveryGap + expr: count(miroir_peer_known) by (namespace, job) != count(miroir_node_healthy == 1) by (namespace, job) # compares series counts per (namespace, job); a pair missing on either side yields no result, so this cannot fire when no node is healthy + for: 2m + labels: + severity: warning + annotations: + summary: "Miroir peer discovery is out of sync" + description: >- + Number of discovered peers does not match number of healthy nodes. + + - alert: MiroirNoLeader + expr: sum(miroir_leader) == 0 or absent(miroir_leader) # absent() covers the case where no instance exports the metric; sum() over nothing is empty and would never compare equal to 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Miroir has no elected leader" + description: >- + No miroir instance holds the leader lease. + Cluster coordination is stalled. +{{- end }} diff --git a/charts/miroir/templates/miroir-service.yaml b/charts/miroir/templates/miroir-service.yaml new file mode 100644 index 0000000..aff3072 --- /dev/null +++ b/charts/miroir/templates/miroir-service.yaml @@ -0,0 +1,30 @@ +{{/* +Miroir Service. Rendered only when miroir.replicas is non-zero. +*/}} +{{- if .Values.miroir.replicas }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "miroir.fullname" . }} + labels: + {{- include "miroir.labels" . | nindent 4 }} + app.kubernetes.io/component: miroir + {{- with .Values.service.annotations }} + annotations: + {{- toYaml .
| nindent 4 }} + {{- end }} +spec: + type: {{ .Values.service.type | default "ClusterIP" }} + ports: + - port: {{ .Values.service.ports.http | default 7700 }} + targetPort: http # refers to a container port by name; the workload template that defines it is not part of this view + protocol: TCP + name: http + - port: {{ .Values.service.ports.metrics | default 9090 }} + targetPort: metrics + protocol: TCP + name: metrics # the ServiceMonitor endpoint selects this port by name + selector: + {{- include "miroir.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: miroir +{{- end }} diff --git a/charts/miroir/templates/miroir-servicemonitor.yaml b/charts/miroir/templates/miroir-servicemonitor.yaml new file mode 100644 index 0000000..45d3817 --- /dev/null +++ b/charts/miroir/templates/miroir-servicemonitor.yaml @@ -0,0 +1,28 @@ +{{/* +Miroir ServiceMonitor (requires prometheus-operator). Rendered only when serviceMonitor.enabled is set; it matches Services carrying the chart selector labels and component "miroir". +*/}} +{{- if .Values.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "miroir.fullname" . }} + labels: + {{- include "miroir.labels" . | nindent 4 }} + app.kubernetes.io/component: metrics + {{- with .Values.serviceMonitor.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + {{- include "miroir.selectorLabels" .
| nindent 6 }} + app.kubernetes.io/component: miroir + endpoints: + - port: metrics # name of the Service port to scrape + interval: {{ .Values.serviceMonitor.interval | default "30s" }} + path: /metrics + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} # restricts discovery to the release namespace +{{- end }} diff --git a/charts/miroir/values.schema.json b/charts/miroir/values.schema.json index bc7d267..fd79990 100644 --- a/charts/miroir/values.schema.json +++ b/charts/miroir/values.schema.json @@ -10,7 +10,8 @@ "type": "object", "properties": { "repository": { "type": "string" }, - "tag": { "type": "string" } + "tag": { "type": "string" }, + "pullPolicy": { "type": "string", "enum": ["Always", "IfNotPresent", "Never"] } } }, "replicas": { @@ -29,7 +30,19 @@ "type": "integer", "minimum": 1 }, - "existingSecret": { "type": "string" } + "existingSecret": { "type": "string" }, + "podAnnotations": { "type": "object" }, + "podLabels": { "type": "object" }, + "resources": { + "type": "object", + "properties": { + "limits": { "type": "object" }, + "requests": { "type": "object" } + } + }, + "nodeSelector": { "type": "object" }, + "tolerations": { "type": "array" }, + "affinity": { "type": "object" } } }, "taskStore": { @@ -43,6 +56,145 @@ "url": { "type": "string" } }, "required": ["backend"] + }, + "hpa": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "minReplicas": { "type": "integer", "minimum": 1 }, + "maxReplicas": { "type": "integer", "minimum": 1 }, + "targetCPUUtilizationPercentage": { "type": "integer", "minimum": 1, "maximum": 100 }, + "targetMemoryUtilizationPercentage": { "type": "integer", "minimum": 1, "maximum": 100 }, + "behavior": { "type": "object" } + } + }, + "tracing": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "endpoint": { "type": "string" }, + "serviceName": { "type": "string" }, + "sampleRate": { "type": "number", "minimum": 0, "maximum": 1 } + } + }, + "serviceAccount": { + "type": "object", + "properties": { + "create": { "type": "boolean" }, + 
"name": { "type": "string" }, + "annotations": { "type": "object" } + } + }, + "service": { + "type": "object", + "properties": { + "type": { "type": "string" }, + "annotations": { "type": "object" }, + "ports": { + "type": "object", + "properties": { + "http": { "type": "integer", "minimum": 1, "maximum": 65535 }, + "metrics": { "type": "integer", "minimum": 1, "maximum": 65535 } + } + } + } + }, + "serviceMonitor": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "interval": { "type": "string" }, + "annotations": { "type": "object" } + } + }, + "prometheusRule": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "annotations": { "type": "object" } + } + }, + "headless": { + "type": "object", + "properties": { + "annotations": { "type": "object" } + } + }, + "meilisearch": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "image": { + "type": "object", + "properties": { + "repository": { "type": "string" }, + "tag": { "type": "string" }, + "pullPolicy": { "type": "string", "enum": ["Always", "IfNotPresent", "Never"] } + } + }, + "replicas": { "type": "integer", "minimum": 1 }, + "nodesPerGroup": { "type": "integer", "minimum": 1 }, + "podAnnotations": { "type": "object" }, + "podLabels": { "type": "object" }, + "resources": { "type": "object" }, + "nodeSelector": { "type": "object" }, + "tolerations": { "type": "array" }, + "affinity": { "type": "object" }, + "persistence": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "size": { "type": "string" }, + "storageClass": { "type": "string" } + } + }, + "env": { "type": "array" }, + "masterKey": { "type": "string" } + } + }, + "redis": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "image": { + "type": "object", + "properties": { + "repository": { "type": "string" }, + "tag": { "type": "string" }, + "pullPolicy": { "type": "string", "enum": ["Always", "IfNotPresent", "Never"] } + } + }, + "replicas": { "type": "integer", "minimum": 1 }, + "podAnnotations": { 
"type": "object" }, + "podLabels": { "type": "object" }, + "resources": { "type": "object" }, + "nodeSelector": { "type": "object" }, + "tolerations": { "type": "array" }, + "affinity": { "type": "object" }, + "persistence": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "size": { "type": "string" }, + "storageClass": { "type": "string" } + } + }, + "service": { + "type": "object", + "properties": { + "type": { "type": "string" }, + "port": { "type": "integer", "minimum": 1, "maximum": 65535 } + } + } + } + }, + "cdcPvc": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" }, + "size": { "type": "string" }, + "storageClass": { "type": "string" } + } } }, "if": { diff --git a/charts/miroir/values.yaml b/charts/miroir/values.yaml index 61e38a8..c776084 100644 --- a/charts/miroir/values.yaml +++ b/charts/miroir/values.yaml @@ -1,16 +1,153 @@ # Miroir Helm Chart Values +# These defaults boot a working evaluation/CI install with a single miroir replica; the bundled Meilisearch (meilisearch.enabled, replicas: 2) is configured separately below.
+# For production, override to: replicas=2+, replicationFactor=2, replicaGroups=2, +# taskStore.backend=redis, redis.enabled=true, hpa.enabled=true miroir: image: repository: ghcr.io/jedarden/miroir tag: "" # defaults to Chart.appVersion + pullPolicy: IfNotPresent replicas: 1 # dev default: override to 2+ in production (requires taskStore.backend=redis) shards: 64 replicationFactor: 1 # dev default: override to 2 in production replicaGroups: 1 # dev default: override to 2 in production existingSecret: "" # name of K8s Secret with masterKey, nodeMasterKey, adminApiKey + podAnnotations: {} + podLabels: {} + resources: + limits: + cpu: 1000m + memory: 1Gi + requests: + cpu: 250m + memory: 512Mi + nodeSelector: {} + tolerations: [] + affinity: {} taskStore: backend: sqlite # sqlite | redis path: /data/miroir-tasks.db url: "" # for redis: redis://host:6379 + +# Horizontal Pod Autoscaler (disabled by default for dev) +hpa: + enabled: false # NOTE(review): minReplicas below is 2, and miroir.replicas above says two or more replicas require taskStore.backend=redis — confirm the chart guards this combination + minReplicas: 2 + maxReplicas: 10 + targetCPUUtilizationPercentage: 70 + targetMemoryUtilizationPercentage: 80 + behavior: + scaleDown: + stabilizationWindowSeconds: 300 + policies: + - type: Percent + value: 50 + periodSeconds: 60 + scaleUp: + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 100 + periodSeconds: 30 + - type: Pods + value: 2 + periodSeconds: 60 + selectPolicy: Max + +# ServiceAccount +serviceAccount: + create: true + name: "" # defaults to release name + annotations: {} + +# Services +service: + type: ClusterIP # the Service template falls back to ClusterIP when unset + annotations: {} + ports: + http: 7700 # Service port named "http"; the template falls back to 7700 when unset + metrics: 9090 # Service port named "metrics", scraped by the ServiceMonitor; the template falls back to 9090 when unset + +headless: + annotations: {} + +# Meilisearch StatefulSet +meilisearch: + enabled: true + image: + repository: getmeilisearch/meilisearch + tag: v1.12 + pullPolicy: IfNotPresent + replicas: 2 # 1 group × 2 nodes (dev default) + nodesPerGroup: 2 # nodes per replica group + podAnnotations: {} + podLabels: {} + resources: + limits: + cpu: 2000m + memory: 2Gi + requests: + cpu: 500m + memory: 1Gi + nodeSelector: {} + tolerations: [] + 
affinity: {} + persistence: + enabled: true + size: 10Gi + storageClass: "" # uses default storage class + env: [] + masterKey: "" # defaults to auto-generated + +# Redis deployment (only when taskStore.backend=redis) +redis: + enabled: false # dev default: enable for production + image: + repository: redis + tag: 7-alpine + pullPolicy: IfNotPresent + replicas: 1 + podAnnotations: {} + podLabels: {} + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi + nodeSelector: {} + tolerations: [] + affinity: {} + persistence: + enabled: true + size: 1Gi + storageClass: "" + service: + type: ClusterIP + port: 6379 + +# PVC for CDC buffer (when cdc.buffer.primary=pvc or overflow=pvc) +cdcPvc: + enabled: false + size: 5Gi + storageClass: "" + +# Prometheus Operator integration (plan §10 + §14.9) +serviceMonitor: + enabled: false # requires prometheus-operator in cluster + interval: 30s # scrape interval; the ServiceMonitor template falls back to 30s when unset + annotations: {} + +prometheusRule: + enabled: false # requires prometheus-operator in cluster + annotations: {} # copied onto the PrometheusRule metadata when non-empty + +# OpenTelemetry tracing (plan §10) +tracing: + enabled: false # disabled by default for zero overhead + endpoint: "http://tempo.monitoring.svc:4317" # OTLP gRPC endpoint + serviceName: "miroir" # service name for trace identification + sampleRate: 0.1 # head-based sampling: 0.1 = ~10% of requests traced