# devops/infra/modules/monitoring/monitoring-values.yaml.tftpl
# (file-viewer metadata: 135 lines, 3.1 KiB, plaintext)

# Extra alerting rules for the application, supplied in addition to the
# chart's built-in rules. The `{{ ... }}` placeholders in the annotations are
# Prometheus alert templates; they contain no `${`, so Terraform's
# templatefile() does not try to interpolate them.
additionalPrometheusRulesMap:
  custom-app-rules:
    groups:
      - name: aspnetcore
        interval: 5m
        rules:
          # Fires when the 95th percentile request duration stays above
          # 0.5s for 5 minutes.
          - alert: HighRequestLatency
            # `le` has to survive the aggregation: histogram_quantile()
            # computes the quantile from the per-bucket upper bounds, and
            # without that label it cannot return a usable value.
            expr: >-
              histogram_quantile(0.95,
              sum by (job, instance, le)
              (rate(http_request_duration_seconds_bucket[5m]))) > 0.5
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "High request latency on {{ $labels.instance }}"
              description: "95th percentile latency is above 500ms (current value: {{ $value }}s)"
          # Fires when more than 5% of requests return a 5xx status for
          # 5 minutes.
          - alert: HighErrorRate
            # Ratio of 5xx requests to all requests, so the 0.05 threshold
            # really means 5%. A bare rate() would compare requests/second
            # against 0.05 instead. With no traffic the division yields NaN,
            # which never satisfies the comparison, so the alert stays quiet.
            expr: >-
              sum by (job, instance) (rate(http_requests_total{status=~"5.."}[5m]))
              /
              sum by (job, instance) (rate(http_requests_total[5m]))
              > 0.05
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "High error rate on {{ $labels.instance }}"
              description: "Error rate is above 5% (current value: {{ $value }})"
# Prometheus server settings: local retention, pod resources, and the
# remote write/read endpoints.
prometheus:
  prometheusSpec:
    retention: 24h
    retentionSize: 10GB
    resources:
      requests:
        memory: 200Mi
        cpu: 100m
      limits:
        memory: 500Mi
        cpu: 500m
    # Remote write to VictoriaMetrics
    remoteWrite:
      # jsonencode() renders the Terraform variable as a quoted, escaped
      # string (valid YAML), so an empty value or special characters in the
      # URL cannot change how this line is parsed.
      - url: ${jsonencode(remote_write_url)}
        queueConfig:
          maxSamplesPerSend: 10000
          maxShards: 5
          minShards: 1
          batchSendDeadline: 5s
        # Credentials are read from keys of the named Kubernetes Secret.
        basicAuth:
          username:
            name: prometheus-remote-write-auth
            key: username
          password:
            name: prometheus-remote-write-auth
            key: password
        # Only series whose metric name matches the regex are forwarded.
        writeRelabelConfigs:
          - sourceLabels: ["__name__"]
            regex: "(up|kube_.*|container_.*|node_.*|http_.*|process_.*)"
            action: keep
    # Remote read from VictoriaMetrics for old data
    remoteRead:
      - url: ${jsonencode(remote_read_url)}
        # Uses the same Secret as remoteWrite.
        basicAuth:
          username:
            name: prometheus-remote-write-auth
            key: username
          password:
            name: prometheus-remote-write-auth
            key: password
        # Skip the remote endpoint for time ranges that local storage
        # should already cover completely.
        readRecent: false
# Alertmanager: a single replica with a small resource footprint.
alertmanager:
  enabled: true
  alertmanagerSpec:
    replicas: 1
    resources:
      requests:
        memory: 50Mi
        cpu: 10m
      limits:
        memory: 150Mi
        cpu: 100m
    # How long Alertmanager keeps its data.
    retention: 24h
# Grafana: pod resources, a persistent volume, and the admin login.
grafana:
  resources:
    requests:
      memory: 100Mi
      cpu: 50m
    limits:
      memory: 300Mi
      cpu: 200m
  persistence:
    enabled: true
    size: 1Gi
  adminUser: admin
  # jsonencode() emits the password as a quoted, escaped string. A bare
  # interpolation would be mis-parsed when the password contains " #" or
  # ": ", starts with a YAML indicator, or looks like a number or boolean.
  adminPassword: ${jsonencode(grafana_admin_password)}
# Resources for the kube-state-metrics pod.
# Subchart values are read under the subchart's own name
# (`kube-state-metrics`). In kube-prometheus-stack the `kubeStateMetrics`
# key only carries the enable toggle, so resources placed there are ignored.
# NOTE(review): confirm the key against the chart version deployed here.
kube-state-metrics:
  resources:
    requests:
      memory: 50Mi
      cpu: 10m
    limits:
      memory: 150Mi
      cpu: 100m
# Resources for the node-exporter pods.
# Subchart values are read under the subchart's own name
# (`prometheus-node-exporter`). In kube-prometheus-stack the `nodeExporter`
# key does not pass `resources` through, so they would be ignored there.
# NOTE(review): confirm the key against the chart version deployed here.
prometheus-node-exporter:
  resources:
    requests:
      memory: 30Mi
      cpu: 10m
    limits:
      memory: 100Mi
      cpu: 100m
# Resource requests and limits for the Prometheus Operator pod.
prometheusOperator:
  resources:
    requests:
      memory: 100Mi
      cpu: 50m
    limits:
      memory: 300Mi
      cpu: 200m
# Toggles for the chart's bundled rule groups; every group listed here is
# enabled except etcd.
defaultRules:
  create: true
  rules:
    alertmanager: true
    etcd: false
    general: true
    k8s: true
    kubernetesApps: true
    kubernetesResources: true
    kubernetesStorage: true
    kubernetesSystem: true
    node: true
    prometheus: true