diff --git a/kubernetes/apps/observability/kube-prometheus-stack/alertmanagerconfig.yaml b/kubernetes/apps/observability/kube-prometheus-stack/alertmanagerconfig.yaml
new file mode 100644
index 00000000..7fb5f9a9
--- /dev/null
+++ b/kubernetes/apps/observability/kube-prometheus-stack/alertmanagerconfig.yaml
@@ -0,0 +1,88 @@
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/alertmanagerconfig_v1alpha1.json
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: AlertmanagerConfig
+metadata:
+  name: alertmanager
+spec:
+  route:
+    groupBy: ["alertname", "job"]
+    groupInterval: 10m
+    groupWait: 1m
+    # Fallback receiver for alerts that no child route claims; it must match
+    # the name of an entry under spec.receivers.
+    receiver: email
+    repeatInterval: 12h
+    routes:
+      - receiver: "null"
+        matchers:
+          - name: alertname
+            value: InfoInhibitor
+            matchType: "="
+      - receiver: heartbeat
+        groupInterval: 5m
+        groupWait: 0s
+        repeatInterval: 5m
+        matchers:
+          - name: alertname
+            value: Watchdog
+            matchType: "="
+      - receiver: email
+        matchers:
+          - name: severity
+            value: critical
+            matchType: "="
+  inhibitRules:
+    - equal: ["alertname", "namespace"]
+      sourceMatch:
+        - name: severity
+          value: critical
+          matchType: "="
+      targetMatch:
+        - name: severity
+          value: warning
+          matchType: "="
+  receivers:
+    - name: "null"
+    - name: heartbeat
+      webhookConfigs:
+        - urlSecret:
+            name: &secret alertmanager-secret
+            key: ALERTMANAGER_HEARTBEAT_URL
+    - name: email
+      emailConfigs:
+        # Whether to notify about resolved alerts.
+        - sendResolved: true
+          to: 'alerts@${SECRET_DOMAIN}'
+          from: 'alertmanager@${SECRET_DOMAIN}'
+          hello: k8s@${SECRET_DOMAIN}
+          # The smarthost and SMTP sender used for mail notifications.
+          smarthost: ${ALERTMANAGER_SMTP_HOST}
+          authUsername: ${ALERTMANAGER_SMTP_USERNAME}
+          # `name` selects the Secret (shared via the &secret anchor), `key` the
+          # entry inside it that holds the SMTP password.
+          authPassword:
+            name: *secret
+            key: ALERTMANAGER_SMTP_PASSWORD
+          text: >-
+            [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
+            {{ .CommonLabels.alertname }}
+          html: |-
+            {{- range .Alerts }}
+              {{- if ne .Annotations.description "" }}
+                {{ .Annotations.description }}
+              {{- else if ne .Annotations.summary "" }}
+                {{ .Annotations.summary }}
+              {{- else if ne .Annotations.message "" }}
+                {{ .Annotations.message }}
+              {{- else }}
+                Alert description not available
+              {{- end }}
+              {{- if gt (len .Labels.SortedPairs) 0 }}
+
+                  {{- range .Labels.SortedPairs }}
+                    {{ .Name }}: {{ .Value }}
+                  {{- end }}
+
+              {{- end }}
+            {{- end }}
diff --git a/kubernetes/apps/observability/kube-prometheus-stack/externalsecret.yaml b/kubernetes/apps/observability/kube-prometheus-stack/externalsecret.yaml
new file mode 100644
index 00000000..b9daed03
--- /dev/null
+++ b/kubernetes/apps/observability/kube-prometheus-stack/externalsecret.yaml
@@ -0,0 +1,20 @@
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/external-secrets.io/externalsecret_v1beta1.json
+apiVersion: external-secrets.io/v1beta1
+kind: ExternalSecret
+metadata:
+  name: alertmanager
+spec:
+  refreshInterval: 5m
+  secretStoreRef:
+    kind: ClusterSecretStore
+    name: onepassword
+  target:
+    name: alertmanager-secret
+    template:
+      data:
+        ALERTMANAGER_HEARTBEAT_URL: "{{ .ALERTMANAGER_HEARTBEAT_URL }}"
+        ALERTMANAGER_SMTP_PASSWORD: "{{ .ALERTMANAGER_SMTP_PASSWORD }}"
+  dataFrom:
+    - extract:
+        key: alertmanager
diff --git a/kubernetes/apps/observability/kube-prometheus-stack/helmrelease.yaml b/kubernetes/apps/observability/kube-prometheus-stack/helmrelease.yaml
new file mode 100644
index 00000000..3d5174e6
--- /dev/null
+++ b/kubernetes/apps/observability/kube-prometheus-stack/helmrelease.yaml
@@ -0,0 +1,148 @@
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/helm.toolkit.fluxcd.io/helmrelease_v2.json
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: kube-prometheus-stack
+spec:
+  interval: 30m
+  chart:
+    spec:
+      chart: kube-prometheus-stack
+      version: 68.2.1
+      sourceRef:
+        kind: HelmRepository
+        name: prometheus-community
+        namespace: flux-system
+  install:
+    crds: Skip
+    remediation:
+      retries: 3
+  upgrade:
+    cleanupOnFail: true
+    crds: Skip
+    remediation:
+      strategy: rollback
+      retries: 3
+  dependsOn:
+    - name: kube-prometheus-stack-crds
+      namespace: observability
+  values:
+    crds:
+      enabled: false
+    cleanPrometheusOperatorObjectNames: true
+    alertmanager:
+      ingress:
+        enabled: true
+        ingressClassName: internal
+        hosts: ["alertmanager.${SECRET_DOMAIN}"]
+        pathType: Prefix
+      alertmanagerSpec:
+        alertmanagerConfiguration:
+          name: alertmanager
+          global:
+            resolveTimeout: 5m
+        externalUrl: https://alertmanager.${SECRET_DOMAIN}
+        storage:
+          volumeClaimTemplate:
+            spec:
+              storageClassName: nfs-provision
+              resources:
+                requests:
+                  storage: 1Gi
+    kubeApiServer:
+      serviceMonitor:
+        selector:
+          k8s-app: kube-apiserver
+    kubeScheduler:
+      service:
+        selector:
+          k8s-app: kube-scheduler
+    kubeControllerManager: &kubeControllerManager
+      service:
+        selector:
+          k8s-app: kube-controller-manager
+    kubeEtcd:
+      <<: *kubeControllerManager # etcd runs on control plane nodes
+    kubeProxy:
+      enabled: false
+    prometheus:
+      ingress:
+        enabled: true
+        ingressClassName: internal
+        hosts: ["prometheus.${SECRET_DOMAIN}"]
+        pathType: Prefix
+      prometheusSpec:
+        podMonitorSelectorNilUsesHelmValues: false
+        probeSelectorNilUsesHelmValues: false
+        ruleSelectorNilUsesHelmValues: false
+        scrapeConfigSelectorNilUsesHelmValues: false
+        serviceMonitorSelectorNilUsesHelmValues: false
+        enableAdminAPI: true
+        walCompression: true
+        enableFeatures:
+          - memory-snapshot-on-shutdown
+        retention: 14d
+        retentionSize: 50GB
+        resources:
+          requests:
+            cpu: 100m
+          limits:
+            memory: 2000Mi
+        storageSpec:
+          volumeClaimTemplate:
+            spec:
+              storageClassName: nfs-provision
+              resources:
+                requests:
+                  storage: 50Gi
+    prometheus-node-exporter:
+      fullnameOverride: node-exporter
+      prometheus:
+        monitor:
+          enabled: true
+          relabelings:
+            - action: replace
+              regex: (.*)
+              replacement: $1
+              sourceLabels: ["__meta_kubernetes_pod_node_name"]
+              targetLabel: kubernetes_node
+    kube-state-metrics:
+      fullnameOverride: kube-state-metrics
+      metricLabelsAllowlist:
+        - pods=[*]
+        - deployments=[*]
+        - persistentvolumeclaims=[*]
+      prometheus:
+        monitor:
+          enabled: true
+          relabelings:
+            - action: replace
+              regex: (.*)
+              replacement: $1
+              sourceLabels: ["__meta_kubernetes_pod_node_name"]
+              targetLabel: kubernetes_node
+    grafana:
+      enabled: false
+      forceDeployDashboards: true
+    additionalPrometheusRulesMap:
+      dockerhub-rules:
+        groups:
+          - name: dockerhub
+            rules:
+              - alert: DockerhubRateLimitRisk
+                annotations:
+                  summary: Kubernetes cluster Dockerhub rate limit risk
+                expr: count(time() - container_last_seen{image=~"(docker.io).*",container!=""} < 30) > 100
+                labels:
+                  severity: critical
+      oom-rules:
+        groups:
+          - name: oom
+            rules:
+              - alert: OomKilled
+                annotations:
+                  summary: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
+                expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
+                labels:
+                  severity: critical
diff --git a/kubernetes/apps/observability/kube-prometheus-stack/kustomization.yaml b/kubernetes/apps/observability/kube-prometheus-stack/kustomization.yaml
new file mode 100644
index 00000000..d132dc09
--- /dev/null
+++ b/kubernetes/apps/observability/kube-prometheus-stack/kustomization.yaml
@@ -0,0 +1,8 @@
+---
+# yaml-language-server: $schema=https://json.schemastore.org/kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - ./alertmanagerconfig.yaml
+  - ./externalsecret.yaml
+  - ./helmrelease.yaml
diff --git a/kubernetes/flux/meta/settings/cluster-secrets.sops.yaml b/kubernetes/flux/meta/settings/cluster-secrets.sops.yaml
index 11da95b1..f17e41e0 100644
--- a/kubernetes/flux/meta/settings/cluster-secrets.sops.yaml
+++ b/kubernetes/flux/meta/settings/cluster-secrets.sops.yaml
@@ -14,6 +14,9 @@ stringData:
     CLUSTER_SVC_V6_PREFIX: ENC[AES256_GCM,data:qVmaFX2V2/TF1z9gLij+ZEzblucO,iv:T8UYxEN8r1A7nSqalS7Mxw0Dn8saDKckYFbHP+V38JM=,tag:RJswikeCI4a0xZv4i7Yegg==,type:str]
     CLUSTER_LBA_V6_CIDR: ENC[AES256_GCM,data:BUDk53jJv3VYKiMPaHwh69omm7QCB8zmksNy,iv:rTkABwkYE84F36OrY1AsUdx1/3EryaCo8in91Vqwuxk=,tag:Ot/288eN/+u4T9Wg8otezA==,type:str]
     CLUSTER_NODE_V6_CIDR: ENC[AES256_GCM,data:9EtUqN4vA5pzYGPigwqhVc8oMw==,iv:rpnJtQ7E1sW/D7IYxOYtA7TU+cl+tMyAe3oEZ+Kgqks=,tag:n0muTnj1K0dgJgQjcUiuDQ==,type:str]
+    #ENC[AES256_GCM,data:brFBypll5QOb7yyqt/gHs5rH75+FXbW753m8,iv:wC1nkZUN3nBS+7ZCvGi1K8aYWXG7E0Ywr+H/vZORzAM=,tag:ZN3sr+XSdxDz9zPO+vRZFw==,type:comment]
+    ALERTMANAGER_SMTP_USERNAME: ENC[AES256_GCM,data:33AiYpDOJ41hHhTgfLsGUglWVk8KwVw=,iv:MBDtmvhgo4urPMHJDRIgPmS5avRg8//5N+YhWeECqtw=,tag:aP6ht3OlN902f9877OaAng==,type:str]
+    ALERTMANAGER_SMTP_HOST: ENC[AES256_GCM,data:O8rXlZjoe9xwRqXd0Iy7eNS4,iv:oJHPANSEbV2LYf0+z8JQS4kK25gZoOS66STciO4yneI=,tag:7svebzCwZU5JbhTLgrFtiw==,type:str]
 sops:
     kms: []
     gcp_kms: []
@@ -29,9 +32,9 @@ sops:
             dEJCQ0VzcEVlWmdDYUs5Nm9jYTVXckkKr8OGj284W6dhf5uUFtpwPX1eaz0dYWx2
             uy6dvYEY+SSVSGaojydt8IFU80vhaQIslI2A7hIjNmGY6s5Pl2Zpnw==
             -----END AGE ENCRYPTED FILE-----
-    lastmodified: "2025-01-18T21:51:24Z"
-    mac: ENC[AES256_GCM,data:rmU+URrHaloXPKthGXfStu4T2/2XhL7NYrwK5ZjUkFmFGNCHlihEjpOW+gowPPA2Dhb7I6wVCAT9Ix+2ir4EYi+xx3Q3Zkbc4dh+QzvbgXnFuWQcTgb6l8ePpsNDVVWDz6fRyI/1m+bky67vhqRXmXjJglxnD+ZIEIBOIdI2bsA=,iv:sklvnbvwADKvrr2LlLCtmCLFOPYqSVwFkzsv3xV0mHE=,tag:9B3AxDUa1CMenwatD77pVA==,type:str]
+    lastmodified: "2025-02-01T13:32:02Z"
+    mac: ENC[AES256_GCM,data:ww6bzEoYf0i2ChcHXsQzv1j4ijpoO5O/3o5r1urAckvH9UnO5Dg2mDd8wv2ZZSbueibBpgJAV/V+FpUPIyAqaN6m5aLsGHSz/usfhz62fCownI9zv/gnfCbvGNTa19EL5Cnniv6gc5dtUZOlkyMOfmO0Ps++fsgeG1TyF3q+gZM=,iv:eRcpVyQUzr1YeIpurtqklMKv3y3F2Vn+oiiTIddfKWk=,tag:6+cuyd7fLGBGBb4jNqoENg==,type:str]
     pgp: []
     encrypted_regex: ^(data|stringData)$
     mac_only_encrypted: true
-    version: 3.9.3
+    version: 3.9.4