Skip to content

Commit

Permalink
add kube prometheus stack (#248)
Browse files Browse the repository at this point in the history
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
Co-authored-by: RonaldPhilipsen <[email protected]>
Co-authored-by: Ronald Philipsen <[email protected]>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
5 people authored Feb 1, 2025
1 parent 0707ce5 commit af0685e
Show file tree
Hide file tree
Showing 5 changed files with 267 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/alertmanagerconfig_v1alpha1.json
# Alertmanager routing tree, inhibition rules and receivers.
# The kube-prometheus-stack HelmRelease selects this object by name through
# alertmanagerSpec.alertmanagerConfiguration.name.
apiVersion: monitoring.coreos.com/v1alpha1
kind: AlertmanagerConfig
metadata:
  name: alertmanager
spec:
  route:
    groupBy: ["alertname", "job"]
    groupInterval: 10m
    groupWait: 1m
    # The default receiver must be one of the entries under `receivers`.
    # It previously pointed at `pushover`, which is not defined in this
    # object. Use "null" here instead to drop everything that no child
    # route matches.
    receiver: email
    repeatInterval: 12h
    routes:
      # InfoInhibitor alerts are routed to the receiver with no integrations.
      - receiver: "null"
        matchers:
          - name: alertname
            value: InfoInhibitor
            matchType: "="
      # Watchdog alerts are forwarded to the heartbeat webhook every 5m.
      - receiver: heartbeat
        groupInterval: 5m
        groupWait: 0s
        repeatInterval: 5m
        matchers:
          - name: alertname
            value: Watchdog
            matchType: "="
      - receiver: email
        matchers:
          - name: severity
            value: critical
            matchType: "="
  inhibitRules:
    # A critical alert mutes the warning alert that shares the same
    # alertname and namespace.
    - equal: ["alertname", "namespace"]
      sourceMatch:
        - name: severity
          value: critical
          matchType: "="
      targetMatch:
        - name: severity
          value: warning
          matchType: "="
  receivers:
    - name: "null"
    - name: heartbeat
      webhookConfigs:
        - urlSecret:
            name: &secret alertmanager-secret
            key: ALERTMANAGER_HEARTBEAT_URL
    - name: email
      emailConfigs:
        # Whether to notify about resolved alerts.
        - sendResolved: true
          to: 'alerts@${SECRET_DOMAIN}'
          from: 'alertmanager@${SECRET_DOMAIN}'
          # NOTE(review): `hello` is the hostname announced to the SMTP
          # server; a value containing "@" is unusual - confirm the relay
          # accepts it.
          hello: 'k8s@${SECRET_DOMAIN}'
          # The smarthost and SMTP sender used for mail notifications.
          smarthost: ${ALERTMANAGER_SMTP_HOST}
          authUsername: ${ALERTMANAGER_SMTP_USERNAME}
          # Secret key selector: `name` is the Secret object, `key` is the
          # entry inside it (same shape as urlSecret above).
          authPassword:
            name: *secret
            key: ALERTMANAGER_SMTP_PASSWORD
          text: >-
            [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
            {{ .CommonLabels.alertname }}
          html: |-
            {{- range .Alerts }}
              {{- if ne .Annotations.description "" }}
                {{ .Annotations.description }}
              {{- else if ne .Annotations.summary "" }}
                {{ .Annotations.summary }}
              {{- else if ne .Annotations.message "" }}
                {{ .Annotations.message }}
              {{- else }}
                Alert description not available
              {{- end }}
              {{- if gt (len .Labels.SortedPairs) 0 }}
                <small>
                  {{- range .Labels.SortedPairs }}
                    <b>{{ .Name }}:</b> {{ .Value }}
                  {{- end }}
                </small>
              {{- end }}
            {{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/external-secrets.io/externalsecret_v1beta1.json
# Builds the `alertmanager-secret` Secret from the `alertmanager` item in the
# `onepassword` ClusterSecretStore.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: alertmanager
spec:
  # Re-read the backing store every five minutes.
  refreshInterval: 5m
  secretStoreRef:
    name: onepassword
    kind: ClusterSecretStore
  # Pull every field of the `alertmanager` item; the template below picks
  # the two that end up in the generated Secret.
  dataFrom:
    - extract:
        key: alertmanager
  target:
    name: alertmanager-secret
    template:
      data:
        ALERTMANAGER_HEARTBEAT_URL: '{{ .ALERTMANAGER_HEARTBEAT_URL }}'
        ALERTMANAGER_SMTP_PASSWORD: '{{ .ALERTMANAGER_SMTP_PASSWORD }}'
148 changes: 148 additions & 0 deletions kubernetes/apps/observability/kube-prometheus-stack/helmrelease.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/helm.toolkit.fluxcd.io/helmrelease_v2.json
# Flux HelmRelease for the kube-prometheus-stack chart (Prometheus,
# Alertmanager, node-exporter, kube-state-metrics). CRDs are skipped here and
# expected from the kube-prometheus-stack-crds release listed in dependsOn.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
spec:
  interval: 30m
  chart:
    spec:
      chart: kube-prometheus-stack
      version: 68.2.1
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
  install:
    crds: Skip
    remediation:
      retries: 3
  upgrade:
    cleanupOnFail: true
    crds: Skip
    remediation:
      strategy: rollback
      retries: 3
  dependsOn:
    - name: kube-prometheus-stack-crds
      namespace: observability
  values:
    crds:
      enabled: false
    cleanPrometheusOperatorObjectNames: true
    alertmanager:
      ingress:
        enabled: true
        ingressClassName: internal
        hosts: ["alertmanager.${SECRET_DOMAIN}"]
        pathType: Prefix
      alertmanagerSpec:
        # Name of the AlertmanagerConfig object used as the configuration.
        alertmanagerConfiguration:
          name: alertmanager
          global:
            resolveTimeout: 5m
        externalUrl: https://alertmanager.${SECRET_DOMAIN}
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: nfs-provision
              resources:
                requests:
                  storage: 1Gi
    kubeApiServer:
      serviceMonitor:
        selector:
          k8s-app: kube-apiserver
    kubeScheduler:
      service:
        selector:
          k8s-app: kube-scheduler
    kubeControllerManager: &kubeControllerManager
      service:
        selector:
          k8s-app: kube-controller-manager
    # The merge key copies the kube-controller-manager service selector, so
    # the etcd Service targets the same pods.
    kubeEtcd:
      <<: *kubeControllerManager # etcd runs on control plane nodes
    kubeProxy:
      enabled: false
    prometheus:
      ingress:
        enabled: true
        ingressClassName: internal
        hosts: ["prometheus.${SECRET_DOMAIN}"]
        pathType: Prefix
      prometheusSpec:
        podMonitorSelectorNilUsesHelmValues: false
        probeSelectorNilUsesHelmValues: false
        ruleSelectorNilUsesHelmValues: false
        scrapeConfigSelectorNilUsesHelmValues: false
        serviceMonitorSelectorNilUsesHelmValues: false
        # NOTE(review): the admin API allows deleting series and taking
        # snapshots, and it is reachable through the ingress above - confirm
        # this exposure is intended.
        enableAdminAPI: true
        walCompression: true
        enableFeatures:
          - memory-snapshot-on-shutdown
        retention: 14d
        # Prometheus size units are powers of two, so "50GB" would equal the
        # whole 50Gi volume claimed below and leave no headroom for the WAL
        # and compaction. Keep this at roughly 80% of the PVC size.
        retentionSize: 40GB
        resources:
          requests:
            cpu: 100m
          limits:
            memory: 2000Mi
        storageSpec:
          volumeClaimTemplate:
            spec:
              # NOTE(review): the storage class name suggests NFS; the
              # Prometheus storage docs list NFS as unsupported for the
              # local TSDB - confirm the backing filesystem.
              storageClassName: nfs-provision
              resources:
                requests:
                  storage: 50Gi
    prometheus-node-exporter:
      fullnameOverride: node-exporter
      prometheus:
        monitor:
          enabled: true
          # Copy the pod's node name into a `kubernetes_node` label.
          relabelings:
            - action: replace
              regex: (.*)
              replacement: $1
              sourceLabels: ["__meta_kubernetes_pod_node_name"]
              targetLabel: kubernetes_node
    kube-state-metrics:
      fullnameOverride: kube-state-metrics
      metricLabelsAllowlist:
        - pods=[*]
        - deployments=[*]
        - persistentvolumeclaims=[*]
      prometheus:
        monitor:
          enabled: true
          # Copy the pod's node name into a `kubernetes_node` label.
          relabelings:
            - action: replace
              regex: (.*)
              replacement: $1
              sourceLabels: ["__meta_kubernetes_pod_node_name"]
              targetLabel: kubernetes_node
    grafana:
      enabled: false
      forceDeployDashboards: true
    additionalPrometheusRulesMap:
      dockerhub-rules:
        groups:
          - name: dockerhub
            rules:
              # Fires when more than 100 containers seen within the last 30s
              # run an image whose name matches docker.io.
              - alert: DockerhubRateLimitRisk
                annotations:
                  summary: Kubernetes cluster Dockerhub rate limit risk
                expr: count(time() - container_last_seen{image=~"(docker.io).*",container!=""} < 30) > 100
                labels:
                  severity: critical
      oom-rules:
        groups:
          - name: oom
            rules:
              # Fires when a container restarted within the last 10 minutes
              # and its last termination reason was OOMKilled.
              - alert: OomKilled
                annotations:
                  summary: >-
                    Container {{ $labels.container }} in pod
                    {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled
                    {{ $value }} times in the last 10 minutes.
                expr: >-
                  (kube_pod_container_status_restarts_total
                  - kube_pod_container_status_restarts_total offset 10m >= 1)
                  and ignoring (reason)
                  min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
                labels:
                  severity: critical
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Bundles the Alertmanager configuration, its ExternalSecret and the
# HelmRelease found in this directory.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ./alertmanagerconfig.yaml
  - ./externalsecret.yaml
  - ./helmrelease.yaml
9 changes: 6 additions & 3 deletions kubernetes/flux/meta/settings/cluster-secrets.sops.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ stringData:
CLUSTER_SVC_V6_PREFIX: ENC[AES256_GCM,data:qVmaFX2V2/TF1z9gLij+ZEzblucO,iv:T8UYxEN8r1A7nSqalS7Mxw0Dn8saDKckYFbHP+V38JM=,tag:RJswikeCI4a0xZv4i7Yegg==,type:str]
CLUSTER_LBA_V6_CIDR: ENC[AES256_GCM,data:BUDk53jJv3VYKiMPaHwh69omm7QCB8zmksNy,iv:rTkABwkYE84F36OrY1AsUdx1/3EryaCo8in91Vqwuxk=,tag:Ot/288eN/+u4T9Wg8otezA==,type:str]
CLUSTER_NODE_V6_CIDR: ENC[AES256_GCM,data:9EtUqN4vA5pzYGPigwqhVc8oMw==,iv:rpnJtQ7E1sW/D7IYxOYtA7TU+cl+tMyAe3oEZ+Kgqks=,tag:n0muTnj1K0dgJgQjcUiuDQ==,type:str]
#ENC[AES256_GCM,data:brFBypll5QOb7yyqt/gHs5rH75+FXbW753m8,iv:wC1nkZUN3nBS+7ZCvGi1K8aYWXG7E0Ywr+H/vZORzAM=,tag:ZN3sr+XSdxDz9zPO+vRZFw==,type:comment]
ALERTMANAGER_SMTP_USERNAME: ENC[AES256_GCM,data:33AiYpDOJ41hHhTgfLsGUglWVk8KwVw=,iv:MBDtmvhgo4urPMHJDRIgPmS5avRg8//5N+YhWeECqtw=,tag:aP6ht3OlN902f9877OaAng==,type:str]
ALERTMANAGER_SMTP_HOST: ENC[AES256_GCM,data:O8rXlZjoe9xwRqXd0Iy7eNS4,iv:oJHPANSEbV2LYf0+z8JQS4kK25gZoOS66STciO4yneI=,tag:7svebzCwZU5JbhTLgrFtiw==,type:str]
sops:
kms: []
gcp_kms: []
Expand All @@ -29,9 +32,9 @@ sops:
dEJCQ0VzcEVlWmdDYUs5Nm9jYTVXckkKr8OGj284W6dhf5uUFtpwPX1eaz0dYWx2
uy6dvYEY+SSVSGaojydt8IFU80vhaQIslI2A7hIjNmGY6s5Pl2Zpnw==
-----END AGE ENCRYPTED FILE-----
lastmodified: "2025-01-18T21:51:24Z"
mac: ENC[AES256_GCM,data:rmU+URrHaloXPKthGXfStu4T2/2XhL7NYrwK5ZjUkFmFGNCHlihEjpOW+gowPPA2Dhb7I6wVCAT9Ix+2ir4EYi+xx3Q3Zkbc4dh+QzvbgXnFuWQcTgb6l8ePpsNDVVWDz6fRyI/1m+bky67vhqRXmXjJglxnD+ZIEIBOIdI2bsA=,iv:sklvnbvwADKvrr2LlLCtmCLFOPYqSVwFkzsv3xV0mHE=,tag:9B3AxDUa1CMenwatD77pVA==,type:str]
lastmodified: "2025-02-01T13:32:02Z"
mac: ENC[AES256_GCM,data:ww6bzEoYf0i2ChcHXsQzv1j4ijpoO5O/3o5r1urAckvH9UnO5Dg2mDd8wv2ZZSbueibBpgJAV/V+FpUPIyAqaN6m5aLsGHSz/usfhz62fCownI9zv/gnfCbvGNTa19EL5Cnniv6gc5dtUZOlkyMOfmO0Ps++fsgeG1TyF3q+gZM=,iv:eRcpVyQUzr1YeIpurtqklMKv3y3F2Vn+oiiTIddfKWk=,tag:6+cuyd7fLGBGBb4jNqoENg==,type:str]
pgp: []
encrypted_regex: ^(data|stringData)$
mac_only_encrypted: true
version: 3.9.3
version: 3.9.4

0 comments on commit af0685e

Please sign in to comment.