Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enable kube-prometheus stack #250

Merged
RonaldPhilipsen merged 2 commits into main from enable-observability
Feb 1, 2025
Merged

enable kube-prometheus stack #250

RonaldPhilipsen merged 2 commits into main from enable-observability
Feb 1, 2025

Conversation

RonaldPhilipsen
Copy link
Owner

No description provided.

Copy link
Contributor

github-actions bot commented Feb 1, 2025

--- kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: flux-system/kube-prometheus-stack

+++ kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: flux-system/kube-prometheus-stack

@@ -0,0 +1,38 @@

+---
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization  # Flux Kustomization applying the kube-prometheus-stack app directory
+metadata:
+  labels:
+    kustomize.toolkit.fluxcd.io/name: cluster-apps
+    kustomize.toolkit.fluxcd.io/namespace: flux-system
+  name: kube-prometheus-stack
+  namespace: flux-system
+spec:
+  commonMetadata:  # label added to the objects built from path
+    labels:
+      app.kubernetes.io/name: kube-prometheus-stack
+  decryption:  # SOPS decryption keyed by the sops-age secret
+    provider: sops
+    secretRef:
+      name: sops-age
+  dependsOn:  # presumably because the app's ExternalSecret reads from the onepassword store
+  - name: onepassword-connect
+  interval: 30m
+  path: ./kubernetes/apps/observability/kube-prometheus-stack/app
+  postBuild:
+    substituteFrom:  # variable sources for post-build substitution; both marked optional
+    - kind: ConfigMap
+      name: cluster-settings
+      optional: true
+    - kind: Secret
+      name: cluster-secrets
+      optional: true
+  prune: true
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+    namespace: flux-system
+  targetNamespace: observability  # matches the namespace of the rendered app objects
+  timeout: 5m
+  wait: false
+
--- kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: flux-system/kube-prometheus-stack AlertmanagerConfig: observability/alertmanager

+++ kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: flux-system/kube-prometheus-stack AlertmanagerConfig: observability/alertmanager

@@ -0,0 +1,90 @@

+---
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: AlertmanagerConfig  # routing, receivers and inhibition for the stack's Alertmanager
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-prometheus-stack
+    kustomize.toolkit.fluxcd.io/name: kube-prometheus-stack
+    kustomize.toolkit.fluxcd.io/namespace: flux-system
+  name: alertmanager
+  namespace: observability
+spec:
+  inhibitRules:  # critical alerts mute warnings sharing alertname and namespace
+  - equal:
+    - alertname
+    - namespace
+    sourceMatch:
+    - matchType: =
+      name: severity
+      value: critical
+    targetMatch:
+    - matchType: =
+      name: severity
+      value: warning
+  receivers:
+  - name: 'null'  # receiver with no integrations configured
+  - name: heartbeat
+    webhookConfigs:
+    - urlSecret:  # webhook URL comes from Secret alertmanager-secret
+        key: ALERTMANAGER_HEARTBEAT_URL
+        name: alertmanager-secret
+  - emailConfigs:
+    - authPassword:  # name = Secret object, key = entry inside it (same shape as urlSecret)
+        key: ALERTMANAGER_SMTP_PASSWORD
+        name: alertmanager-secret
+      authUsername: ..PLACEHOLDER_ALERTMANAGER_SMTP_USERNAME..
+      from: [email protected]_SECRET_DOMAIN..
+      hello: [email protected]_SECRET_DOMAIN..  # NOTE(review): hello is the hostname sent to the SMTP server; value looks address-like — confirm
+      html: |-
+        {{- range .Alerts }}
+          {{- if ne .Annotations.description "" }}
+            {{ .Annotations.description }}
+          {{- else if ne .Annotations.summary "" }}
+            {{ .Annotations.summary }}
+          {{- else if ne .Annotations.message "" }}
+            {{ .Annotations.message }}
+          {{- else }}
+            Alert description not available
+          {{- end }}
+          {{- if gt (len .Labels.SortedPairs) 0 }}
+            <small>
+              {{- range .Labels.SortedPairs }}
+                <b>{{ .Name }}:</b> {{ .Value }}
+              {{- end }}
+            </small>
+          {{- end }}
+        {{- end }}
+      sendResolved: true
+      smarthost: ..PLACEHOLDER_ALERTMANAGER_SMTP_HOST..
+      text: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing
+        | len }}{{ end }}] {{ .CommonLabels.alertname }}'
+      to: [email protected]_SECRET_DOMAIN..
+    name: email
+  route:
+    groupBy:
+    - alertname
+    - job
+    groupInterval: 10m
+    groupWait: 1m
+    receiver: email  # default receiver; must name an entry under receivers (was undefined "pushover")
+    repeatInterval: 12h
+    routes:
+    - matchers:  # InfoInhibitor alerts go to the integration-less receiver
+      - matchType: =
+        name: alertname
+        value: InfoInhibitor
+      receiver: 'null'
+    - groupInterval: 5m  # Watchdog is forwarded to the heartbeat webhook every 5 minutes
+      groupWait: 0s
+      matchers:
+      - matchType: =
+        name: alertname
+        value: Watchdog
+      receiver: heartbeat
+      repeatInterval: 5m
+    - matchers:
+      - matchType: =
+        name: severity
+        value: critical
+      receiver: email
+
--- kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: flux-system/kube-prometheus-stack ExternalSecret: observability/alertmanager

+++ kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: flux-system/kube-prometheus-stack ExternalSecret: observability/alertmanager

@@ -0,0 +1,25 @@

+---
+apiVersion: external-secrets.io/v1beta1
+kind: ExternalSecret  # materialises Secret alertmanager-secret from the onepassword store
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-prometheus-stack
+    kustomize.toolkit.fluxcd.io/name: kube-prometheus-stack
+    kustomize.toolkit.fluxcd.io/namespace: flux-system
+  name: alertmanager
+  namespace: observability
+spec:
+  dataFrom:
+  - extract:  # reads the remote entry named "alertmanager"
+      key: alertmanager
+  refreshInterval: 5m
+  secretStoreRef:
+    kind: ClusterSecretStore
+    name: onepassword
+  target:
+    name: alertmanager-secret  # Secret name referenced by the AlertmanagerConfig receivers
+    template:
+      data:  # keys rendered from the extracted fields of the same name
+        ALERTMANAGER_HEARTBEAT_URL: '{{ .ALERTMANAGER_HEARTBEAT_URL }}'
+        ALERTMANAGER_SMTP_PASSWORD: '{{ .ALERTMANAGER_SMTP_PASSWORD }}'
+
--- kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: flux-system/kube-prometheus-stack HelmRelease: observability/kube-prometheus-stack

+++ kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: flux-system/kube-prometheus-stack HelmRelease: observability/kube-prometheus-stack

@@ -0,0 +1,164 @@

+---
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease  # installs the kube-prometheus-stack chart into the observability namespace
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-prometheus-stack
+    kustomize.toolkit.fluxcd.io/name: kube-prometheus-stack
+    kustomize.toolkit.fluxcd.io/namespace: flux-system
+  name: kube-prometheus-stack
+  namespace: observability
+spec:
+  chart:
+    spec:
+      chart: kube-prometheus-stack
+      sourceRef:
+        kind: HelmRepository
+        name: prometheus-community
+        namespace: flux-system
+      version: 68.4.4  # pinned chart version
+  dependsOn:  # CRDs are expected from a separate release that is not part of this diff
+  - name: kube-prometheus-stack-crds
+    namespace: observability
+  install:
+    crds: Skip  # this release does not install the chart's CRDs
+    remediation:
+      retries: 3
+  interval: 30m
+  upgrade:
+    cleanupOnFail: true
+    crds: Skip
+    remediation:
+      retries: 3
+      strategy: rollback
+  values:
+    additionalPrometheusRulesMap:  # two extra alert rules, both labelled severity: critical
+      dockerhub-rules:
+        groups:
+        - name: dockerhub
+          rules:
+          - alert: DockerhubRateLimitRisk
+            annotations:
+              summary: Kubernetes cluster Dockerhub rate limit risk
+            expr: count(time() - container_last_seen{image=~"(docker.io).*",container!=""}
+              < 30) > 100
+            labels:
+              severity: critical
+      oom-rules:
+        groups:
+        - name: oom
+          rules:
+          - alert: OomKilled
+            annotations:
+              summary: Container {{ $labels.container }} in pod {{ $labels.namespace
+                }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the
+                last 10 minutes.
+            expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
+              offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
+              == 1
+            labels:
+              severity: critical
+    alertmanager:
+      alertmanagerSpec:
+        alertmanagerConfiguration:
+          global:
+            resolveTimeout: 5m
+          name: alertmanager  # same name as the AlertmanagerConfig rendered in this namespace
+        externalUrl: https://alertmanager...PLACEHOLDER_SECRET_DOMAIN..
+        storage:
+          volumeClaimTemplate:
+            spec:
+              resources:
+                requests:
+                  storage: 1Gi
+              storageClassName: nfs-provision
+      ingress:
+        enabled: true
+        hosts:
+        - alertmanager...PLACEHOLDER_SECRET_DOMAIN..
+        ingressClassName: internal
+        pathType: Prefix
+    cleanPrometheusOperatorObjectNames: true
+    crds:
+      enabled: false  # consistent with install/upgrade crds: Skip above
+    grafana:
+      enabled: false
+      forceDeployDashboards: true  # dashboards stay enabled although the Grafana subchart is off
+    kube-state-metrics:
+      fullnameOverride: kube-state-metrics
+      metricLabelsAllowlist:
+      - pods=[*]
+      - deployments=[*]
+      - persistentvolumeclaims=[*]
+      prometheus:
+        monitor:
+          enabled: true
+          relabelings:  # copies the pod's node name into the kubernetes_node label
+          - action: replace
+            regex: (.*)
+            replacement: $1
+            sourceLabels:
+            - __meta_kubernetes_pod_node_name
+            targetLabel: kubernetes_node
+    kubeApiServer:
+      serviceMonitor:
+        selector:
+          k8s-app: kube-apiserver
+    kubeControllerManager:
+      service:
+        selector:
+          k8s-app: kube-controller-manager
+    kubeEtcd:
+      service:
+        selector:
+          k8s-app: kube-controller-manager  # NOTE(review): same selector as kubeControllerManager — deliberate or copy-paste? confirm
+    kubeProxy:
+      enabled: false
+    kubeScheduler:
+      service:
+        selector:
+          k8s-app: kube-scheduler
+    prometheus:
+      ingress:
+        enabled: true
+        hosts:
+        - prometheus...PLACEHOLDER_SECRET_DOMAIN..
+        ingressClassName: internal
+        pathType: Prefix
+      prometheusSpec:
+        enableAdminAPI: true  # NOTE(review): admin API is on while the ingress above is enabled — confirm intended
+        enableFeatures:
+        - memory-snapshot-on-shutdown
+        podMonitorSelectorNilUsesHelmValues: false
+        probeSelectorNilUsesHelmValues: false
+        resources:
+          limits:
+            memory: 2000Mi
+          requests:
+            cpu: 100m
+        retention: 14d
+        retentionSize: 50GB  # NOTE(review): close to the 50Gi volume requested below — confirm headroom
+        ruleSelectorNilUsesHelmValues: false
+        scrapeConfigSelectorNilUsesHelmValues: false
+        serviceMonitorSelectorNilUsesHelmValues: false
+        storageSpec:
+          volumeClaimTemplate:
+            spec:
+              resources:
+                requests:
+                  storage: 50Gi
+              storageClassName: nfs-provision
+        walCompression: true
+    prometheus-node-exporter:
+      fullnameOverride: node-exporter
+      prometheus:
+        monitor:
+          enabled: true
+          relabelings:  # same node-name relabeling as kube-state-metrics above
+          - action: replace
+            regex: (.*)
+            replacement: $1
+            sourceLabels:
+            - __meta_kubernetes_pod_node_name
+            targetLabel: kubernetes_node
+

Copy link
Contributor

github-actions bot commented Feb 1, 2025

--- kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: flux-system/kube-prometheus-stack

+++ kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: flux-system/kube-prometheus-stack

@@ -0,0 +1,38 @@

+---
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization  # Flux Kustomization applying the kube-prometheus-stack app directory
+metadata:
+  labels:
+    kustomize.toolkit.fluxcd.io/name: cluster-apps
+    kustomize.toolkit.fluxcd.io/namespace: flux-system
+  name: kube-prometheus-stack
+  namespace: flux-system
+spec:
+  commonMetadata:  # label added to the objects built from path
+    labels:
+      app.kubernetes.io/name: kube-prometheus-stack
+  decryption:  # SOPS decryption keyed by the sops-age secret
+    provider: sops
+    secretRef:
+      name: sops-age
+  dependsOn:  # presumably because the app's ExternalSecret reads from the onepassword store
+  - name: onepassword-connect
+  interval: 30m
+  path: ./kubernetes/apps/observability/kube-prometheus-stack/app
+  postBuild:
+    substituteFrom:  # variable sources for post-build substitution; both marked optional
+    - kind: ConfigMap
+      name: cluster-settings
+      optional: true
+    - kind: Secret
+      name: cluster-secrets
+      optional: true
+  prune: true
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+    namespace: flux-system
+  targetNamespace: observability  # matches the namespace of the rendered app objects
+  timeout: 5m
+  wait: false
+
--- kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: flux-system/kube-prometheus-stack AlertmanagerConfig: observability/alertmanager

+++ kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: flux-system/kube-prometheus-stack AlertmanagerConfig: observability/alertmanager

@@ -0,0 +1,90 @@

+---
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: AlertmanagerConfig  # routing, receivers and inhibition for the stack's Alertmanager
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-prometheus-stack
+    kustomize.toolkit.fluxcd.io/name: kube-prometheus-stack
+    kustomize.toolkit.fluxcd.io/namespace: flux-system
+  name: alertmanager
+  namespace: observability
+spec:
+  inhibitRules:  # critical alerts mute warnings sharing alertname and namespace
+  - equal:
+    - alertname
+    - namespace
+    sourceMatch:
+    - matchType: =
+      name: severity
+      value: critical
+    targetMatch:
+    - matchType: =
+      name: severity
+      value: warning
+  receivers:
+  - name: 'null'  # receiver with no integrations configured
+  - name: heartbeat
+    webhookConfigs:
+    - urlSecret:  # webhook URL comes from Secret alertmanager-secret
+        key: ALERTMANAGER_HEARTBEAT_URL
+        name: alertmanager-secret
+  - emailConfigs:
+    - authPassword:  # name = Secret object, key = entry inside it (same shape as urlSecret)
+        key: ALERTMANAGER_SMTP_PASSWORD
+        name: alertmanager-secret
+      authUsername: ..PLACEHOLDER_ALERTMANAGER_SMTP_USERNAME..
+      from: [email protected]_SECRET_DOMAIN..
+      hello: [email protected]_SECRET_DOMAIN..  # NOTE(review): hello is the hostname sent to the SMTP server; value looks address-like — confirm
+      html: |-
+        {{- range .Alerts }}
+          {{- if ne .Annotations.description "" }}
+            {{ .Annotations.description }}
+          {{- else if ne .Annotations.summary "" }}
+            {{ .Annotations.summary }}
+          {{- else if ne .Annotations.message "" }}
+            {{ .Annotations.message }}
+          {{- else }}
+            Alert description not available
+          {{- end }}
+          {{- if gt (len .Labels.SortedPairs) 0 }}
+            <small>
+              {{- range .Labels.SortedPairs }}
+                <b>{{ .Name }}:</b> {{ .Value }}
+              {{- end }}
+            </small>
+          {{- end }}
+        {{- end }}
+      sendResolved: true
+      smarthost: ..PLACEHOLDER_ALERTMANAGER_SMTP_HOST..
+      text: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing
+        | len }}{{ end }}] {{ .CommonLabels.alertname }}'
+      to: [email protected]_SECRET_DOMAIN..
+    name: email
+  route:
+    groupBy:
+    - alertname
+    - job
+    groupInterval: 10m
+    groupWait: 1m
+    receiver: email  # default receiver; must name an entry under receivers (was undefined "pushover")
+    repeatInterval: 12h
+    routes:
+    - matchers:  # InfoInhibitor alerts go to the integration-less receiver
+      - matchType: =
+        name: alertname
+        value: InfoInhibitor
+      receiver: 'null'
+    - groupInterval: 5m  # Watchdog is forwarded to the heartbeat webhook every 5 minutes
+      groupWait: 0s
+      matchers:
+      - matchType: =
+        name: alertname
+        value: Watchdog
+      receiver: heartbeat
+      repeatInterval: 5m
+    - matchers:
+      - matchType: =
+        name: severity
+        value: critical
+      receiver: email
+
--- kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: flux-system/kube-prometheus-stack ExternalSecret: observability/alertmanager

+++ kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: flux-system/kube-prometheus-stack ExternalSecret: observability/alertmanager

@@ -0,0 +1,25 @@

+---
+apiVersion: external-secrets.io/v1beta1
+kind: ExternalSecret  # materialises Secret alertmanager-secret from the onepassword store
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-prometheus-stack
+    kustomize.toolkit.fluxcd.io/name: kube-prometheus-stack
+    kustomize.toolkit.fluxcd.io/namespace: flux-system
+  name: alertmanager
+  namespace: observability
+spec:
+  dataFrom:
+  - extract:  # reads the remote entry named "alertmanager"
+      key: alertmanager
+  refreshInterval: 5m
+  secretStoreRef:
+    kind: ClusterSecretStore
+    name: onepassword
+  target:
+    name: alertmanager-secret  # Secret name referenced by the AlertmanagerConfig receivers
+    template:
+      data:  # keys rendered from the extracted fields of the same name
+        ALERTMANAGER_HEARTBEAT_URL: '{{ .ALERTMANAGER_HEARTBEAT_URL }}'
+        ALERTMANAGER_SMTP_PASSWORD: '{{ .ALERTMANAGER_SMTP_PASSWORD }}'
+
--- kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: flux-system/kube-prometheus-stack HelmRelease: observability/kube-prometheus-stack

+++ kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: flux-system/kube-prometheus-stack HelmRelease: observability/kube-prometheus-stack

@@ -0,0 +1,164 @@

+---
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease  # installs the kube-prometheus-stack chart into the observability namespace
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-prometheus-stack
+    kustomize.toolkit.fluxcd.io/name: kube-prometheus-stack
+    kustomize.toolkit.fluxcd.io/namespace: flux-system
+  name: kube-prometheus-stack
+  namespace: observability
+spec:
+  chart:
+    spec:
+      chart: kube-prometheus-stack
+      sourceRef:
+        kind: HelmRepository
+        name: prometheus-community
+        namespace: flux-system
+      version: 68.4.4  # pinned chart version
+  dependsOn:  # CRDs are expected from a separate release that is not part of this diff
+  - name: kube-prometheus-stack-crds
+    namespace: observability
+  install:
+    crds: Skip  # this release does not install the chart's CRDs
+    remediation:
+      retries: 3
+  interval: 30m
+  upgrade:
+    cleanupOnFail: true
+    crds: Skip
+    remediation:
+      retries: 3
+      strategy: rollback
+  values:
+    additionalPrometheusRulesMap:  # two extra alert rules, both labelled severity: critical
+      dockerhub-rules:
+        groups:
+        - name: dockerhub
+          rules:
+          - alert: DockerhubRateLimitRisk
+            annotations:
+              summary: Kubernetes cluster Dockerhub rate limit risk
+            expr: count(time() - container_last_seen{image=~"(docker.io).*",container!=""}
+              < 30) > 100
+            labels:
+              severity: critical
+      oom-rules:
+        groups:
+        - name: oom
+          rules:
+          - alert: OomKilled
+            annotations:
+              summary: Container {{ $labels.container }} in pod {{ $labels.namespace
+                }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the
+                last 10 minutes.
+            expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
+              offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
+              == 1
+            labels:
+              severity: critical
+    alertmanager:
+      alertmanagerSpec:
+        alertmanagerConfiguration:
+          global:
+            resolveTimeout: 5m
+          name: alertmanager  # same name as the AlertmanagerConfig rendered in this namespace
+        externalUrl: https://alertmanager...PLACEHOLDER_SECRET_DOMAIN..
+        storage:
+          volumeClaimTemplate:
+            spec:
+              resources:
+                requests:
+                  storage: 1Gi
+              storageClassName: nfs-provision
+      ingress:
+        enabled: true
+        hosts:
+        - alertmanager...PLACEHOLDER_SECRET_DOMAIN..
+        ingressClassName: internal
+        pathType: Prefix
+    cleanPrometheusOperatorObjectNames: true
+    crds:
+      enabled: false  # consistent with install/upgrade crds: Skip above
+    grafana:
+      enabled: false
+      forceDeployDashboards: true  # dashboards stay enabled although the Grafana subchart is off
+    kube-state-metrics:
+      fullnameOverride: kube-state-metrics
+      metricLabelsAllowlist:
+      - pods=[*]
+      - deployments=[*]
+      - persistentvolumeclaims=[*]
+      prometheus:
+        monitor:
+          enabled: true
+          relabelings:  # copies the pod's node name into the kubernetes_node label
+          - action: replace
+            regex: (.*)
+            replacement: $1
+            sourceLabels:
+            - __meta_kubernetes_pod_node_name
+            targetLabel: kubernetes_node
+    kubeApiServer:
+      serviceMonitor:
+        selector:
+          k8s-app: kube-apiserver
+    kubeControllerManager:
+      service:
+        selector:
+          k8s-app: kube-controller-manager
+    kubeEtcd:
+      service:
+        selector:
+          k8s-app: kube-controller-manager  # NOTE(review): same selector as kubeControllerManager — deliberate or copy-paste? confirm
+    kubeProxy:
+      enabled: false
+    kubeScheduler:
+      service:
+        selector:
+          k8s-app: kube-scheduler
+    prometheus:
+      ingress:
+        enabled: true
+        hosts:
+        - prometheus...PLACEHOLDER_SECRET_DOMAIN..
+        ingressClassName: internal
+        pathType: Prefix
+      prometheusSpec:
+        enableAdminAPI: true  # NOTE(review): admin API is on while the ingress above is enabled — confirm intended
+        enableFeatures:
+        - memory-snapshot-on-shutdown
+        podMonitorSelectorNilUsesHelmValues: false
+        probeSelectorNilUsesHelmValues: false
+        resources:
+          limits:
+            memory: 2000Mi
+          requests:
+            cpu: 100m
+        retention: 14d
+        retentionSize: 50GB  # NOTE(review): close to the 50Gi volume requested below — confirm headroom
+        ruleSelectorNilUsesHelmValues: false
+        scrapeConfigSelectorNilUsesHelmValues: false
+        serviceMonitorSelectorNilUsesHelmValues: false
+        storageSpec:
+          volumeClaimTemplate:
+            spec:
+              resources:
+                requests:
+                  storage: 50Gi
+              storageClassName: nfs-provision
+        walCompression: true
+    prometheus-node-exporter:
+      fullnameOverride: node-exporter
+      prometheus:
+        monitor:
+          enabled: true
+          relabelings:  # same node-name relabeling as kube-state-metrics above
+          - action: replace
+            regex: (.*)
+            replacement: $1
+            sourceLabels:
+            - __meta_kubernetes_pod_node_name
+            targetLabel: kubernetes_node
+

@RonaldPhilipsen RonaldPhilipsen merged commit dc9be18 into main Feb 1, 2025
3 of 5 checks passed
@RonaldPhilipsen RonaldPhilipsen deleted the enable-observability branch February 1, 2025 13:56
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

1 participant