From 0babfe48f2542518c8045e84c3c2c1aad84c05ef Mon Sep 17 00:00:00 2001 From: Yongun Seong Date: Tue, 4 Feb 2025 22:40:19 +0900 Subject: [PATCH] feat(ferrari): remove, moved into sommelier (#515) --- argocd/ferrari/gpu-operator/app.yaml | 35 - argocd/ferrari/waiter-bootstrap/app.yaml | 23 - .../ferrari/waiter-bootstrap/resources.yaml | 28 - argocd/waiter/ferrari-bootstrap/app.yaml | 25 - .../waiter/ferrari-bootstrap/resources.yaml | 57 - argocd/waiter/gpu-monitoring/app.yaml | 24 - .../gpu-monitoring-dashboard.json | 3156 ----------------- .../waiter/gpu-monitoring/kustomization.yaml | 11 - .../waiter/gpu-monitoring/scrapeconfigs.yaml | 48 - 9 files changed, 3407 deletions(-) delete mode 100644 argocd/ferrari/gpu-operator/app.yaml delete mode 100644 argocd/ferrari/waiter-bootstrap/app.yaml delete mode 100644 argocd/ferrari/waiter-bootstrap/resources.yaml delete mode 100644 argocd/waiter/ferrari-bootstrap/app.yaml delete mode 100644 argocd/waiter/ferrari-bootstrap/resources.yaml delete mode 100644 argocd/waiter/gpu-monitoring/app.yaml delete mode 100644 argocd/waiter/gpu-monitoring/gpu-monitoring-dashboard.json delete mode 100644 argocd/waiter/gpu-monitoring/kustomization.yaml delete mode 100644 argocd/waiter/gpu-monitoring/scrapeconfigs.yaml diff --git a/argocd/ferrari/gpu-operator/app.yaml b/argocd/ferrari/gpu-operator/app.yaml deleted file mode 100644 index 4cbda843..00000000 --- a/argocd/ferrari/gpu-operator/app.yaml +++ /dev/null @@ -1,35 +0,0 @@ ---- -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - namespace: argocd - name: ferrari-gpu-operator - finalizers: - - resources-finalizer.argocd.argoproj.io -spec: - project: ferrari - destination: - name: ferrari - namespace: gpu-operator - syncPolicy: - #automated: - # prune: true - # selfHeal: true - syncOptions: - - CreateNamespace=true - sources: - - repoURL: https://helm.ngc.nvidia.com/nvidia - targetRevision: 23.3.2 - chart: gpu-operator - helm: - values: | - mig: - strategy: mixed - toolkit: - env: - - name: CONTAINERD_CONFIG - value: /etc/k0s/containerd.toml - - name: CONTAINERD_SOCKET - value: /run/k0s/containerd.sock - # also need to manually patch dcgm-exporter DS to add env - # DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE=uid diff --git a/argocd/ferrari/waiter-bootstrap/app.yaml b/argocd/ferrari/waiter-bootstrap/app.yaml deleted file mode 100644 index 88b34948..00000000 --- a/argocd/ferrari/waiter-bootstrap/app.yaml +++ /dev/null @@ -1,23 +0,0 @@ ---- -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - namespace: argocd - name: ferrari-waiter-bootstrap - finalizers: - - resources-finalizer.argocd.argoproj.io -spec: - project: ferrari - destination: - name: ferrari - namespace: default - syncPolicy: - #automated: - # prune: true - # selfHeal: true - sources: - - repoURL: https://github.com/bacchus-snu/cd-manifests.git - targetRevision: main - path: argocd/ferrari/waiter-bootstrap - directory: - include: 'resources.yaml' diff --git a/argocd/ferrari/waiter-bootstrap/resources.yaml b/argocd/ferrari/waiter-bootstrap/resources.yaml deleted file mode 100644 index f7d544c7..00000000 --- a/argocd/ferrari/waiter-bootstrap/resources.yaml +++ /dev/null @@ -1,28 +0,0 @@ ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - namespace: default - name: waiter-bootstrap ---- -apiVersion: v1 -kind: Secret -metadata: - namespace: default - name: waiter-bootstrap - annotations: - kubernetes.io/service-account.name: waiter-bootstrap -type: kubernetes.io/service-account-token ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: waiter-bootstrap -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: cluster-admin -subjects: - - kind: ServiceAccount - name: waiter-bootstrap - namespace: default diff --git a/argocd/waiter/ferrari-bootstrap/app.yaml b/argocd/waiter/ferrari-bootstrap/app.yaml deleted file mode 100644 index fc7ddabe..00000000 --- a/argocd/waiter/ferrari-bootstrap/app.yaml +++ /dev/null @@ -1,25 +0,0 @@ ---- -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - namespace: argocd - name: ferrari-bootstrap - finalizers: - - resources-finalizer.argocd.argoproj.io -spec: - project: default - destination: - name: in-cluster - namespace: ferrari-bootstrap - syncPolicy: - #automated: - # prune: true - # selfHeal: true - syncOptions: - - CreateNamespace=true - sources: - - repoURL: https://github.com/bacchus-snu/cd-manifests.git - targetRevision: main - path: argocd/waiter/ferrari-bootstrap - directory: - include: 'resources.yaml' diff --git a/argocd/waiter/ferrari-bootstrap/resources.yaml b/argocd/waiter/ferrari-bootstrap/resources.yaml deleted file mode 100644 index 67047d56..00000000 --- a/argocd/waiter/ferrari-bootstrap/resources.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Argocd-related configurations ---- -apiVersion: argoproj.io/v1alpha1 -kind: AppProject -metadata: - namespace: argocd - name: ferrari -spec: - destinations: - - name: in-cluster - namespace: '*' - server: https://kubernetes.default.svc - - name: ferrari - namespace: '*' - server: https://ferrari.snucse.org:6443 - clusterResourceWhitelist: - - group: '*' - kind: '*' - namespaceResourceWhitelist: - - group: '*' - kind: '*' - sourceRepos: - - '*' ---- -apiVersion: secrets.hashicorp.com/v1beta1 -kind: VaultStaticSecret -metadata: - namespace: argocd - name: argocd-clusters-ferrari -spec: - vaultAuthRef: default - type: kv-v2 - mount: secret - path: argocd/clusters/ferrari - destination: - create: true - name: argocd-clusters-ferrari - labels: - argocd.argoproj.io/secret-type: cluster ---- -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - namespace: argocd - name: ferrari-top-level -spec: - project: ferrari - destination: - name: in-cluster - namespace: argocd - source: - repoURL: https://github.com/bacchus-snu/cd-manifests.git - targetRevision: main - path: argocd/ferrari - directory: - recurse: true - include: '*/app.yaml' diff --git a/argocd/waiter/gpu-monitoring/app.yaml b/argocd/waiter/gpu-monitoring/app.yaml deleted file mode 100644 index 5f70069f..00000000 --- a/argocd/waiter/gpu-monitoring/app.yaml +++ /dev/null @@ -1,24 +0,0 @@ ---- -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - namespace: argocd - name: gpu-monitoring - finalizers: - - resources-finalizer.argocd.argoproj.io -spec: - project: default - destination: - name: in-cluster - namespace: gpu-monitoring - syncPolicy: - #automated: - # prune: true - # selfHeal: true - syncOptions: - - CreateNamespace=true - sources: - - repoURL: https://github.com/bacchus-snu/cd-manifests.git - targetRevision: main - path: argocd/waiter/gpu-monitoring - kustomize: {} diff --git a/argocd/waiter/gpu-monitoring/gpu-monitoring-dashboard.json b/argocd/waiter/gpu-monitoring/gpu-monitoring-dashboard.json deleted file mode 100644 index 4ca0c609..00000000 --- a/argocd/waiter/gpu-monitoring/gpu-monitoring-dashboard.json +++ /dev/null @@ -1,3156 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 1, - "links": [], - "liveNow": false, - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 2, - "panels": [], - "repeat": "instance", - "repeatDirection": "h", - "title": "Node Overview - $instance", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "dtdurations", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 0, - "y": 1 - }, - "id": 3, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "node_time_seconds{job=\"node-exporter\", instance=~\"$instance\"}\n -\nnode_boot_time_seconds{job=\"node-exporter\", instance=~\"$instance\"}", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "Uptime", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 0.85 - }, - { - "color": "#E24D42", - "value": 0.95 - } - ] - }, - "unit": "percentunit", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 2, - "x": 3, - "y": 1 - }, - "id": 4, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(rate(node_cpu_seconds_total{job=\"node-exporter\", instance=~\"$instance\", mode!=\"idle\"}[$__rate_interval]))\n /\nsum(rate(node_cpu_seconds_total{job=\"node-exporter\", instance=~\"$instance\"}[$__rate_interval]))", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "CPU Util", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 0.8 - }, - { - "color": "red", - "value": 0.9 - } - ] - }, - "unit": "percentunit", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 2, - "x": 5, - "y": 1 - }, - "id": 7, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "1 -\nnode_memory_MemAvailable_bytes{job=\"node-exporter\", instance=~\"$instance\"}\n /\nnode_memory_MemTotal_bytes{job=\"node-exporter\", instance=~\"$instance\"}", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "Memory Used", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 0.8 - }, - { - "color": "red", - "value": 0.9 - } - ] - }, - "unit": "percentunit", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 2, - "x": 7, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "1 -\nnode_filesystem_avail_bytes{job=\"node-exporter\", instance=~\"$instance\", mountpoint=\"/\"}\n /\nnode_filesystem_size_bytes{job=\"node-exporter\", instance=~\"$instance\", mountpoint=\"/\"}", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "RootFS Used", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 0.8 - }, - { - "color": "red", - "value": 0.9 - } - ] - }, - "unit": "percentunit", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 2, - "x": 9, - "y": 1 - }, - "id": 9, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "1 -\nmax(node_filesystem_avail_bytes{job=\"node-exporter\", instance=~\"$instance\", device=\"zpool\"})\n /\nmin(node_filesystem_size_bytes{job=\"node-exporter\", instance=~\"$instance\", device=\"zpool\"})", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "zpool Used", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 85 - }, - { - "color": "#E24D42", - "value": 95 - } - ] - }, - "unit": "percent", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 2, - "x": 11, - "y": 1 - }, - "id": 5, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "avg(DCGM_FI_DEV_GPU_UTIL{job=\"gpu-dcgm-exporter\", instance=~\"$instance\"})", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "GPU Util", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "none", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 2, - "x": 13, - "y": 1 - }, - "id": 6, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Pending\"})", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "Pending Pods", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "red", - "value": 4 - }, - { - "color": "#EAB839", - "value": 16 - }, - { - "color": "green", - "value": 32 - } - ] - }, - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 15, - "y": 1 - }, - "id": 26, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(kube_node_status_allocatable{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"cpu\"})\n -\nsum(\n kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Running\"}\n * on (namespace, pod)\n sum(kube_pod_container_resource_requests{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"cpu\"}) by (namespace, pod)\n)", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "vCPU Allocatable", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "red", - "value": 4096 - }, - { - "color": "orange", - "value": 8192 - }, - { - "color": "green", - "value": 16384 - } - ] - }, - "unit": "bytes", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 18, - "y": 1 - }, - "id": 27, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(kube_node_status_allocatable{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"memory\"})\n -\nsum(\n kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Running\"}\n * on (namespace, pod)\n sum(kube_pod_container_resource_requests{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"memory\"}) by (namespace, pod)\n)", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "Memory Allocatable", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "#EAB839", - "value": 2 - }, - { - "color": "green", - "value": 4 - } - ] - }, - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 21, - "y": 1 - }, - "id": 28, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(kube_node_status_allocatable{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"nvidia_com_gpu\"}) -\nsum(\n kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Running\"}\n * on (namespace, pod)\n sum(kube_pod_container_resource_requests{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n)", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "GPU Allocatable", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "percent" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 2, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit", - "unitScale": true - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "idle" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 5 - }, - "id": 10, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(rate(node_cpu_seconds_total{job=\"node-exporter\", instance=~\"$instance\", mode!=\"idle\"}[$__rate_interval])) by (mode)\n / ignoring (mode) group_left()\nsum(rate(node_cpu_seconds_total{job=\"node-exporter\", instance=~\"$instance\"}[$__rate_interval]))", - "hide": false, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(rate(node_cpu_seconds_total{job=\"node-exporter\", instance=~\"$instance\", mode=\"idle\"}[$__rate_interval]))\n /\nsum(rate(node_cpu_seconds_total{job=\"node-exporter\", instance=~\"$instance\"}[$__rate_interval]))", - "instant": false, - "legendFormat": "idle", - "range": true, - "refId": "A" - } - ], - "title": "CPU Utilization", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes", - "unitScale": true - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "custom.stacking", - "value": { - "group": "A", - "mode": "none" - } - } - ] - } - ] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 5 - }, - "id": 11, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "node_memory_MemTotal_bytes{job=\"node-exporter\", instance=~\"$instance\"}", - "instant": false, - "legendFormat": "Total", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "node_memory_MemTotal_bytes{job=\"node-exporter\", instance=~\"$instance\"}\n -\nnode_memory_MemFree_bytes{job=\"node-exporter\", instance=~\"$instance\"}\n -\nnode_memory_Cached_bytes{job=\"node-exporter\", instance=~\"$instance\"}\n -\nnode_memory_Buffers_bytes{job=\"node-exporter\", instance=~\"$instance\"}\n -\nnode_memory_SReclaimable_bytes{job=\"node-exporter\", instance=~\"$instance\"}", - "hide": false, - "instant": false, - "legendFormat": "Used", - "range": true, - "refId": "C" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "node_memory_MemFree_bytes{job=\"node-exporter\", instance=~\"$instance\"}", - "hide": false, - "instant": false, - "legendFormat": "Free", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "node_memory_Cached_bytes{job=\"node-exporter\", instance=~\"$instance\"}\n +\nnode_memory_Buffers_bytes{job=\"node-exporter\", instance=~\"$instance\"}\n +\nnode_memory_SReclaimable_bytes{job=\"node-exporter\", instance=~\"$instance\"}", - "hide": false, - "instant": false, - "legendFormat": "Cache + Buffer", - "range": true, - "refId": "D" - } - ], - "title": "Memory Usage", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 16, - "y": 5 - }, - "id": 12, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "1 -\nnode_filesystem_avail_bytes{job=\"node-exporter\", instance=~\"$instance\", mountpoint=\"/\"}\n /\nnode_filesystem_size_bytes{job=\"node-exporter\", instance=~\"$instance\", mountpoint=\"/\"}", - "instant": false, - "legendFormat": "RootFS", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "1 -\nmax(node_filesystem_avail_bytes{job=\"node-exporter\", instance=~\"$instance\", device=\"zpool\"})\n /\nmin(node_filesystem_size_bytes{job=\"node-exporter\", instance=~\"$instance\", device=\"zpool\"})", - "hide": false, - "instant": false, - "legendFormat": "zpool", - "range": true, - "refId": "B" - } - ], - "title": "Disk Usage", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "binBps", - "unitScale": true - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "trans .*" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - } - ] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 11 - }, - "id": 13, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=~\"$instance\", device=~\"ens.*\"}[$__rate_interval])) by (device) > 0", - "hide": false, - "instant": false, - "legendFormat": "recv {{device}}", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=~\"$instance\", device=~\"ens.*\"}[$__rate_interval])) by (device) > 0", - "instant": false, - "legendFormat": "trans {{device}}", - "range": true, - "refId": "A" - } - ], - "title": "Network Traffic", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "watt", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 11 - }, - "id": 14, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(node_hwmon_power_average_watt{job=\"node-exporter\", instance=~\"$instance\"}) by (chip, sensor)", - "instant": false, - "legendFormat": "{{chip}} {{sensor}}", - "range": true, - "refId": "A" - } - ], - "title": "hwmon Power Usage", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "celsius", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 16, - "y": 11 - }, - "id": 16, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(node_hwmon_temp_celsius{job=\"node-exporter\", instance=~\"$instance\"}) by (chip, sensor)\n * on (chip) group_left(chip_name)\nnode_hwmon_chip_names{job=\"node-exporter\", instance=~\"$instance\"}\n * on (chip, sensor) group_left(label)\nnode_hwmon_sensor_label{job=\"node-exporter\", instance=~\"$instance\"}", - "instant": false, - "legendFormat": "{{chip_name}} {{label}}", - "range": true, - "refId": "A" - } - ], - "title": "hwmon Temperature", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 17 - }, - "id": 18, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{job=\"gpu-dcgm-exporter\", instance=~\"$instance\"}) by (gpu)", - "instant": false, - "legendFormat": "GPU {{gpu}}", - "range": true, - "refId": "A" - } - ], - "title": "DCGM GPU Utilization", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "watt", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 17 - }, - "id": 15, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(DCGM_FI_DEV_POWER_USAGE{job=\"gpu-dcgm-exporter\", instance=~\"$instance\"}) by (gpu)", - "instant": false, - "legendFormat": "GPU {{gpu}}", - "range": true, - "refId": "A" - } - ], - "title": "DCGM Power Usage", - "transformations": [], - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "celsius", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 16, - "y": 17 - }, - "id": 17, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(DCGM_FI_DEV_GPU_TEMP{job=\"gpu-dcgm-exporter\", instance=~\"$instance\"}) by (gpu)", - "instant": false, - "legendFormat": "GPU {{gpu}}", - "range": true, - "refId": "A" - } - ], - "title": "DCGM Temperature", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 23 - }, - "id": 19, - "panels": [], - "title": "Kubernetes - $instance", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unitScale": true - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "custom.stacking", - "value": { - "group": "A", - "mode": "none" - } - }, - { - "id": "custom.fillOpacity", - "value": 0 - }, - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "shades" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 6, - "x": 0, - "y": 24 - }, - "id": 21, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(\n kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Running\"}\n * on (namespace, pod)\n sum(kube_pod_container_resource_requests{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"cpu\"}) by (namespace, pod)\n) by (namespace)", - "hide": false, - "instant": false, - "legendFormat": "{{namespace}}", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "kube_node_status_allocatable{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"cpu\"}", - "hide": false, - "instant": false, - "legendFormat": "Total", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(kube_node_status_allocatable{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"cpu\"})\n -\nsum(\n kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Running\"}\n * on (namespace, pod)\n sum(kube_pod_container_resource_requests{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"cpu\"}) by (namespace, pod)\n)", - "hide": false, - "instant": false, - "legendFormat": "Free", - "range": true, - "refId": "C" - } - ], - "title": "CPU Allocation", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes", - "unitScale": true - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "custom.stacking", - "value": { - "group": "A", - "mode": "none" - } - }, - { - "id": "custom.fillOpacity", - "value": 0 - }, - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 6, - "x": 6, - "y": 24 - }, - "id": 22, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(\n kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Running\"}\n * on (namespace, pod)\n sum(kube_pod_container_resource_requests{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"memory\"}) by (namespace, pod)\n) by (namespace)", - "instant": false, - "legendFormat": "{{namespace}}", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "kube_node_status_allocatable{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"memory\"}", - "hide": false, - "instant": false, - "legendFormat": "Total", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(kube_node_status_allocatable{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"memory\"})\n -\nsum(\n kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Running\"}\n * on (namespace, pod)\n sum(kube_pod_container_resource_requests{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"memory\"}) by (namespace, pod)\n)", - "hide": false, - "instant": false, - "legendFormat": "Free", - "range": true, - "refId": "C" - } - ], - "title": "Memory Allocation", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unitScale": true - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "custom.stacking", - "value": { - "group": "A", - "mode": "none" - } - }, - { - "id": "custom.fillOpacity", - "value": 0 - }, - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 6, - "x": 12, - "y": 24 - }, - "id": 23, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(\n kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Running\"}\n * on (namespace, pod)\n sum(kube_pod_container_resource_requests{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n) by (namespace)", - "instant": false, - "legendFormat": "{{namespace}}", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "kube_node_status_allocatable{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"nvidia_com_gpu\"}", - "hide": false, - "instant": false, - "legendFormat": "Total", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(kube_node_status_allocatable{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"nvidia_com_gpu\"}) -\nsum(\n kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Running\"}\n * on (namespace, pod)\n sum(kube_pod_container_resource_requests{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n)", - "hide": false, - "instant": false, - "legendFormat": "Free", - "range": true, - "refId": "C" - } - ], - "title": "GPU Allocation", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "dtdurations", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 5, - "w": 6, - "x": 18, - "y": 24 - }, - "id": 24, - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" - ], - "show": false - }, - "showHeader": true, - "sortBy": [ - { - "desc": true, - "displayName": "1w" - } - ] - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum_over_time(\n sum(\n kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Running\"}\n * on (namespace, pod)\n sum(kube_pod_container_resource_requests{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n ) by (namespace) [24h:1m]\n) * 60 > 0", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum_over_time(\n sum(\n kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Running\"}\n * on (namespace, pod)\n sum(kube_pod_container_resource_requests{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n ) by (namespace) [1w:1m]\n) * 60 > 0", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum_over_time(\n sum(\n kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Running\"}\n * on (namespace, pod)\n sum(kube_pod_container_resource_requests{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n ) by (namespace) [30d:1m]\n) * 60 > 0", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "C" - } - ], - "title": "Total GPU-Hours", - "transformations": [ - { - "id": "merge", - "options": {} - }, - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true - }, - "indexByName": {}, - "renameByName": { - "Value #A": "24hr", - "Value #B": "1w", - "Value #C": "30d" - } - } - } - ], - "type": "table" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "dtdurations", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 5, - "w": 6, - "x": 18, - "y": 29 - }, - "id": 29, - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" - ], - "show": false - }, - "frameIndex": 0, - "showHeader": true, - "sortBy": [ - { - "desc": true, - "displayName": "waiting for" - } - ] - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(sum_over_time(kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Pending\"}[24h:1m])) by (namespace, pod)\n * on (namespace, pod)\n(kube_pod_status_phase{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", phase=\"Pending\"} > 0)\n* 60", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - } - ], - "title": "Pending Pods", - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true, - "Value": false, - "__name__": true, - "instance": true, - "job": true, - "phase": true, - "uid": true - }, - "indexByName": {}, - "renameByName": { - "Value": "pending for", - "pod": "" - } - } - } - ], - "type": "table" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "filterable": false, - "inspect": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unitScale": true - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": ".*%" - }, - "properties": [ - { - "id": "unit", - "value": "percentunit" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 0.8 - }, - { - "color": "red", - "value": 0.9 - } - ] - } - }, - { - "id": "color" - }, - { - "id": "custom.cellOptions", - "value": { - "mode": "gradient", - "type": "gauge" - } - }, - { - "id": "max", - "value": 1 - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "memory.*[^%]" - }, - "properties": [ - { - "id": "unit", - "value": "bytes" - } - ] - } - ] - }, - "gridPos": { - "h": 11, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 25, - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "enablePagination": false, - "fields": [], - "reducer": [ - "sum" - ], - "show": false - }, - "showHeader": true, - "sortBy": [ - { - "desc": true, - "displayName": "gpu %" - } - ] - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(kube_resourcequota{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"requests.cpu\", type=\"hard\"}) by (namespace)", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "cpu hard" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(kube_resourcequota{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"requests.cpu\", type=\"used\"}) by (namespace)", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "cpu used" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(kube_resourcequota{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"requests.cpu\", type=\"used\"}) by (namespace)\n /\nsum(kube_resourcequota{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"requests.cpu\", type=\"hard\"}) by (namespace)", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "cpu %" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(kube_resourcequota{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"requests.memory\", type=\"hard\"}) by (namespace)", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "memory hard" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(kube_resourcequota{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"requests.memory\", type=\"used\"}) by (namespace)", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "memory used" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(kube_resourcequota{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"requests.memory\", type=\"used\"}) by (namespace)\n /\nsum(kube_resourcequota{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"requests.memory\", type=\"hard\"}) by (namespace)", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "memory %" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(kube_resourcequota{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"requests.nvidia.com/gpu\", type=\"hard\"}) by (namespace)", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "gpu hard" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(kube_resourcequota{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"requests.nvidia.com/gpu\", type=\"used\"}) by (namespace)", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "gpu used" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(kube_resourcequota{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"requests.nvidia.com/gpu\", type=\"used\"}) by (namespace)\n /\nsum(kube_resourcequota{job=\"gpu-kube-state-metrics\", instance=~\"$instance\", resource=\"requests.nvidia.com/gpu\", type=\"hard\"}) by (namespace)", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "gpu %" - } - ], - "title": "Quota Usage", - "transformations": [ - { - "id": "merge", - "options": {} - }, - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true - }, - "indexByName": {}, - "renameByName": { - "namespace": "" - } - } - }, - { - "id": "renameByRegex", - "options": { - "regex": ".* #(.*)", - "renamePattern": "$1" - } - } - ], - "type": "table" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none", - "unitScale": true - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 45 - }, - "id": 30, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(\n sum(kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n * on (namespace, pod)\n sum(kube_pod_status_phase{phase=\"Running\", job=\"gpu-kube-state-metrics\"}) by (namespace, pod)\n) by (namespace)\n -\n(count(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\"} > 10) by (namespace))\n or\nsum(\n sum(kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n * on (namespace, pod)\n sum(kube_pod_status_phase{phase=\"Running\", job=\"gpu-kube-state-metrics\"}) by (namespace, pod)\n) by (namespace)", - "instant": false, - "legendFormat": "{{namespace}}", - "range": true, - "refId": "A" - } - ], - "title": "Wasted GPUs", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, - "mappings": [], - "max": 1, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "dtdurations", - "unitScale": true - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "1w %" - }, - "properties": [ - { - "id": "unit", - "value": "percentunit" - }, - { - "id": "custom.cellOptions", - "value": { - "mode": "gradient", - "type": "gauge" - } - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 0.5 - }, - { - "color": "red", - "value": 0.8 - } - ] - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 45 - }, - "id": 32, - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" - ], - "show": false - }, - "showHeader": true, - "sortBy": [ - { - "desc": true, - "displayName": "1w" - } - ] - }, - "pluginVersion": "10.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum_over_time((sum(\n sum(kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n * on (namespace, pod)\n sum(kube_pod_status_phase{phase=\"Running\", job=\"gpu-kube-state-metrics\"}) by (namespace, pod)\n) by (namespace)\n -\n(count(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\"} > 10) by (namespace))\n or\nsum(\n sum(kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n * on (namespace, pod)\n sum(kube_pod_status_phase{phase=\"Running\", job=\"gpu-kube-state-metrics\"}) by (namespace, pod)\n) by (namespace))[24h:1m]) * 60", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum_over_time((sum(\n sum(kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n * on (namespace, pod)\n sum(kube_pod_status_phase{phase=\"Running\", job=\"gpu-kube-state-metrics\"}) by (namespace, pod)\n) by (namespace)\n -\n(count(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\"} > 10) by (namespace))\n or\nsum(\n sum(kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n * on (namespace, pod)\n sum(kube_pod_status_phase{phase=\"Running\", job=\"gpu-kube-state-metrics\"}) by (namespace, pod)\n) by (namespace))[1w:1m]) * 60", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum_over_time((sum(\n sum(kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n * on (namespace, pod)\n sum(kube_pod_status_phase{phase=\"Running\", job=\"gpu-kube-state-metrics\"}) by (namespace, pod)\n) by (namespace)\n -\n(count(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\"} > 10) by (namespace))\n or\nsum(\n sum(kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n * on (namespace, pod)\n sum(kube_pod_status_phase{phase=\"Running\", job=\"gpu-kube-state-metrics\"}) by (namespace, pod)\n) by (namespace))[30d:1m]) * 60", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "C" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "1 -\nsum_over_time(count(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\"} > 10) by (namespace)[1w:1m])\n /\nsum_over_time(sum(\n sum(kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}) by (namespace, pod)\n * on (namespace, pod)\n sum(kube_pod_status_phase{phase=\"Running\", job=\"gpu-kube-state-metrics\"}) by (namespace, pod)\n) by (namespace)[1w:1m])", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "D" - } - ], - "title": "Total Wasted GPU-Hours", - "transformations": [ - { - "id": "merge", - "options": {} - }, - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true - }, - "indexByName": { - "Time": 0, - "Value #A": 2, - "Value #B": 3, - "Value #C": 5, - "Value #D": 4, - "namespace": 1 - }, - "renameByName": { - "Value #A": "24h", - "Value #B": "1w", - "Value #C": "30d", - "Value #D": "1w %" - } - } - } - ], - "type": "table" - } - ], - "refresh": "1m", - "schemaVersion": 39, - "tags": [], - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "Prometheus", - "value": "prometheus" - }, - "hide": 0, - "includeAll": false, - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" - }, - { - "current": { - "selected": true, - "text": [ - "ferrari.snucse.org:30080" - ], - "value": [ - "ferrari.snucse.org:30080" - ] - }, - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "definition": "label_values(up{job=\"gpu-kube-state-metrics\"},instance)", - "hide": 0, - "includeAll": true, - "multi": true, - "name": "instance", - "options": [], - "query": { - "query": "label_values(up{job=\"gpu-kube-state-metrics\"},instance)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "SNUCSE GPU Service", - "uid": "add7ee94-d375-4a26-b7df-405efd14e014", - "version": 1, - "weekStart": "" -} diff --git a/argocd/waiter/gpu-monitoring/kustomization.yaml b/argocd/waiter/gpu-monitoring/kustomization.yaml deleted file mode 100644 index 87a07d73..00000000 --- a/argocd/waiter/gpu-monitoring/kustomization.yaml +++ /dev/null @@ -1,11 +0,0 @@ -resources: - - scrapeconfigs.yaml - -configMapGenerator: - - name: gpu-monitoring-dashboard - files: - - gpu-monitoring-dashboard.json - options: - disableNameSuffixHash: true - labels: - grafana_dashboard: '1' diff --git a/argocd/waiter/gpu-monitoring/scrapeconfigs.yaml b/argocd/waiter/gpu-monitoring/scrapeconfigs.yaml deleted file mode 100644 index 4baa41fe..00000000 --- a/argocd/waiter/gpu-monitoring/scrapeconfigs.yaml +++ /dev/null @@ -1,48 +0,0 @@ ---- -apiVersion: monitoring.coreos.com/v1alpha1 -kind: ScrapeConfig -metadata: - namespace: gpu-monitoring - name: gpu-kube-state-metrics - labels: - release: dashboard -spec: - metricsPath: /kube-state/metrics - staticConfigs: - - labels: - job: gpu-kube-state-metrics - targets: - - ferrari.snucse.org:30080 ---- -apiVersion: monitoring.coreos.com/v1alpha1 -kind: ScrapeConfig -metadata: - namespace: gpu-monitoring - name: gpu-node-exporter - labels: - release: dashboard -spec: - metricsPath: /node-exporter/metrics - staticConfigs: - # use the same label as the in-cluster node-expoter, as node-exporter only - # has node-level metrics and thus the metrics cannot collide with - # in-cluster nodes - - labels: - job: node-exporter - targets: - - ferrari.snucse.org:30080 ---- -apiVersion: monitoring.coreos.com/v1alpha1 -kind: ScrapeConfig -metadata: - namespace: gpu-monitoring - name: gpu-dcgm-exporter - labels: - release: dashboard -spec: - metricsPath: /dcgm-exporter/metrics - staticConfigs: - - labels: - job: gpu-dcgm-exporter - targets: - - ferrari.snucse.org:30080