Backend: Integrate alert manager to monitoring setup (#3507)

* add view name to statsd metrics * cleanup unnecessary files with gitignore and fix import style * switch to views instead of endpoint for metrics * fix import order * rebase on top of statsd * add pushgateway for django-prometheus and export sample push custom metrics * add monitoring for submission queue using prometheus counters * fix staging configs and add labels to queue counters * fix staging configs and add labels to queue counters * fix tests for submission worker * push metrics to gateway using prometheus client * cleanup unnecessary files * fix push metrics key for submission worker * fix import order * cleanup unnecessary files with gitignore * change counters name for queue monitoring * add view name to statsd metrics * cleanup unnecessary files with gitignore and fix import style * add grafana env variables for staging * add configs to deploy gateway and nodeexporter to staging setup * fix configs for staging server * fix staging configs and add labels to queue counters * Integrate alert manager to monitoring setup * fix configs for staging server * add newlines to files and update gitignore * fix configs and alert templates * add secret file for alertmanager config for staging * cleanup unnecessary files * cleanup unnecessary files with gitignore * fix alertmanager rules for statsd metrics * fix dev uwsgi settings * fix route for exposing alertmanager on staging * rebase on top of node-exporter * update auto-deploy command * rebase on top of master * fix alert rules * change alert message for api threshold * revert changes for setting alertmanager on same instance * fix configs of alertmanager for new nginx-ingress * fix indentation of autodeployment script * add auto deploy commands to setup alertmanager * rebase after removing pushgateway * fix alertmanager configs to keep dev and staging consistent * fix alertmanager route names * remove default alertmanager endpoint configs * cleanup files * add default receiver to alertmanager * change name of 
alertmanager config file Co-authored-by: Rishabh Jain <[email protected]>
Cloud-CV · Aug 17, 2021 · bc7b8eb · bc7b8eb
1 parent c9b05b5
commit bc7b8eb
Show file tree

Hide file tree

Showing 12 changed files with 164 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -71,3 +71,5 @@ out/
 monitoring/prometheus/prometheus_db/*
 !monitoring/prometheus/prometheus_db/.gitkeep
 monitoring/grafana/grafana_db/grafana.db
+monitoring/alertmanager/data/*
+!monitoring/alertmanager/data/.gitkeep
diff --git a/docker-compose-monitoring.yml b/docker-compose-monitoring.yml
@@ -5,6 +5,7 @@ services:
     user: "1000"
     volumes:
       - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./monitoring/prometheus/rules.yml:/etc/rules/rules.yml
       - ./monitoring/prometheus/prometheus_db:/var/lib/prometheus
       - ./monitoring/prometheus/prometheus_db:/prometheus
       - ./monitoring/prometheus/prometheus_db:/etc/prometheus
@@ -30,3 +31,18 @@ services:
     image: prom/node-exporter
     ports:
       - "9100:9100"
+
+  alert-manager:
+    hostname: alert_manager  # name targeted by prometheus.yml (alert_manager:9093)
+    image: prom/alertmanager
+    user: "1000"
+    volumes:
+      - ./monitoring/prometheus:/prometheus  # provides /prometheus/alert_manager.yml
+      - ./monitoring/alertmanager/data:/data
+      - ./monitoring/alertmanager/templates:/etc/alertmanager/templates
+    command:
+      - '--config.file=/prometheus/alert_manager.yml'
+      - '--storage.path=/data'
+      - '--web.external-url=http://localhost:9093/alert_manager'  # URL path must stay in sync with path_prefix in prometheus.yml
+    ports:
+      - "9093:9093"  # quoted like the other port mappings in this file
diff --git a/docker-compose-staging.yml b/docker-compose-staging.yml
@@ -101,6 +101,7 @@ services:
     user: "1000"
     volumes:
       - ./monitoring/prometheus/prometheus_staging.yml:/etc/prometheus/prometheus.yml
+      - ./monitoring/prometheus/rules.yml:/etc/rules/rules.yml
       - ./monitoring/prometheus/prometheus_db:/var/lib/prometheus
       - ./monitoring/prometheus/prometheus_db:/prometheus
       - ./monitoring/prometheus/prometheus_db:/etc/prometheus
@@ -151,6 +152,22 @@ services:
       - prometheus
       - grafana
       - statsd-exporter
+      - alert-manager
     ports:
       - '80:80'
       - '443:443'
+
+  alert-manager:
+    hostname: alert_manager  # name used by the nginx upstream in nginx_staging.conf
+    image: prom/alertmanager
+    user: "1000"
+    volumes:
+      - ./monitoring/prometheus:/prometheus  # provides /prometheus/alert_manager.yml
+      - ./monitoring/alertmanager/data:/data
+      - ./monitoring/alertmanager/templates:/etc/alertmanager/templates
+    command:
+      - '--config.file=/prometheus/alert_manager.yml'
+      - '--storage.path=/data'
+      - '--web.external-url=http://monitoring-staging.eval.ai/alert_manager'  # host proxied by nginx_staging.conf (was localhost); path doubles as the route prefix
+    ports:
+      - '9093:9093'
diff --git a/docker/prod/nginx-ingress/nginx_staging.conf b/docker/prod/nginx-ingress/nginx_staging.conf
@@ -10,6 +10,10 @@ upstream statsd_exporter {
   server statsd:9102 fail_timeout=0;
 }
 
+upstream alert_manager {  # Alertmanager container; hostname is set in docker-compose-staging.yml
+  server alert_manager:9093 fail_timeout=0;
+}
+
 server {
   server_name monitoring-staging.eval.ai;
   listen 80;
@@ -54,4 +58,11 @@ server {
     proxy_set_header X-Forwarded-Proto $scheme;
     proxy_pass http://statsd_exporter;
   }
+
+  location /alert_manager {  # same path as --web.external-url and the Prometheus path_prefix
+    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+    proxy_set_header Host $host;
+    proxy_set_header X-Forwarded-Proto $scheme;
+    proxy_pass http://alert_manager;  # forwards to the upstream alert_manager block
+  }
 }
diff --git a/monitoring/alertmanager/data/.gitkeep b/monitoring/alertmanager/data/.gitkeep
diff --git a/monitoring/alertmanager/templates/apis.tmpl b/monitoring/alertmanager/templates/apis.tmpl
@@ -0,0 +1,16 @@
+{{ define "slack.apis.title" -}}{{- /* Title: shared title annotation when present, otherwise a fixed fallback. */ -}}
+    {{- if not .CommonAnnotations.title -}}
+        API-Threshold-Exceeded
+    {{- else -}}
+        {{- .CommonAnnotations.title -}}
+    {{- end -}}
+{{- end }}
+{{ define "slack.apis.text" -}}{{- /* Text: shared description when present, otherwise one line per alert. */ -}}
+    {{- if not .CommonAnnotations.description -}}
+        {{- range .Alerts }}
+            {{- "\n" }}{{ .Annotations.description -}}
+        {{- end -}}
+    {{- else -}}
+        {{- .CommonAnnotations.description -}}
+    {{- end -}}
+{{- end }}
diff --git a/monitoring/alertmanager/templates/instances.tmpl b/monitoring/alertmanager/templates/instances.tmpl
@@ -0,0 +1,18 @@
+{{ define "slack.instances.title" -}}{{- /* Title: shared title annotation when present, otherwise the first alert's title. */ -}}
+    {{- if not .CommonAnnotations.title -}}
+        {{- with index .Alerts 0 -}}
+            {{- .Annotations.title -}}
+        {{- end -}}
+    {{- else -}}
+        {{- .CommonAnnotations.title -}}
+    {{- end -}}
+{{- end }}
+{{ define "slack.instances.text" -}}{{- /* Text: shared description when present, otherwise one line per alert. */ -}}
+    {{- if not .CommonAnnotations.description -}}
+        {{- range .Alerts }}
+            {{- "\n" }}{{ .Annotations.description -}}
+        {{- end -}}
+    {{- else -}}
+        {{- .CommonAnnotations.description -}}
+    {{- end -}}
+{{- end }}
diff --git a/monitoring/prometheus/alert_manager.yml b/monitoring/prometheus/alert_manager.yml
@@ -0,0 +1,31 @@
+global:
+  resolve_timeout: 10s
+  slack_api_url: 'https://hooks.slack.com/services/x/x/x'  # placeholder; the auto-deploy step in deploy.sh copies the real config over this file
+
+route:
+  receiver: 'slack-apis-notifications'  # default receiver when no child route matches
+  group_interval: 10s
+  repeat_interval: 10s  # NOTE(review): re-sends still-firing alerts every 10s (Alertmanager default is 4h) - confirm this is only meant for local testing
+  routes:
+    - matchers: ['group = api']  # "group" label is set per rule in rules.yml
+      receiver: 'slack-apis-notifications'
+    - matchers: ['group = instance']
+      receiver: 'slack-instance-notifications'
+
+receivers:
+  - name: 'slack-apis-notifications'
+    slack_configs:
+      - channel: '#x'
+        send_resolved: false
+        title: '{{ template "slack.apis.title" . }}'
+        text: '{{ template "slack.apis.text" . }}'
+  - name: 'slack-instance-notifications'
+    slack_configs:
+      - channel: '#x'
+        send_resolved: false
+        title: '{{ template "slack.instances.title" . }}'
+        text: '{{ template "slack.instances.text" . }}'
+
+templates:
+  - '/etc/alertmanager/templates/instances.tmpl'
+  - '/etc/alertmanager/templates/apis.tmpl'
diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml
@@ -2,6 +2,10 @@ global:
   scrape_interval: 30s
   external_labels:
     monitor: 'evalai-monitor'
+
+rule_files:
+  - /etc/rules/rules.yml  # mounted from ./monitoring/prometheus/rules.yml in docker-compose-monitoring.yml
+
 scrape_configs:
   - job_name: 'prometheus' 
     static_configs: 
@@ -16,3 +20,9 @@ scrape_configs:
     metrics_path: '/metrics'
     static_configs:
       - targets: ['node_exporter:9100']
+
+alerting:
+  alertmanagers:
+    - path_prefix: '/alert_manager'  # matches the path of --web.external-url on the alert-manager service
+      static_configs:
+        - targets: ['alert_manager:9093']
diff --git a/monitoring/prometheus/prometheus_staging.yml b/monitoring/prometheus/prometheus_staging.yml
@@ -3,6 +3,9 @@ global:
   external_labels:
     monitor: 'evalai-monitor'
 
+rule_files:
+  - /etc/rules/rules.yml  # mounted from ./monitoring/prometheus/rules.yml in docker-compose-staging.yml
+
 scrape_configs:
   - job_name: 'prometheus' 
     metrics_path: '/prometheus/metrics'
@@ -18,3 +21,9 @@ scrape_configs:
     metrics_path: '/node_exporter'
     static_configs:
       - targets: ['staging.eval.ai']
+
+alerting:
+  alertmanagers:
+    - path_prefix: '/alert_manager'  # nginx_staging.conf proxies this location to the alert_manager upstream
+      static_configs:
+        - targets: ['monitoring-staging.eval.ai']
diff --git a/monitoring/prometheus/rules.yml b/monitoring/prometheus/rules.yml
@@ -0,0 +1,24 @@
+groups:
+  - name: API-Threshold-Exceeded
+    rules:
+      - alert: API-Threshold-Exceeded-5XX  # NOTE(review): only GET requests are matched - confirm other methods are excluded on purpose
+        expr: 'rate(django_request_count{job="statsd",method="GET",status=~"5..",view=~"jobs:.*|challenges:.*"}[5m]) > 0.3'
+        for: 5m
+        labels:
+          severity: 'critical'
+          group: 'api'  # selects the "group = api" route in alert_manager.yml
+        annotations:
+          title: 'API-Threshold-Exceeded - 5XX'
+          description: '•*{{ $labels.view }}* had *{{ $value | printf "%.1f" }}* QPS rate with the response code of *5XX* in the last 5 minutes'
+
+  - name: Instance-Status
+    rules:
+      - alert: InstanceDown
+        expr: 'up == 0'
+        for: 5m
+        labels:
+          severity: 'major'
+          group: 'instance'  # selects the "group = instance" route in alert_manager.yml
+        annotations:
+          title: 'Instance(s) Down'
+          description: '•*{{ $labels.instance }}* of prometheus job *{{ $labels.job }}* has been down for more than 5 minutes'
diff --git a/scripts/deployment/deploy.sh b/scripts/deployment/deploy.sh
@@ -58,9 +58,10 @@ case $opt in
 					export COMMIT_ID=${COMMIT_ID}
 					eval $(aws ecr get-login --no-include-email)
 					aws s3 cp s3://cloudcv-secrets/evalai/${env}/docker_${env}.env ./docker/prod/docker_${env}.env
+					aws s3 cp s3://cloudcv-secrets/evalai/${env}/alert_manager.yml ./monitoring/prometheus/alert_manager.yml  # fetch the environment-specific Alertmanager config from the secrets bucket
 					docker-compose -f docker-compose-${env}.yml rm -s -v -f
-					docker-compose -f docker-compose-${env}.yml pull nginx-ingress prometheus grafana statsd-exporter
-					docker-compose -f docker-compose-${env}.yml up -d --force-recreate --remove-orphans nginx-ingress prometheus grafana statsd-exporter
+					docker-compose -f docker-compose-${env}.yml pull nginx-ingress prometheus grafana statsd-exporter alert-manager
+					docker-compose -f docker-compose-${env}.yml up -d --force-recreate --remove-orphans nginx-ingress prometheus grafana statsd-exporter alert-manager
 				ENDSSH2
 			ENDSSH
             ;;
@@ -168,6 +169,11 @@ case $opt in
             docker-compose -f docker-compose-${env}.yml up -d node_exporter
             echo "Completed deploy operation."
             ;;
+        deploy-alert-manager)  # bring up the alert-manager compose service, detached, for the selected environment
+            echo "Deploying alertmanager docker container..."
+            docker-compose -f docker-compose-${env}.yml up -d alert-manager
+            echo "Completed deploy operation."
+            ;;
         scale)
             service=${3}
             instances=${4}
@@ -213,6 +219,8 @@ case $opt in
         echo "        Eg. ./scripts/deployment/deploy.sh deploy-statsd production"
         echo "    deploy-node-exporter : Deploy node_exporter container in the respective environment."
         echo "        Eg. ./scripts/deployment/deploy.sh deploy-node-exporter production"
+        echo "    deploy-alert-manager : Deploy alertmanager container in the respective environment."
+        echo "        Eg. ./scripts/deployment/deploy.sh deploy-alert-manager production"
         echo "    scale  : Scale particular docker service in an environment."
         echo "        Eg. ./scripts/deployment/deploy.sh scale production django 5"
         echo "    clean  : Remove all docker containers and images."