helm(major): add prometheus rules, add switch to enable/disable monitoring

This commit is contained in:
Langhammer, Jens 2019-11-08 13:49:28 +01:00
parent 3d73aac3ab
commit 2c0f0a68a8
7 changed files with 219 additions and 15 deletions

View file

@ -0,0 +1,10 @@
ingress:
enabled: true
hosts:
- some.address.tld
grafana.ini:
auth.anonymous:
enabled: true
org_name: Main Org.
org_role: Viewer

View file

@ -0,0 +1,63 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources:
- configmaps
verbs: ["get"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: prod-passbook-ng
---
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
name: prometheus
spec:
serviceAccountName: prometheus
serviceMonitorSelector:
matchLabels:
app.kubernetes.io/name: passbook
enableAdminAPI: false
ruleSelector:
matchLabels:
app.kubernetes.io/name: passbook
storage:
volumeClaimTemplate:
metadata:
labels:
prometheus: k8s
name: prometheus-storage
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 15Gi

View file

@ -0,0 +1,124 @@
{{- if .Values.monitoring.enabled -}}
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ include "passbook.fullname" . }}-static-rules
labels:
app.kubernetes.io/name: {{ include "passbook.name" . }}
helm.sh/chart: {{ include "passbook.chart" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
spec:
groups:
- name: Aggregate request counters
rules:
- record: job:django_http_requests_before_middlewares_total:sum_rate30s
expr: sum(rate(django_http_requests_before_middlewares_total[30s])) by (job)
- record: job:django_http_requests_unknown_latency_total:sum_rate30s
expr: sum(rate(django_http_requests_unknown_latency_total[30s])) by (job)
- record: job:django_http_ajax_requests_total:sum_rate30s
expr: sum(rate(django_http_ajax_requests_total[30s])) by (job)
- record: job:django_http_responses_before_middlewares_total:sum_rate30s
expr: sum(rate(django_http_responses_before_middlewares_total[30s])) by (job)
- record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s
expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s])) by (job)
- record: job:django_http_requests_body_total_bytes:sum_rate30s
expr: sum(rate(django_http_requests_body_total_bytes[30s])) by (job)
- record: job:django_http_responses_streaming_total:sum_rate30s
expr: sum(rate(django_http_responses_streaming_total[30s])) by (job)
- record: job:django_http_responses_body_total_bytes:sum_rate30s
expr: sum(rate(django_http_responses_body_total_bytes[30s])) by (job)
- record: job:django_http_requests_total:sum_rate30s
expr: sum(rate(django_http_requests_total_by_method[30s])) by (job)
- record: job:django_http_requests_total_by_method:sum_rate30s
expr: sum(rate(django_http_requests_total_by_method[30s])) by (job,method)
- record: job:django_http_requests_total_by_transport:sum_rate30s
expr: sum(rate(django_http_requests_total_by_transport[30s])) by (job,transport)
- record: job:django_http_requests_total_by_view:sum_rate30s
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) by (job,view)
- record: job:django_http_requests_total_by_view_transport_method:sum_rate30s
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) by (job,view,transport,method)
- record: job:django_http_responses_total_by_templatename:sum_rate30s
expr: sum(rate(django_http_responses_total_by_templatename[30s])) by (job,templatename)
- record: job:django_http_responses_total_by_status:sum_rate30s
expr: sum(rate(django_http_responses_total_by_status[30s])) by (job,status)
- record: job:django_http_responses_total_by_status_name_method:sum_rate30s
expr: sum(rate(django_http_responses_total_by_status_name_method[30s])) by (job,status,name,method)
- record: job:django_http_responses_total_by_charset:sum_rate30s
expr: sum(rate(django_http_responses_total_by_charset[30s])) by (job,charset)
- record: job:django_http_exceptions_total_by_type:sum_rate30s
expr: sum(rate(django_http_exceptions_total_by_type[30s])) by (job,type)
- record: job:django_http_exceptions_total_by_view:sum_rate30s
expr: sum(rate(django_http_exceptions_total_by_view[30s])) by (job,view)
- name: Aggregate latency histograms
rules:
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.50, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
labels:
quantile: "50"
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
labels:
quantile: "95"
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
labels:
quantile: "99"
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
labels:
quantile: "99.9"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.50, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
labels:
quantile: "50"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
labels:
quantile: "95"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
labels:
quantile: "99"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
labels:
quantile: "99.9"
- name: Aggregate model operations
rules:
- record: job:django_model_inserts_total:sum_rate1m
expr: sum(rate(django_model_inserts_total[1m])) by (job, model)
- record: job:django_model_updates_total:sum_rate1m
expr: sum(rate(django_model_updates_total[1m])) by (job, model)
- record: job:django_model_deletes_total:sum_rate1m
expr: sum(rate(django_model_deletes_total[1m])) by (job, model)
- name: Aggregate database operations
rules:
- record: job:django_db_new_connections_total:sum_rate30s
expr: sum(rate(django_db_new_connections_total[30s])) by (alias, vendor)
- record: job:django_db_new_connection_errors_total:sum_rate30s
expr: sum(rate(django_db_new_connection_errors_total[30s])) by (alias, vendor)
- record: job:django_db_execute_total:sum_rate30s
expr: sum(rate(django_db_execute_total[30s])) by (alias, vendor)
- record: job:django_db_execute_many_total:sum_rate30s
expr: sum(rate(django_db_execute_many_total[30s])) by (alias, vendor)
- record: job:django_db_errors_total:sum_rate30s
expr: sum(rate(django_db_errors_total[30s])) by (alias, vendor, type)
- name: Aggregate migrations
rules:
- record: job:django_migrations_applied_total:max
expr: max(django_migrations_applied_total) by (job, connection)
- record: job:django_migrations_unapplied_total:max
expr: max(django_migrations_unapplied_total) by (job, connection)
- name: Alerts
rules:
- alert: UnappliedMigrations
expr: job:django_migrations_unapplied_total:max > 0
for: 1m
labels:
severity: testing
annotations:
summary: "Unapplied django migrations on {{$labels.connection}}"
description: "Django detected {{$value}} unapplied migrations on database {{$labels.connection}}"
{{- end }}

View file

@ -1,3 +1,4 @@
{{- if .Values.monitoring.enabled -}}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
@ -13,3 +14,4 @@ spec:
selector:
matchLabels:
k8s.passbook.io/component: static
{{- end }}

View file

@ -9,9 +9,9 @@ metadata:
helm.sh/chart: {{ include "passbook.chart" . }}
k8s.passbook.io/component: web
spec:
type: {{ .Values.service.type }}
type: ClusterIP
ports:
- port: {{ .Values.service.port }}
- port: 80
targetPort: http
protocol: TCP
name: http

View file

@ -1,3 +1,4 @@
{{- if .Values.monitoring.enabled -}}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
@ -17,6 +18,8 @@ spec:
name: {{ include "passbook.fullname" . }}-secret-key
key: monitoring_username
port: http
interval: 10s
selector:
matchLabels:
k8s.passbook.io/component: web
{{- end }}

View file

@ -14,19 +14,10 @@ config:
email:
host: localhost
postgresql:
postgresqlDatabase: passbook
redis:
cluster:
enabled: false
master:
persistence:
enabled: false
service:
type: ClusterIP
port: 80
# This Helm chart ships with built-in Prometheus ServiceMonitors and Rules.
# This requires the CoreOS Prometheus Operator.
monitoring:
enabled: false
ingress:
enabled: false
@ -40,3 +31,14 @@ ingress:
# - secretName: chart-example-tls
# hosts:
# - passbook.k8s.local
# These settings configure the packaged PostgreSQL and Redis chart.
postgresql:
postgresqlDatabase: passbook
redis:
cluster:
enabled: false
master:
persistence:
enabled: false