helm(major): add prometheus rules, add switch to enable/disable monitoring
This commit is contained in:
parent
3d73aac3ab
commit
2c0f0a68a8
10
hack/prometheus/grafana.helm.yaml
Normal file
10
hack/prometheus/grafana.helm.yaml
Normal file
|
@ -0,0 +1,10 @@
|
|||
ingress:
|
||||
enabled: true
|
||||
hosts:
|
||||
- some.address.tld
|
||||
|
||||
grafana.ini:
|
||||
auth.anonymous:
|
||||
enabled: true
|
||||
org_name: Main Org.
|
||||
org_role: Viewer
|
63
hack/prometheus/instance.yaml
Normal file
63
hack/prometheus/instance.yaml
Normal file
|
@ -0,0 +1,63 @@
|
|||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: prometheus
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1beta1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: prometheus
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources:
|
||||
- nodes
|
||||
- services
|
||||
- endpoints
|
||||
- pods
|
||||
verbs: ["get", "list", "watch"]
|
||||
- apiGroups: [""]
|
||||
resources:
|
||||
- configmaps
|
||||
verbs: ["get"]
|
||||
- nonResourceURLs: ["/metrics"]
|
||||
verbs: ["get"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1beta1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: prometheus
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: prometheus
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: prometheus
|
||||
namespace: prod-passbook-ng
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: Prometheus
|
||||
metadata:
|
||||
name: prometheus
|
||||
spec:
|
||||
serviceAccountName: prometheus
|
||||
serviceMonitorSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: passbook
|
||||
enableAdminAPI: false
|
||||
ruleSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: passbook
|
||||
storage:
|
||||
volumeClaimTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
prometheus: k8s
|
||||
name: prometheus-storage
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 15Gi
|
124
helm/passbook/templates/prom-rules.yaml
Normal file
124
helm/passbook/templates/prom-rules.yaml
Normal file
|
@ -0,0 +1,124 @@
|
|||
{{- if .Values.monitoring.enabled -}}
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ include "passbook.fullname" . }}-static-rules
|
||||
labels:
|
||||
app.kubernetes.io/name: {{ include "passbook.name" . }}
|
||||
helm.sh/chart: {{ include "passbook.chart" . }}
|
||||
app.kubernetes.io/instance: {{ .Release.Name }}
|
||||
app.kubernetes.io/managed-by: {{ .Release.Service }}
|
||||
spec:
|
||||
groups:
|
||||
- name: Aggregate request counters
|
||||
rules:
|
||||
- record: job:django_http_requests_before_middlewares_total:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_before_middlewares_total[30s])) by (job)
|
||||
- record: job:django_http_requests_unknown_latency_total:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_unknown_latency_total[30s])) by (job)
|
||||
- record: job:django_http_ajax_requests_total:sum_rate30s
|
||||
expr: sum(rate(django_http_ajax_requests_total[30s])) by (job)
|
||||
- record: job:django_http_responses_before_middlewares_total:sum_rate30s
|
||||
expr: sum(rate(django_http_responses_before_middlewares_total[30s])) by (job)
|
||||
- record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s])) by (job)
|
||||
- record: job:django_http_requests_body_total_bytes:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_body_total_bytes[30s])) by (job)
|
||||
- record: job:django_http_responses_streaming_total:sum_rate30s
|
||||
expr: sum(rate(django_http_responses_streaming_total[30s])) by (job)
|
||||
- record: job:django_http_responses_body_total_bytes:sum_rate30s
|
||||
expr: sum(rate(django_http_responses_body_total_bytes[30s])) by (job)
|
||||
- record: job:django_http_requests_total:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_total_by_method[30s])) by (job)
|
||||
- record: job:django_http_requests_total_by_method:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_total_by_method[30s])) by (job,method)
|
||||
- record: job:django_http_requests_total_by_transport:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_total_by_transport[30s])) by (job,transport)
|
||||
- record: job:django_http_requests_total_by_view:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) by (job,view)
|
||||
- record: job:django_http_requests_total_by_view_transport_method:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) by (job,view,transport,method)
|
||||
- record: job:django_http_responses_total_by_templatename:sum_rate30s
|
||||
expr: sum(rate(django_http_responses_total_by_templatename[30s])) by (job,templatename)
|
||||
- record: job:django_http_responses_total_by_status:sum_rate30s
|
||||
expr: sum(rate(django_http_responses_total_by_status[30s])) by (job,status)
|
||||
- record: job:django_http_responses_total_by_status_name_method:sum_rate30s
|
||||
expr: sum(rate(django_http_responses_total_by_status_name_method[30s])) by (job,status,name,method)
|
||||
- record: job:django_http_responses_total_by_charset:sum_rate30s
|
||||
expr: sum(rate(django_http_responses_total_by_charset[30s])) by (job,charset)
|
||||
- record: job:django_http_exceptions_total_by_type:sum_rate30s
|
||||
expr: sum(rate(django_http_exceptions_total_by_type[30s])) by (job,type)
|
||||
- record: job:django_http_exceptions_total_by_view:sum_rate30s
|
||||
expr: sum(rate(django_http_exceptions_total_by_view[30s])) by (job,view)
|
||||
- name: Aggregate latency histograms
|
||||
rules:
|
||||
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.50, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
|
||||
labels:
|
||||
quantile: "50"
|
||||
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
|
||||
labels:
|
||||
quantile: "95"
|
||||
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
|
||||
labels:
|
||||
quantile: "99"
|
||||
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
|
||||
labels:
|
||||
quantile: "99.9"
|
||||
- record: job:django_http_requests_latency_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.50, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
|
||||
labels:
|
||||
quantile: "50"
|
||||
- record: job:django_http_requests_latency_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
|
||||
labels:
|
||||
quantile: "95"
|
||||
- record: job:django_http_requests_latency_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
|
||||
labels:
|
||||
quantile: "99"
|
||||
- record: job:django_http_requests_latency_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
|
||||
labels:
|
||||
quantile: "99.9"
|
||||
- name: Aggregate model operations
|
||||
rules:
|
||||
- record: job:django_model_inserts_total:sum_rate1m
|
||||
expr: sum(rate(django_model_inserts_total[1m])) by (job, model)
|
||||
- record: job:django_model_updates_total:sum_rate1m
|
||||
expr: sum(rate(django_model_updates_total[1m])) by (job, model)
|
||||
- record: job:django_model_deletes_total:sum_rate1m
|
||||
expr: sum(rate(django_model_deletes_total[1m])) by (job, model)
|
||||
- name: Aggregate database operations
|
||||
rules:
|
||||
- record: job:django_db_new_connections_total:sum_rate30s
|
||||
expr: sum(rate(django_db_new_connections_total[30s])) by (alias, vendor)
|
||||
- record: job:django_db_new_connection_errors_total:sum_rate30s
|
||||
expr: sum(rate(django_db_new_connection_errors_total[30s])) by (alias, vendor)
|
||||
- record: job:django_db_execute_total:sum_rate30s
|
||||
expr: sum(rate(django_db_execute_total[30s])) by (alias, vendor)
|
||||
- record: job:django_db_execute_many_total:sum_rate30s
|
||||
expr: sum(rate(django_db_execute_many_total[30s])) by (alias, vendor)
|
||||
- record: job:django_db_errors_total:sum_rate30s
|
||||
expr: sum(rate(django_db_errors_total[30s])) by (alias, vendor, type)
|
||||
- name: Aggregate migrations
|
||||
rules:
|
||||
- record: job:django_migrations_applied_total:max
|
||||
expr: max(django_migrations_applied_total) by (job, connection)
|
||||
- record: job:django_migrations_unapplied_total:max
|
||||
expr: max(django_migrations_unapplied_total) by (job, connection)
|
||||
- name: Alerts
|
||||
rules:
|
||||
- alert: UnappliedMigrations
|
||||
expr: job:django_migrations_unapplied_total:max > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: testing
|
||||
annotations:
|
||||
summary: "Unapplied django migrations on {{$labels.connection}}"
|
||||
description: "Django detected {{$value}} unapplied migrations on database {{$labels.connection}}"
|
||||
{{- end }}
|
|
@ -1,3 +1,4 @@
|
|||
{{- if .Values.monitoring.enabled -}}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
|
@ -13,3 +14,4 @@ spec:
|
|||
selector:
|
||||
matchLabels:
|
||||
k8s.passbook.io/component: static
|
||||
{{- end }}
|
||||
|
|
|
@ -9,9 +9,9 @@ metadata:
|
|||
helm.sh/chart: {{ include "passbook.chart" . }}
|
||||
k8s.passbook.io/component: web
|
||||
spec:
|
||||
type: {{ .Values.service.type }}
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- port: {{ .Values.service.port }}
|
||||
- port: 80
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
name: http
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
{{- if .Values.monitoring.enabled -}}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
|
@ -17,6 +18,8 @@ spec:
|
|||
name: {{ include "passbook.fullname" . }}-secret-key
|
||||
key: monitoring_username
|
||||
port: http
|
||||
interval: 10s
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s.passbook.io/component: web
|
||||
{{- end }}
|
||||
|
|
|
@ -14,19 +14,10 @@ config:
|
|||
email:
|
||||
host: localhost
|
||||
|
||||
postgresql:
|
||||
postgresqlDatabase: passbook
|
||||
|
||||
redis:
|
||||
cluster:
|
||||
enabled: false
|
||||
master:
|
||||
persistence:
|
||||
enabled: false
|
||||
|
||||
service:
|
||||
type: ClusterIP
|
||||
port: 80
|
||||
# This Helm chart ships with built-in Prometheus ServiceMonitors and Rules.
|
||||
# This requires the CoreOS Prometheus Operator.
|
||||
monitoring:
|
||||
enabled: false
|
||||
|
||||
ingress:
|
||||
enabled: false
|
||||
|
@ -40,3 +31,14 @@ ingress:
|
|||
# - secretName: chart-example-tls
|
||||
# hosts:
|
||||
# - passbook.k8s.local
|
||||
|
||||
# These settings configure the packaged PostgreSQL and Redis chart.
|
||||
postgresql:
|
||||
postgresqlDatabase: passbook
|
||||
|
||||
redis:
|
||||
cluster:
|
||||
enabled: false
|
||||
master:
|
||||
persistence:
|
||||
enabled: false
|
||||
|
|
Reference in a new issue