| |
@@ -0,0 +1,57 @@
|
| |
# PrometheusRule defining failure alerts for this namespace:
# failed Jobs, failed Builds, and Pods that are pending, restarting,
# crash-looping, or OOM-killed. Metrics come from kube-state-metrics
# and the OpenShift build metrics exporter.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: alerts
spec:
  groups:
    - name: jobFailed
      rules:
        - alert: JobFailed
          annotations:
            description: Job {{$labels.namespace}}/{{$labels.job_name}} has failed.
            summary: At least one job has failed.
          expr: kube_job_failed > 0
          labels:
            severity: warning
    - name: BuildFailed
      rules:
        - alert: BuildFailed
          annotations:
            description: Build {{$labels.namespace}}/{{$labels.buildconfig}} ({{$labels.build}}) has failed.
            summary: Build {{$labels.buildconfig}} has failed.
          expr: openshift_build_status_phase_total{build_phase="failed"} > 0
          labels:
            severity: warning
    - name: PodFailing
      rules:
        - alert: PodPending
          annotations:
            description: Pod {{$labels.namespace}}/{{$labels.pod}} is in pending state for more than 10m.
            summary: Pod {{$labels.pod}} is in pending state.
          expr: kube_pod_status_phase{phase="Pending"} > 0
          # Only fire once the pod has stayed Pending for 10 minutes.
          for: 10m
          labels:
            severity: warning
        - alert: PodRestarted
          annotations:
            description: Container {{$labels.container}} in Pod {{$labels.namespace}}/{{$labels.pod}} has restarted.
            summary: Containers in pod {{$labels.pod}} has restarted.
          # rate() over 10m scaled back to a 10-minute window: fires on
          # any restart observed in the last 10 minutes.
          expr: rate(kube_pod_container_status_restarts_total[10m]) * 60 * 10 > 0
          labels:
            severity: warning
        - alert: PodCrashLoop
          annotations:
            description: Container {{$labels.container}} in Pod {{$labels.namespace}}/{{$labels.pod}} has restarted {{ printf "%.2f" $value }} in the last 15 minutes.
            summary: Pod {{$labels.pod}} is in CrashLoop state.
          # More than 2 restarts within 15 minutes, sustained for 15 minutes.
          expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 2
          for: 15m
          labels:
            severity: warning
        - alert: PodOOMKilled
          annotations:
            description: Container {{$labels.container}} in Pod {{$labels.namespace}}/{{$labels.pod}} ran out of memory and has been killed.
            summary: Containers in pod {{$labels.pod}} has been OOMKilled.
          # Matches pods whose last container termination reason was OOMKilled.
          expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0
          labels:
            severity: warning
| |
This PR sets up an Alertmanager email receiver (AlertmanagerConfig) in user namespaces using the app owners' fedoraproject.org email addresses.
It also adds a few Prometheus rules for the websites namespace to detect failed pods, jobs, and builds.
There are 2 requirements needed for this to work, including the
`cluster-monitoring-config` config map in the `openshift-monitoring`
namespace.

Related: https://pagure.io/fedora-infrastructure/issue/10671