feat: add alertmanager and grafana

2026-05-12 19:51:27 +01:00 · 2026-05-12 19:51:27 +01:00 · f4680f2072
commit f4680f2072
parent 4816cce305
10 changed files with 428 additions and 11 deletions
--- a/roles/podman_prometheus/files/home/podman/alert.rules.yml
+++ b/roles/podman_prometheus/files/home/podman/alert.rules.yml
@ -0,0 +1,302 @@
 # Prometheus alerting rules; every rule below belongs to the single
 # node_exporter_alerts group.
 groups:
 - name: node_exporter_alerts
  rules:
  # Fires once a scrape target has reported up == 0 for two minutes.
  - alert: Node down
    for: 2m
    expr: up == 0
    labels:
      severity: warning
    annotations:
      description: 'Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down.'
      title: 'Node {{ $labels.instance }} is down'
  # Fires when less than 10% of total memory has been available for 2 minutes.
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of memory (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}"
  # Fires when the 1m rate of major page faults stays above 1000/s for 2 minutes.
  - alert: HostMemoryUnderMemoryPressure
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host memory under memory pressure (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}"
  # Fires when an instance's summed receive rate (2m window) exceeds 100 MiB/s for 5 minutes.
  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput in (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
  # Fires when an instance's summed transmit rate (2m window) exceeds 100 MiB/s for 5 minutes.
  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput out (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
  # Fires when an instance's summed disk read rate (2m window) exceeds 50 MiB/s for 5 minutes.
  - alert: HostUnusualDiskReadRate
    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk read rate (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
  # TODO: Debug and reduce limit to 50
  # Fires when an instance's summed disk write rate (2m window) exceeds 65 MiB/s for 2 minutes.
  - alert: HostUnusualDiskWriteRate
    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 65
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk write rate (instance {{ $labels.instance }})
      # Text now states the threshold the expression actually uses (65, not 50);
      # double-quoted so that \n becomes a real line break instead of literal text.
      description: "Disk is probably writing too much data (> 65 MB/s)\n  VALUE = {{ $value }}"
  # Please add ignored mountpoints in node_exporter parameters like
  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
  # Fires when a writable filesystem has had less than 10% space available for 2 minutes.
  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of disk space (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}"
  # Please add ignored mountpoints in node_exporter parameters like
  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
  # Fires when a writable, non-tmpfs filesystem is below 10% available and a linear
  # extrapolation of the last hour predicts it reaches zero within 24 hours.
  - alert: HostDiskWillFillIn24Hours
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ $value }}"
  # Fires when the writable /rootfs mountpoint has had fewer than 10% free inodes for 2 minutes.
  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of inodes (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}"
  # Fires when /rootfs is below 10% free inodes and a linear extrapolation of the
  # last hour predicts the free-inode count reaches zero within 24 hours.
  - alert: HostInodesWillFillIn24Hours
    expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}"
  # Fires when the average time per completed read (1m window) exceeds 0.1s for 2 minutes;
  # the trailing "> 0" clause skips disks with no completed reads.
  - alert: HostUnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk read latency (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}"
  # Fires when the average time per completed write (1m window) exceeds 0.1s for 2 minutes
  # on any device not matching mmcblk.+; the trailing "> 0" clause skips idle disks.
  - alert: HostUnusualDiskWriteLatency
    # Metric name corrected from "..._totali": the misspelt series does not exist,
    # so the rule could never fire.
    expr: rate(node_disk_write_time_seconds_total{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk write latency (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}"
  # Fires immediately (for: 0m) when an instance's average non-idle CPU share
  # over a 2m window exceeds 80%.
  - alert: HostHighCpuLoad
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host high CPU load (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "CPU load is > 80%\n  VALUE = {{ $value }}"
  # Fires immediately (for: 0m) when an instance's average CPU steal share
  # over a 5m window exceeds 10%.
  - alert: HostCpuStealNoisyNeighbor
    expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}"
 #  TODO: Increase size of monitor instance
 #  # 1000 context switches is an arbitrary number.
 #  # Alert threshold depends on nature of application.
 #  # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
 #  - alert: HostContextSwitching
 #    expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
 #    for: 0m
 #    labels:
 #      severity: warning
 #    annotations:
 #      summary: Host context switching (instance {{ $labels.instance }})
 #      description: Context switching is growing on node (> 1000 / s)\n  VALUE = {{ $value }}
  # Fires when more than 80% of swap has been in use for 2 minutes.
  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host swap is filling up (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}"
  # Fires immediately (for: 0m) when any systemd unit reports state="failed".
  - alert: HostSystemdServiceCrashed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host SystemD service crashed (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "SystemD service crashed\n  VALUE = {{ $value }}"
  # Fires when any hwmon temperature sensor has read above 75 for 5 minutes.
  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 75
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host physical component too hot (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Physical hardware component too hot\n  VALUE = {{ $value }}"
  # Fires immediately (for: 0m), at critical severity, when a hwmon critical
  # temperature alarm series equals 1.
  - alert: HostNodeOvertemperatureAlarm
    expr: node_hwmon_temp_crit_alarm_celsius == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host node overtemperature alarm (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}"
  # Fires immediately (for: 0m), at critical severity, when an md array reports
  # state="inactive".
  - alert: HostRaidArrayGotInactive
    expr: node_md_state{state="inactive"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host RAID array got inactive (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n  VALUE = {{ $value }}"
  # Fires when an md array has reported at least one failed disk for 2 minutes.
  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host RAID disk failure (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}"
 #  TODO: We have mix of Debian/Rocky/Alma systems
 #  - alert: HostKernelVersionDeviations
 #    expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
 #    for: 6h
 #    labels:
 #      severity: warning
 #    annotations:
 #      summary: Host kernel version deviations (instance {{ $labels.instance }})
 #      description: Different kernel versions are running\n  VALUE = {{ $value }}
  # Fires immediately (for: 0m) when the OOM-kill counter increased within the last minute.
  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host OOM kill detected (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "OOM kill detected\n  VALUE = {{ $value }}"
  # Fires immediately (for: 0m), at info severity, when the EDAC correctable-error
  # counter increased within the last minute.
  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
      # Time span now matches the [1m] window of the expression (was "5 minutes");
      # double-quoted so that \n becomes a real line break, hence the escaped inner quotes.
      description: "Instance has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n  VALUE = {{ $value }}"
  # Fires immediately (for: 0m) whenever the EDAC uncorrectable-error counter is non-zero.
  - alert: HostEdacUncorrectableErrorsDetected
    expr: node_edac_uncorrectable_errors_total > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
      # The expression reads the raw counter, not a 5-minute window, so the text
      # reports the running total. Double-quoted so that \n becomes a real line
      # break, hence the escaped inner quotes.
      description: "Instance has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in total (current counter value).\n  VALUE = {{ $value }}"
  # Fires when more than 1% of received packets (2m rates) have been errors for 2 minutes.
  - alert: HostNetworkReceiveErrors
    expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }})
      # $value is an error ratio, not an error count: the previous "%.0f ... errors
      # in the last five minutes" text rendered e.g. 0.02 as "0 receive errors".
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}"
  # Fires when more than 1% of transmitted packets (2m rates) have been errors for 2 minutes.
  - alert: HostNetworkTransmitErrors
    expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }})
      # $value is an error ratio, not an error count: the previous "%.0f ... errors
      # in the last five minutes" text rendered e.g. 0.02 as "0 transmit errors".
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}"
  # Fires when combined receive + transmit throughput (1m rates) of a non-tap
  # device has exceeded 80% of its reported link speed for 1 minute.
  - alert: HostNetworkInterfaceSaturated
    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
    for: 1m
    labels:
      severity: warning
    annotations:
      # The series carry a "device" label (see the matchers above); "interface"
      # does not exist and rendered as an empty string.
      summary: Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.device }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "The network interface is getting overloaded.\n  VALUE = {{ $value }}"
  # Fires when conntrack entries have exceeded 80% of the configured limit for 5 minutes.
  - alert: HostConntrackLimit
    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host conntrack limit (instance {{ $labels.instance }})
      # Typo "approching" corrected; double-quoted so that \n becomes a real line break.
      description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}"
  # Fires when the clock offset has been beyond +/-0.05s and not moving back
  # towards zero (per the 5m derivative) for 2 minutes.
  - alert: HostClockSkew
    expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock skew (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Clock skew detected. Clock is out of sync.\n  VALUE = {{ $value }}"
  # Fires when the sync status has been 0 for the whole last minute and the
  # maximum clock error is at least 16s, sustained for 2 minutes.
  - alert: HostClockNotSynchronising
    expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock not synchronising (instance {{ $labels.instance }})
      # Double-quoted so that \n becomes a real line break instead of literal text.
      description: "Clock not synchronising.\n  VALUE = {{ $value }}"
--- a/roles/podman_prometheus/handlers/main.yml
+++ b/roles/podman_prometheus/handlers/main.yml
@ -1,4 +1,20 @@
 ---
 # Restarts the user-scope Alertmanager unit as the rootless Podman user.
 - name: Restart Alertmanager
  ansible.builtin.systemd_service:
    # Must target the alertmanager unit; this handler previously restarted
    # grafana, so Alertmanager never picked up configuration changes.
    name: alertmanager
    scope: user
    state: restarted
  become: true
  become_user: "{{ podman_prometheus_podman_rootless_user }}"
 # Restarts the user-scope grafana unit as the rootless Podman user.
 - name: Restart Grafana
  become: true
  become_user: "{{ podman_prometheus_podman_rootless_user }}"
  ansible.builtin.systemd_service:
    state: restarted
    scope: user
    name: grafana
 - name: Restart Prometheus
  ansible.builtin.systemd_service:
    name: prometheus
@ -6,3 +22,12 @@
    state: restarted
  become: true
  become_user: "{{ podman_prometheus_podman_rootless_user }}"
 # Reloads the user systemd manager, then restarts the user-scope nginx unit
 # as the rootless Podman user.
 - name: Restart nginx
  become: true
  become_user: "{{ podman_prometheus_podman_rootless_user }}"
  ansible.builtin.systemd_service:
    daemon_reload: true
    scope: user
    name: nginx
    state: restarted
--- a/roles/podman_prometheus/tasks/main.yml
+++ b/roles/podman_prometheus/tasks/main.yml
@ -55,14 +55,51 @@
 # Prometheus runs with UID/GID 65534 inside the container
 # NOTE(review): the "+ 65533" offset presumably maps container UID/GID 65534
 # into the rootless user's sub-UID/sub-GID range — confirm against /etc/subuid.
 # Renders each listed template into the rootless user's home directory,
 # read-only for that mapped owner, and restarts Prometheus on change.
 - name: Podman Prometheus | PATCH | Install Prometheus configuration
  ansible.builtin.template:
-    src: home/podman/prometheus.yml
+    src: "home/podman/{{ item }}"
-    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/prometheus.yml"
+    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/{{ item }}"
    mode: "0400"
    owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}"
    group: "{{ _podman_prometheus_user_subgid_start + 65533 }}"
  become: true
  with_items:
    - prometheus.yml
  notify:
    - Restart Prometheus
 # Copies the static alert rule file verbatim (copy, not template) into the
 # rootless user's home directory and restarts Prometheus on change.
 - name: Podman Prometheus | PATCH | Install Prometheus alert rules
  become: true
  ansible.builtin.copy:
    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/{{ item }}"
    src: "home/podman/{{ item }}"
    owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}"
    group: "{{ _podman_prometheus_user_subgid_start + 65533 }}"
    mode: "0400"
  with_items:
    - alert.rules.yml
  notify:
    - Restart Prometheus
 # Alertmanager runs with UID/GID 65534 inside the container
 # Renders alertmanager.yml into the rootless user's home directory, read-only
 # for the mapped owner, and restarts Alertmanager on change.
 - name: Podman Prometheus | PATCH | Install Alertmanager configuration
  ansible.builtin.template:
    src: home/podman/alertmanager.yml
    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/alertmanager.yml"
    mode: "0400"
    owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}"
    group: "{{ _podman_prometheus_user_subgid_start + 65533 }}"
  become: true
  notify:
-    - Restart Prometheus
+    - Restart Alertmanager
 # Grafana runs with UID/GID 472 inside the container
 # Creates the grafana-data directory, accessible only to that mapped owner.
 - name: Podman Prometheus | PATCH | Create data directory for Grafana
  become: true
  ansible.builtin.file:
    state: directory
    path: "/home/{{ podman_prometheus_podman_rootless_user }}/grafana-data"
    mode: "0700"
    owner: "{{ _podman_prometheus_user_subuid_start + 471 }}"
    group: "{{ _podman_prometheus_user_subgid_start + 471 }}"
 # Installs the quadlet unit file for each listed container.
 - name: Podman Prometheus | PATCH | Install container quadlets
  ansible.builtin.template:
@ -71,9 +108,12 @@
    owner: "{{ podman_prometheus_podman_rootless_user }}"
    mode: "0400"
  with_items:
    - alertmanager.container
    - grafana.container
    - prometheus.container
  become: true
  # Every container whose quadlet is installed above needs a restart on change;
  # Alertmanager was missing from this list.
  notify:
    - Restart Alertmanager
    - Restart Grafana
    - Restart Prometheus
 - name: Podman Prometheus | PATCH | Install network quadlets
@ -84,8 +124,11 @@
    mode: "0400"
  with_items:
    - frontend.network
    - monitor.network
  become: true
  notify:
    - Restart Alertmanager
    - Restart Grafana
    - Restart Prometheus
    - Restart nginx
@ -122,7 +165,7 @@
  notify:
    - Restart nginx
# Enables the listed user-scope services; the task name now mentions every
# service in the loop, including Alertmanager.
- name: Podman Prometheus | PATCH | Make sure Prometheus and Nginx are running now and started on boot
+- name: Podman Prometheus | PATCH | Make sure Alertmanager, Grafana, Nginx and Prometheus are running now and started on boot
  ansible.builtin.systemd_service:
    name: "{{ item }}.service"
    enabled: true
@ -131,6 +174,8 @@
    daemon_reload: true
    scope: user
  with_items:
    - alertmanager
    - grafana
    - nginx
    - prometheus
  become: true
--- a/roles/podman_prometheus/templates/home/podman/alertmanager.yml
+++ b/roles/podman_prometheus/templates/home/podman/alertmanager.yml
@ -0,0 +1 @@
 {{ podman_prometheus_alertmanager_config | to_nice_yaml( width=50, explicit_start=True, explicit_end=True) }}
--- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/alertmanager.container
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/alertmanager.container
@ -0,0 +1,11 @@
 # Quadlet unit for the Alertmanager container, attached to monitor.network only.
 [Container]
 ContainerName=alertmanager
 Image=quay.io/prometheus/alertmanager:v0.31.1
 Network=monitor.network
 # Configuration rendered by the Ansible role, mounted read-only.
 # NOTE(review): no data volume is mounted, so Alertmanager state presumably
 # does not survive container re-creation — confirm this is intended.
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro,Z
 [Service]
 Restart=on-failure
 [Install]
 WantedBy=default.target
--- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/grafana.container
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/grafana.container
@ -0,0 +1,14 @@
 # Quadlet unit for the Grafana container, attached to both frontend.network
 # and monitor.network.
 [Container]
 ContainerName=grafana
 # NOTE(review): the image carries no tag, unlike the version-pinned Prometheus
 # and Alertmanager images — consider pinning a release for reproducible deploys.
 Image=docker.io/grafana/grafana
 Environment=GF_SERVER_DOMAIN={{ inventory_hostname }}
 # NOTE(review): "%%" is presumably the systemd escape for a literal "%", so
 # Grafana receives its own %(domain)s placeholder — confirm.
 Environment=GF_SERVER_ROOT_URL=https://%%(domain)s/
 Network=frontend.network
 Network=monitor.network
 # Persistent Grafana data directory, mounted read-write.
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/grafana-data:/var/lib/grafana:rw,Z
 [Service]
 Restart=on-failure
 [Install]
 WantedBy=default.target
--- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/monitor.network
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/monitor.network
@ -0,0 +1,2 @@
 # Quadlet network referenced as Network=monitor.network by the container units.
 [Network]
 # NOTE(review): the Podman network is literally named "network"; confirm this
 # generic name is intended rather than e.g. "monitor".
 NetworkName=network
--- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container
@ -1,10 +1,11 @@
 [Container]
 ContainerName=prometheus
-Image=quay.io/prometheus/prometheus:v3.8.1
+Image=quay.io/prometheus/prometheus:v3.9.1
-Network=frontend.network
+Network=monitor.network
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/prometheus-data:/prometheus:rw,Z
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/file-configs:/file-configs:ro,Z
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/prometheus.yml:/etc/prometheus/prometheus.yml:ro,Z
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/alert.rules.yml:/etc/alertmanager/alert.rules.yml:ro,Z
 [Service]
 Restart=on-failure
--- a/roles/podman_prometheus/templates/home/podman/nginx.conf
+++ b/roles/podman_prometheus/templates/home/podman/nginx.conf
@ -21,9 +21,9 @@ server {
    }
 }
-upstream prometheus {
+upstream grafana {
-    zone prometheus_upstream 64k;
+    zone grafana_upstream 64k;
-	server prometheus:9090 resolve;
+	server grafana:3000 resolve;
 }
 server {
@ -37,7 +37,7 @@ server {
    ssl_certificate_key /etc/letsencrypt/live/{{ inventory_hostname }}/privkey.pem;
 	add_header Strict-Transport-Security "max-age=31536000" always;
-	add_header Referrer-Policy origin always;  # make sure outgoing links don't show the URL to the Prometheus instance
+	add_header Referrer-Policy origin always;  # make sure outgoing links don't show the URL
    add_header X-Content-Type-Options "nosniff" always;
    add_header X-XSS-Protection "1; mode=block" always;
@ -47,6 +47,6 @@ server {
    proxy_set_header X-Forwarded-Proto $scheme;
    proxy_read_timeout 180;
-    proxy_pass http://prometheus;
+    proxy_pass http://grafana;
  }
 }
--- a/roles/podman_prometheus/templates/home/podman/prometheus.yml
+++ b/roles/podman_prometheus/templates/home/podman/prometheus.yml
@ -8,6 +8,9 @@ scrape_configs:
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9090']
  # Scrapes the alertmanager container on port 9093.
  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager:9093']
  - job_name: 'node'
    scrape_interval: 5s
    scheme: https
@ -25,9 +28,22 @@ scrape_configs:
 {% for host in groups['keycloak'] %}
        - '{{ host }}:9100'
 {% endfor %}
 {% for host in groups['radius'] %}
        - '{{ host }}:9100'
 {% endfor %}
 {% for host in groups['generic'] %}
        - '{{ host }}:9100'
 {% endfor %}
    file_sd_configs:
      - files:
          - "/file-configs/*.yml"
 # Sends alerts over plain HTTP to the alertmanager container on port 9093.
 alerting:
  alertmanagers:
  - scheme: http
    static_configs:
    - targets:
      - "alertmanager:9093"
 # Path inside the Prometheus container; it matches the alert.rules.yml volume
 # mount declared in prometheus.container.
 rule_files:
  - "/etc/alertmanager/alert.rules.yml"
		`@ -0,0 +1 @@`
							`{{ podman_prometheus_alertmanager_config \| to_nice_yaml( width=50, explicit_start=True, explicit_end=True) }}`