From f4680f2072a0ac3d09caab9e6bfc895ef940cf37 Mon Sep 17 00:00:00 2001
From: irl <iain@learmonth.me>
Date: Tue, 12 May 2026 19:51:27 +0100
Subject: [PATCH] feat: add alertmanager and grafana

---
 .../files/home/podman/alert.rules.yml         | 302 ++++++++++++++++++
 roles/podman_prometheus/handlers/main.yml     |  25 ++
 roles/podman_prometheus/tasks/main.yml        |  53 ++-
 .../templates/home/podman/alertmanager.yml    |   1 +
 .../containers/systemd/alertmanager.container |  11 +
 .../containers/systemd/grafana.container      |  14 +
 .../config/containers/systemd/monitor.network |   2 +
 .../containers/systemd/prometheus.container   |   5 +-
 .../templates/home/podman/nginx.conf          |  10 +-
 .../templates/home/podman/prometheus.yml      |  16 +
 10 files changed, 428 insertions(+), 11 deletions(-)
 create mode 100644 roles/podman_prometheus/files/home/podman/alert.rules.yml
 create mode 100644 roles/podman_prometheus/templates/home/podman/alertmanager.yml
 create mode 100644 roles/podman_prometheus/templates/home/podman/config/containers/systemd/alertmanager.container
 create mode 100644 roles/podman_prometheus/templates/home/podman/config/containers/systemd/grafana.container
 create mode 100644 roles/podman_prometheus/templates/home/podman/config/containers/systemd/monitor.network

diff --git a/roles/podman_prometheus/files/home/podman/alert.rules.yml b/roles/podman_prometheus/files/home/podman/alert.rules.yml
new file mode 100644
index 0000000..381af39
--- /dev/null
+++ b/roles/podman_prometheus/files/home/podman/alert.rules.yml
@@ -0,0 +1,302 @@
+groups:
+- name: node_exporter_alerts
+  rules:
+  - alert: "Node down"  # any scrape target reporting up == 0 for 2 minutes
+    expr: "up == 0"
+    for: "2m"
+    labels:
+      severity: "warning"
+    annotations:
+      title: "Node {{ $labels.instance }} is down"
+      description: "Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down."
+
+  - alert: HostOutOfMemory  # less than 10% of memory available for 2 minutes
+    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host out of memory (instance {{ $labels.instance }})
+      description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostMemoryUnderMemoryPressure  # more than 1000 major page faults per second for 2 minutes
+    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host memory under memory pressure (instance {{ $labels.instance }})
+      description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostUnusualNetworkThroughputIn  # per-instance receive rate above 100 MiB/s for 5 minutes
+    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual network throughput in (instance {{ $labels.instance }})
+      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostUnusualNetworkThroughputOut  # per-instance transmit rate above 100 MiB/s for 5 minutes
+    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual network throughput out (instance {{ $labels.instance }})
+      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostUnusualDiskReadRate  # per-instance disk read rate above 50 MiB/s for 5 minutes
+    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual disk read rate (instance {{ $labels.instance }})
+      description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  # TODO: Debug and reduce limit to 50
+  - alert: HostUnusualDiskWriteRate  # per-instance disk write rate above 65 MiB/s for 2 minutes
+    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 65
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual disk write rate (instance {{ $labels.instance }})
+      description: "Disk is probably writing too much data (> 65 MB/s)\n  VALUE = {{ $value }}"  # text matches the 65 threshold in expr
+
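+  # Please exclude pseudo-filesystem mountpoints in the node_exporter parameters, e.g.
+  # "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|run)($|/)" (formerly "--collector.filesystem.ignored-mount-points").
+  # The same rule written with "node_filesystem_free_bytes" would only fire once the disk is also full for root, not just for non-root users.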
+  - alert: HostOutOfDiskSpace  # writable filesystem with less than 10% space available for 2 minutes
+    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host out of disk space (instance {{ $labels.instance }})
+      description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
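+  # Please exclude pseudo-filesystem mountpoints in the node_exporter parameters, e.g.
+  # "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|run)($|/)" (formerly "--collector.filesystem.ignored-mount-points").
+  # The same rule written with "node_filesystem_free_bytes" would only fire once the disk is also full for root, not just for non-root users.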
+  - alert: HostDiskWillFillIn24Hours  # under 10% space left and the 1h trend predicts zero within 24h
+    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+      description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostOutOfInodes  # NOTE(review): only matches series with mountpoint="/rootfs" - confirm the exporters report that path
+    expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host out of inodes (instance {{ $labels.instance }})
+      description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostInodesWillFillIn24Hours  # NOTE(review): only matches series with mountpoint="/rootfs" - confirm the exporters report that path
+    expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
+      description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostUnusualDiskReadLatency  # average time per completed read above 0.1s for 2 minutes
+    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual disk read latency (instance {{ $labels.instance }})
+      description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostUnusualDiskWriteLatency  # average time per completed write above 0.1s for 2 minutes, mmcblk devices excluded
+    expr: rate(node_disk_write_time_seconds_total{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual disk write latency (instance {{ $labels.instance }})
+      description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostHighCpuLoad  # average non-idle CPU above 80%, fires immediately (for: 0m)
+    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host high CPU load (instance {{ $labels.instance }})
+      description: "CPU load is > 80%\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostCpuStealNoisyNeighbor  # average CPU steal above 10%, fires immediately (for: 0m)
+    expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+      description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+#  TODO: Increase size of monitor instance
+#  # 1000 context switches is an arbitrary number.
+#  # Alert threshold depends on nature of application.
+#  # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
+#  - alert: HostContextSwitching
+#    expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
+#    for: 0m
+#    labels:
+#      severity: warning
+#    annotations:
+#      summary: Host context switching (instance {{ $labels.instance }})
+#      description: Context switching is growing on node (> 1000 / s)\n  VALUE = {{ $value }}
+
+  - alert: HostSwapIsFillingUp  # more than 80% of swap in use for 2 minutes
+    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host swap is filling up (instance {{ $labels.instance }})
+      description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostSystemdServiceCrashed  # any systemd unit reported in the failed state, fires immediately
+    expr: node_systemd_unit_state{state="failed"} == 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host SystemD service crashed (instance {{ $labels.instance }})
+      description: "SystemD service crashed\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostPhysicalComponentTooHot  # any hwmon temperature sensor above 75 for 5 minutes
+    expr: node_hwmon_temp_celsius > 75
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host physical component too hot (instance {{ $labels.instance }})
+      description: "Physical hardware component too hot\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostNodeOvertemperatureAlarm  # hwmon critical temperature alarm raised, fires immediately
+    expr: node_hwmon_temp_crit_alarm_celsius == 1
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+      description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostRaidArrayGotInactive  # an md array reports the inactive state, fires immediately
+    expr: node_md_state{state="inactive"} > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Host RAID array got inactive (instance {{ $labels.instance }})
+      description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostRaidDiskFailure  # an md array reports at least one failed member disk for 2 minutes
+    expr: node_md_disks{state="failed"} > 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host RAID disk failure (instance {{ $labels.instance }})
+      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}"  # array name is carried in the "device" label
+
+#  TODO: We have mix of Debian/Rocky/Alma systems
+#  - alert: HostKernelVersionDeviations
+#    expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
+#    for: 6h
+#    labels:
+#      severity: warning
+#    annotations:
+#      summary: Host kernel version deviations (instance {{ $labels.instance }})
+#      description: Different kernel versions are running\n  VALUE = {{ $value }}
+
+  - alert: HostOomKillDetected  # OOM-kill counter increased within the last minute, fires immediately
+    expr: increase(node_vmstat_oom_kill[1m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host OOM kill detected (instance {{ $labels.instance }})
+      description: "OOM kill detected\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostEdacCorrectableErrorsDetected  # correctable-error counter increased within the last minute
+    expr: increase(node_edac_correctable_errors_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: info
+    annotations:
+      summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+      description: "Instance has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n  VALUE = {{ $value }}"  # window matches the [1m] range in expr
+
+  - alert: HostEdacUncorrectableErrorsDetected  # uncorrectable-error counter is non-zero (no time window)
+    expr: node_edac_uncorrectable_errors_total > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+      description: "Instance has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n  VALUE = {{ $value }}"  # expr reads the raw counter, so no window is claimed
+
+  - alert: HostNetworkReceiveErrors  # more than 1% of received packets errored over 2 minutes
+    expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }})
+      description: "Instance interface has a receive error ratio of {{ printf \"%.2f\" $value }} over the last two minutes.\n  VALUE = {{ $value }}"  # value is a ratio, not an error count
+
+  - alert: HostNetworkTransmitErrors  # more than 1% of transmitted packets errored over 2 minutes
+    expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }})
+      description: "Instance interface has a transmit error ratio of {{ printf \"%.2f\" $value }} over the last two minutes.\n  VALUE = {{ $value }}"  # value is a ratio, not an error count
+
+  - alert: HostNetworkInterfaceSaturated  # combined rx+tx rate above 80% of link speed for 1 minute, tap devices excluded
+    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.device }})
+      description: "The network interface is getting overloaded.\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostConntrackLimit  # conntrack table more than 80% full for 5 minutes
+    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host conntrack limit (instance {{ $labels.instance }})
+      description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostClockSkew  # offset beyond +/-0.05s and not shrinking, for 2 minutes
+    expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host clock skew (instance {{ $labels.instance }})
+      description: "Clock skew detected. Clock is out of sync.\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
+
+  - alert: HostClockNotSynchronising  # sync status was 0 within the last minute and max error is at least 16s
+    expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host clock not synchronising (instance {{ $labels.instance }})
+      description: "Clock not synchronising.\n  VALUE = {{ $value }}"  # double-quoted so \n is a line break
diff --git a/roles/podman_prometheus/handlers/main.yml b/roles/podman_prometheus/handlers/main.yml
index 7165847..91c6ade 100644
--- a/roles/podman_prometheus/handlers/main.yml
+++ b/roles/podman_prometheus/handlers/main.yml
@@ -1,4 +1,20 @@
 ---
+- name: Restart Alertmanager
+  ansible.builtin.systemd_service:
+    name: alertmanager  # was "grafana": the handler must restart the alertmanager unit
+    scope: user  # unit lives in the rootless user's systemd instance
+    state: restarted
+  become: true
+  become_user: "{{ podman_prometheus_podman_rootless_user }}"
+
+- name: Restart Grafana
+  become: true
+  become_user: "{{ podman_prometheus_podman_rootless_user }}"
+  ansible.builtin.systemd_service:
+    state: restarted
+    scope: user  # unit lives in the rootless user's systemd instance
+    name: grafana
+
 - name: Restart Prometheus
   ansible.builtin.systemd_service:
     name: prometheus
@@ -6,3 +22,12 @@
     state: restarted
   become: true
   become_user: "{{ podman_prometheus_podman_rootless_user }}"
+
+- name: Restart nginx
+  become: true
+  become_user: "{{ podman_prometheus_podman_rootless_user }}"
+  ansible.builtin.systemd_service:
+    daemon_reload: true  # reload unit definitions before restarting
+    scope: user
+    state: restarted
+    name: nginx
diff --git a/roles/podman_prometheus/tasks/main.yml b/roles/podman_prometheus/tasks/main.yml
index 962b535..d3c60e1 100644
--- a/roles/podman_prometheus/tasks/main.yml
+++ b/roles/podman_prometheus/tasks/main.yml
@@ -55,14 +55,51 @@
 # Prometheus runs with UID/GID 65534 inside the container
 - name: Podman Prometheus | PATCH | Install Prometheus configuration
   ansible.builtin.template:
-    src: home/podman/prometheus.yml
-    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/prometheus.yml"
+    src: "home/podman/{{ item }}"
+    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/{{ item }}"
+    owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}"
+    group: "{{ _podman_prometheus_user_subgid_start + 65533 }}"
+    mode: "0400"
+  become: true
+  notify:
+    - Restart Prometheus
+  with_items:  # one entry per templated file under home/podman/
+    - prometheus.yml
+
+- name: Podman Prometheus | PATCH | Install Prometheus alert rules
+  become: true
+  ansible.builtin.copy:  # copied verbatim so the {{ $labels... }} placeholders in the rules are not templated
+    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/{{ item }}"
+    src: "home/podman/{{ item }}"
+    owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}"
+    group: "{{ _podman_prometheus_user_subgid_start + 65533 }}"
+    mode: "0400"
+  with_items:
+    - alert.rules.yml
+  notify:
+    - Restart Prometheus
+
+# Alertmanager runs with UID/GID 65534 inside the container; the +65533 offsets below presumably map that ID into the rootless user's subuid/subgid range (TODO confirm)
+- name: Podman Prometheus | PATCH | Install Alertmanager configuration
+  ansible.builtin.template:
+    src: home/podman/alertmanager.yml
+    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/alertmanager.yml"
     mode: "0400"
     owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}"
     group: "{{ _podman_prometheus_user_subgid_start + 65533 }}"
   become: true
   notify:
-    - Restart Prometheus
+    - Restart Alertmanager
+
+# Grafana runs as UID/GID 472 inside the container, hence the +471 offsets below
+- name: Podman Prometheus | PATCH | Create data directory for Grafana
+  become: true
+  ansible.builtin.file:
+    state: directory
+    path: "/home/{{ podman_prometheus_podman_rootless_user }}/grafana-data"
+    mode: "0700"
+    owner: "{{ _podman_prometheus_user_subuid_start + 471 }}"
+    group: "{{ _podman_prometheus_user_subgid_start + 471 }}"
 
 - name: Podman Prometheus | PATCH | Install container quadlets
   ansible.builtin.template:
@@ -71,9 +108,13 @@
     owner: "{{ podman_prometheus_podman_rootless_user }}"
     mode: "0400"
   with_items:
+    - alertmanager.container
+    - grafana.container
     - prometheus.container
   become: true
   notify:
+    - Restart Alertmanager
+    - Restart Grafana
     - Restart Prometheus
 
 - name: Podman Prometheus | PATCH | Install network quadlets
@@ -84,8 +124,11 @@
     mode: "0400"
   with_items:
     - frontend.network
+    - monitor.network
   become: true
   notify:
+    - Restart Alertmanager
+    - Restart Grafana
     - Restart Prometheus
     - Restart nginx
 
@@ -122,7 +165,7 @@
   notify:
     - Restart nginx
 
-- name: Podman Prometheus | PATCH | Make sure Prometheus and Nginx are running now and started on boot
+- name: Podman Prometheus | PATCH | Make sure Alertmanager, Grafana, Nginx and Prometheus are running now and started on boot
   ansible.builtin.systemd_service:
     name: "{{ item }}.service"
     enabled: true
@@ -131,6 +174,8 @@
     daemon_reload: true
     scope: user
   with_items:
+    - alertmanager
+    - grafana
     - nginx
     - prometheus
   become: true
diff --git a/roles/podman_prometheus/templates/home/podman/alertmanager.yml b/roles/podman_prometheus/templates/home/podman/alertmanager.yml
new file mode 100644
index 0000000..cd7aa1d
--- /dev/null
+++ b/roles/podman_prometheus/templates/home/podman/alertmanager.yml
@@ -0,0 +1 @@
+{{ podman_prometheus_alertmanager_config | to_nice_yaml(explicit_start=True, explicit_end=True, width=50) }}
diff --git a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/alertmanager.container b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/alertmanager.container
new file mode 100644
index 0000000..d9fb543
--- /dev/null
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/alertmanager.container
@@ -0,0 +1,11 @@
+[Container]
+Image=quay.io/prometheus/alertmanager:v0.31.1
+ContainerName=alertmanager
+Volume=/home/{{ podman_prometheus_podman_rootless_user }}/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro,Z
+Network=monitor.network
+
+[Install]
+WantedBy=default.target
+
+[Service]
+Restart=on-failure
diff --git a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/grafana.container b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/grafana.container
new file mode 100644
index 0000000..3a83bfe
--- /dev/null
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/grafana.container
@@ -0,0 +1,17 @@
+[Container]
+ContainerName=grafana
+# NOTE(review): image tag is not pinned, unlike the alertmanager and prometheus
+# quadlets; pin an explicit Grafana version for reproducible deploys.
+Image=docker.io/grafana/grafana
+Environment=GF_SERVER_DOMAIN={{ inventory_hostname }}
+# "%%" presumably escapes systemd specifier expansion so Grafana sees "%(domain)s" - confirm.
+Environment=GF_SERVER_ROOT_URL=https://%%(domain)s/
+Network=frontend.network
+Network=monitor.network
+Volume=/home/{{ podman_prometheus_podman_rootless_user }}/grafana-data:/var/lib/grafana:rw,Z
+
+[Service]
+Restart=on-failure
+
+[Install]
+WantedBy=default.target
diff --git a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/monitor.network b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/monitor.network
new file mode 100644
index 0000000..4029eb4
--- /dev/null
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/monitor.network
@@ -0,0 +1,2 @@
+[Network]
+NetworkName=monitor
diff --git a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container
index b10b545..f5873f1 100644
--- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container
@@ -1,10 +1,11 @@
 [Container]
 ContainerName=prometheus
-Image=quay.io/prometheus/prometheus:v3.8.1
-Network=frontend.network
+Image=quay.io/prometheus/prometheus:v3.9.1
+Network=monitor.network
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/prometheus-data:/prometheus:rw,Z
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/file-configs:/file-configs:ro,Z
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/prometheus.yml:/etc/prometheus/prometheus.yml:ro,Z
+Volume=/home/{{ podman_prometheus_podman_rootless_user }}/alert.rules.yml:/etc/alertmanager/alert.rules.yml:ro,Z
 
 [Service]
 Restart=on-failure
diff --git a/roles/podman_prometheus/templates/home/podman/nginx.conf b/roles/podman_prometheus/templates/home/podman/nginx.conf
index a1a168f..1d1c9ae 100644
--- a/roles/podman_prometheus/templates/home/podman/nginx.conf
+++ b/roles/podman_prometheus/templates/home/podman/nginx.conf
@@ -21,9 +21,9 @@ server {
     }
 }
 
-upstream prometheus {
-    zone prometheus_upstream 64k;
-	server prometheus:9090 resolve;
+upstream grafana {
+    zone grafana_upstream 64k;  # shared memory zone for the upstream group
+    server grafana:3000 resolve;  # container name, re-resolved at runtime
 }
 
 server {
@@ -37,7 +37,7 @@ server {
     ssl_certificate_key /etc/letsencrypt/live/{{ inventory_hostname }}/privkey.pem;
 
 	add_header Strict-Transport-Security "max-age=31536000" always;
-	add_header Referrer-Policy origin always;  # make sure outgoing links don't show the URL to the Prometheus instance
+	add_header Referrer-Policy origin always;  # make sure outgoing links don't show the URL
     add_header X-Content-Type-Options "nosniff" always;
     add_header X-XSS-Protection "1; mode=block" always;
 
@@ -47,6 +47,6 @@ server {
     proxy_set_header X-Forwarded-Proto $scheme;
 
     proxy_read_timeout 180;
-    proxy_pass http://prometheus;
+    proxy_pass http://grafana;
   }
 }
diff --git a/roles/podman_prometheus/templates/home/podman/prometheus.yml b/roles/podman_prometheus/templates/home/podman/prometheus.yml
index 422646d..4870f78 100644
--- a/roles/podman_prometheus/templates/home/podman/prometheus.yml
+++ b/roles/podman_prometheus/templates/home/podman/prometheus.yml
@@ -8,6 +8,9 @@ scrape_configs:
     scrape_interval: 5s
     static_configs:
       - targets: ['localhost:9090']
+  - job_name: 'alertmanager'
+    static_configs:
+      - targets: ['alertmanager:9093']
   - job_name: 'node'
     scrape_interval: 5s
     scheme: https
@@ -25,9 +28,22 @@ scrape_configs:
 {% for host in groups['keycloak'] %}
         - '{{ host }}:9100'
 {% endfor %}
+{% for host in groups['radius'] %}
+        - '{{ host }}:9100'
+{% endfor %}
 {% for host in groups['generic'] %}
         - '{{ host }}:9100'
 {% endfor %}
     file_sd_configs:
       - files:
           - "/file-configs/*.yml"
+
+alerting:
+  alertmanagers:
+    - scheme: http
+      static_configs:
+        - targets:
+            - 'alertmanager:9093'  # same endpoint as the alertmanager scrape job
+
+rule_files:
+  - '/etc/alertmanager/alert.rules.yml'  # path where the prometheus quadlet mounts alert.rules.yml