23 changed files with 12 additions and 935 deletions
--- a/playbooks/services.yml
+++ b/playbooks/services.yml
@ -23,6 +23,8 @@
    rhel9cis_rule_5_4_3_3: false
  roles:
    - role: sr2c.core.baseline
+      baseline_epel_packages_allowed:
+        - node-exporter
      tags: bootstrap
    - role: sr2c.core.freeipa
      become: true
@ -75,31 +77,3 @@
      tags: bootstrap
    - role: sr2c.core.node_exporter
      tags: prometheus
-
- name: Deploy and update Radius server
-  hosts:
-    - radius
-  roles:
-    - role: sr2c.core.baseline
-      vars:
-        baseline_epel_packages_allowed:
-          - certbot
-          - python3-certbot
-          - python3-pyrfc3339
-          - python3-parsedatetime
-          - python3-josepy
-          - python3-importlib-metadata
-          - python3-configargparse
-          - python3-acme
-          - python3-zipp
-          - python3-pyOpenSSL
-          - node-exporter
-      tags: bootstrap
-    - role: freeipa.ansible_freeipa.ipaclient
-      become: true
-      state: present
-      tags: bootstrap
-    - role: sr2c.core.node_exporter
-      tags: prometheus
-    - role: sr2c.core.radius
-      tags: radius
--- a/roles/podman_prometheus/files/home/podman/alert.rules.yml
+++ b/roles/podman_prometheus/files/home/podman/alert.rules.yml
@ -1,302 +0,0 @@
-groups:
- name: node_exporter_alerts
-  rules:
-  - alert: Node down
-    expr: up == 0
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      title: Node {{ $labels.instance }} is down
-      description: Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down.
-
-  - alert: HostOutOfMemory
-    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host out of memory (instance {{ $labels.instance }})
-      description: Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}
-
-  - alert: HostMemoryUnderMemoryPressure
-    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host memory under memory pressure (instance {{ $labels.instance }})
-      description: The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}
-
-  - alert: HostUnusualNetworkThroughputIn
-    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual network throughput in (instance {{ $labels.instance }})
-      description: Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}
-
-  - alert: HostUnusualNetworkThroughputOut
-    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual network throughput out (instance {{ $labels.instance }})
-      description: Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}
-
-  - alert: HostUnusualDiskReadRate
-    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual disk read rate (instance {{ $labels.instance }})
-      description: Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}
-
-  # TODO: Debug and reduce limit to 50
-  - alert: HostUnusualDiskWriteRate
-    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 65
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual disk write rate (instance {{ $labels.instance }})
-      description: Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}
-
-  # Please add ignored mountpoints in node_exporter parameters like
-  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
-  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
-  - alert: HostOutOfDiskSpace
-    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host out of disk space (instance {{ $labels.instance }})
-      description: Disk is almost full (< 10% left)\n  VALUE = {{ $value }}
-
-  # Please add ignored mountpoints in node_exporter parameters like
-  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
-  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
-  - alert: HostDiskWillFillIn24Hours
-    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
-      description: Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ $value }}
-
-  - alert: HostOutOfInodes
-    expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host out of inodes (instance {{ $labels.instance }})
-      description: Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}
-
-  - alert: HostInodesWillFillIn24Hours
-    expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
-      description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}
-
-  - alert: HostUnusualDiskReadLatency
-    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual disk read latency (instance {{ $labels.instance }})
-      description: Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}
-
-  - alert: HostUnusualDiskWriteLatency
-    expr: rate(node_disk_write_time_seconds_totali{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual disk write latency (instance {{ $labels.instance }})
-      description: Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}
-
-  - alert: HostHighCpuLoad
-    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host high CPU load (instance {{ $labels.instance }})
-      description: CPU load is > 80%\n  VALUE = {{ $value }}
-
-  - alert: HostCpuStealNoisyNeighbor
-    expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
-      description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}
-
-#  TODO: Increase size of monitor instance
-#  # 1000 context switches is an arbitrary number.
-#  # Alert threshold depends on nature of application.
-#  # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
-#  - alert: HostContextSwitching
-#    expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
-#    for: 0m
-#    labels:
-#      severity: warning
-#    annotations:
-#      summary: Host context switching (instance {{ $labels.instance }})
-#      description: Context switching is growing on node (> 1000 / s)\n  VALUE = {{ $value }}
-
-  - alert: HostSwapIsFillingUp
-    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host swap is filling up (instance {{ $labels.instance }})
-      description: Swap is filling up (>80%)\n  VALUE = {{ $value }}
-
-  - alert: HostSystemdServiceCrashed
-    expr: node_systemd_unit_state{state="failed"} == 1
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host SystemD service crashed (instance {{ $labels.instance }})
-      description: SystemD service crashed\n  VALUE = {{ $value }}
-
-  - alert: HostPhysicalComponentTooHot
-    expr: node_hwmon_temp_celsius > 75
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host physical component too hot (instance {{ $labels.instance }})
-      description: Physical hardware component too hot\n  VALUE = {{ $value }}
-
-  - alert: HostNodeOvertemperatureAlarm
-    expr: node_hwmon_temp_crit_alarm_celsius == 1
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Host node overtemperature alarm (instance {{ $labels.instance }})
-      description: Physical node temperature alarm triggered\n  VALUE = {{ $value }}
-
-  - alert: HostRaidArrayGotInactive
-    expr: node_md_state{state="inactive"} > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Host RAID array got inactive (instance {{ $labels.instance }})
-      description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n  VALUE = {{ $value }}
-
-  - alert: HostRaidDiskFailure
-    expr: node_md_disks{state="failed"} > 0
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host RAID disk failure (instance {{ $labels.instance }})
-      description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}
-
-#  TODO: We have mix of Debian/Rocky/Alma systems
-#  - alert: HostKernelVersionDeviations
-#    expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
-#    for: 6h
-#    labels:
-#      severity: warning
-#    annotations:
-#      summary: Host kernel version deviations (instance {{ $labels.instance }})
-#      description: Different kernel versions are running\n  VALUE = {{ $value }}
-
-  - alert: HostOomKillDetected
-    expr: increase(node_vmstat_oom_kill[1m]) > 0
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host OOM kill detected (instance {{ $labels.instance }})
-      description: OOM kill detected\n  VALUE = {{ $value }}
-
-  - alert: HostEdacCorrectableErrorsDetected
-    expr: increase(node_edac_correctable_errors_total[1m]) > 0
-    for: 0m
-    labels:
-      severity: info
-    annotations:
-      summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
-      description: Instance has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}
-
-  - alert: HostEdacUncorrectableErrorsDetected
-    expr: node_edac_uncorrectable_errors_total > 0
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
-      description: Instance has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}
-
-  - alert: HostNetworkReceiveErrors
-    expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }})
-      description: Instance interface has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.\n  VALUE = {{ $value }}
-
-  - alert: HostNetworkTransmitErrors
-    expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }})
-      description: Instance has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.\n  VALUE = {{ $value }}
-
-  - alert: HostNetworkInterfaceSaturated
-    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
-    for: 1m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.interface }})
-      description: The network interface is getting overloaded.\n  VALUE = {{ $value }}
-
-  - alert: HostConntrackLimit
-    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host conntrack limit (instance {{ $labels.instance }})
-      description: The number of conntrack is approching limit\n  VALUE = {{ $value }}
-
-  - alert: HostClockSkew
-    expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host clock skew (instance {{ $labels.instance }})
-      description: Clock skew detected. Clock is out of sync.\n  VALUE = {{ $value }}
-
-  - alert: HostClockNotSynchronising
-    expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host clock not synchronising (instance {{ $labels.instance }})
-      description: Clock not synchronising.\n  VALUE = {{ $value }}
--- a/roles/podman_prometheus/handlers/main.yml
+++ b/roles/podman_prometheus/handlers/main.yml
@ -1,20 +1,4 @@
 ---
- name: Restart Alertmanager
-  ansible.builtin.systemd_service:
-    name: grafana
-    scope: user
-    state: restarted
-  become: true
-  become_user: "{{ podman_prometheus_podman_rootless_user }}"
-
- name: Restart Grafana
-  ansible.builtin.systemd_service:
-    name: grafana
-    scope: user
-    state: restarted
-  become: true
-  become_user: "{{ podman_prometheus_podman_rootless_user }}"
-
 - name: Restart Prometheus
  ansible.builtin.systemd_service:
    name: prometheus
@ -22,12 +6,3 @@
    state: restarted
  become: true
  become_user: "{{ podman_prometheus_podman_rootless_user }}"
-
- name: Restart nginx
-  ansible.builtin.systemd_service:
-    name: nginx
-    state: restarted
-    scope: user
-    daemon_reload: true
-  become: true
-  become_user: "{{ podman_prometheus_podman_rootless_user }}"
--- a/roles/podman_prometheus/tasks/main.yml
+++ b/roles/podman_prometheus/tasks/main.yml
@ -55,52 +55,15 @@
 # Prometheus runs with UID/GID 65534 inside the container
 - name: Podman Prometheus | PATCH | Install Prometheus configuration
  ansible.builtin.template:
-    src: "home/podman/{{ item }}"
-    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/{{ item }}"
+    src: home/podman/prometheus.yml
+    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/prometheus.yml"
    mode: "0400"
    owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}"
    group: "{{ _podman_prometheus_user_subgid_start + 65533 }}"
  become: true
-  with_items:
-    - prometheus.yml
  notify:
    - Restart Prometheus

- name: Podman Prometheus | PATCH | Install Prometheus alert rules
-  ansible.builtin.copy:
-    src: "home/podman/{{ item }}"
-    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/{{ item }}"
-    mode: "0400"
-    owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}"
-    group: "{{ _podman_prometheus_user_subgid_start + 65533 }}"
-  become: true
-  with_items:
-    - alert.rules.yml
-  notify:
-    - Restart Prometheus
-
-# Alertmanager runs with UID/GID 65534 inside the container
- name: Podman Prometheus | PATCH | Install Alertmanager configuration
-  ansible.builtin.template:
-    src: home/podman/alertmanager.yml
-    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/alertmanager.yml"
-    mode: "0400"
-    owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}"
-    group: "{{ _podman_prometheus_user_subgid_start + 65533 }}"
-  become: true
-  notify:
-    - Restart Alertmanager
-
-# Grafana runs with UID/GID 472 inside the container
- name: Podman Prometheus | PATCH | Create data directory for Grafana
-  ansible.builtin.file:
-    path: "/home/{{ podman_prometheus_podman_rootless_user }}/grafana-data"
-    owner: "{{ _podman_prometheus_user_subuid_start + 471 }}"
-    group: "{{ _podman_prometheus_user_subgid_start + 471 }}"
-    mode: "0700"
-    state: "directory"
-  become: true
-
 - name: Podman Prometheus | PATCH | Install container quadlets
  ansible.builtin.template:
    src: "home/podman/config/containers/systemd/{{ item }}"
@ -108,12 +71,9 @@
    owner: "{{ podman_prometheus_podman_rootless_user }}"
    mode: "0400"
  with_items:
-    - alertmanager.container
-    - grafana.container
    - prometheus.container
  become: true
  notify:
-    - Restart Grafana
    - Restart Prometheus

 - name: Podman Prometheus | PATCH | Install network quadlets
@ -124,11 +84,8 @@
    mode: "0400"
  with_items:
    - frontend.network
-    - monitor.network
  become: true
  notify:
-    - Restart Alertmanager
-    - Restart Grafana
    - Restart Prometheus
    - Restart nginx

@ -165,7 +122,7 @@
  notify:
    - Restart nginx

- name: Podman Prometheus | PATCH | Make sure Prometheus, Grafana and Nginx are running now and started on boot
+- name: Podman Prometheus | PATCH | Make sure Prometheus and Nginx are running now and started on boot
  ansible.builtin.systemd_service:
    name: "{{ item }}.service"
    enabled: true
@ -174,8 +131,6 @@
    daemon_reload: true
    scope: user
  with_items:
-    - alertmanager
-    - grafana
    - nginx
    - prometheus
  become: true
--- a/roles/podman_prometheus/templates/home/podman/alertmanager.yml
+++ b/roles/podman_prometheus/templates/home/podman/alertmanager.yml
@ -1 +0,0 @@
-{{ podman_prometheus_alertmanager_config | to_nice_yaml( width=50, explicit_start=True, explicit_end=True) }}
--- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/alertmanager.container
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/alertmanager.container
@ -1,11 +0,0 @@
-[Container]
-ContainerName=alertmanager
-Image=quay.io/prometheus/alertmanager:v0.31.1
-Network=monitor.network
-Volume=/home/{{ podman_prometheus_podman_rootless_user }}/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro,Z
-
-[Service]
-Restart=on-failure
-
-[Install]
-WantedBy=default.target
--- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/grafana.container
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/grafana.container
@ -1,14 +0,0 @@
-[Container]
-ContainerName=grafana
-Image=docker.io/grafana/grafana
-Environment=GF_SERVER_DOMAIN={{ inventory_hostname }}
-Environment=GF_SERVER_ROOT_URL=https://%%(domain)s/
-Network=frontend.network
-Network=monitor.network
-Volume=/home/{{ podman_prometheus_podman_rootless_user }}/grafana-data:/var/lib/grafana:rw,Z
-
-[Service]
-Restart=on-failure
-
-[Install]
-WantedBy=default.target
--- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/monitor.network
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/monitor.network
@ -1,2 +0,0 @@
-[Network]
-NetworkName=network
--- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container
@ -1,11 +1,10 @@
 [Container]
 ContainerName=prometheus
-Image=quay.io/prometheus/prometheus:v3.9.1
-Network=monitor.network
+Image=quay.io/prometheus/prometheus:v3.8.1
+Network=frontend.network
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/prometheus-data:/prometheus:rw,Z
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/file-configs:/file-configs:ro,Z
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/prometheus.yml:/etc/prometheus/prometheus.yml:ro,Z
-Volume=/home/{{ podman_prometheus_podman_rootless_user }}/alert.rules.yml:/etc/alertmanager/alert.rules.yml:ro,Z

 [Service]
 Restart=on-failure
--- a/roles/podman_prometheus/templates/home/podman/nginx.conf
+++ b/roles/podman_prometheus/templates/home/podman/nginx.conf
@ -21,9 +21,9 @@ server {
    }
 }

-upstream grafana {
-    zone grafana_upstream 64k;
-	server grafana:3000 resolve;
+upstream prometheus {
+    zone prometheus_upstream 64k;
+	server prometheus:9090 resolve;
 }

 server {
@ -37,7 +37,7 @@ server {
    ssl_certificate_key /etc/letsencrypt/live/{{ inventory_hostname }}/privkey.pem;

 	add_header Strict-Transport-Security "max-age=31536000" always;
-	add_header Referrer-Policy origin always;  # make sure outgoing links don't show the URL
+	add_header Referrer-Policy origin always;  # make sure outgoing links don't show the URL to the Prometheus instance
    add_header X-Content-Type-Options "nosniff" always;
    add_header X-XSS-Protection "1; mode=block" always;

@ -47,6 +47,6 @@ server {
    proxy_set_header X-Forwarded-Proto $scheme;

    proxy_read_timeout 180;
-    proxy_pass http://grafana;
+    proxy_pass http://prometheus;
  }
 }
--- a/roles/podman_prometheus/templates/home/podman/prometheus.yml
+++ b/roles/podman_prometheus/templates/home/podman/prometheus.yml
@ -8,9 +8,6 @@ scrape_configs:
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9090']
-  - job_name: 'alertmanager'
-    static_configs:
-      - targets: ['alertmanager:9093']
  - job_name: 'node'
    scrape_interval: 5s
    scheme: https
@ -28,22 +25,9 @@ scrape_configs:
 {% for host in groups['keycloak'] %}
        - '{{ host }}:9100'
 {% endfor %}
-{% for host in groups['radius'] %}
-        - '{{ host }}:9100'
-{% endfor %}
 {% for host in groups['generic'] %}
        - '{{ host }}:9100'
 {% endfor %}
    file_sd_configs:
      - files:
          - "/file-configs/*.yml"
-
-alerting:
-  alertmanagers:
-  - scheme: http
-    static_configs:
-    - targets:
-      - "alertmanager:9093"
-
-rule_files:
-  - "/etc/alertmanager/alert.rules.yml"
--- a/roles/radius/defaults/main.yml
+++ b/roles/radius/defaults/main.yml
@ -1,6 +0,0 @@
---
-#radius_domain_name:
-radius_wap_ipaddr: 0.0.0.0/0
-#radius_wap_secret:
-radius_local_vlan: 1
-radius_guest_vlan: 3
--- a/roles/radius/handlers/main.yml
+++ b/roles/radius/handlers/main.yml
@ -1,6 +0,0 @@
---
- name: Restart radiusd
-  ansible.builtin.systemd_service:
-    name: radiusd
-    state: restarted
-  become: true
--- a/roles/radius/tasks/certs.yml
+++ b/roles/radius/tasks/certs.yml
@ -1,82 +0,0 @@
---
- name: "Radius Certificates | PATCH | Install latest certbot"
-  ansible.builtin.dnf:
-    name: certbot
-    state: latest
-    update_cache: true
-  become: true
-
- name: "Radius Certificates | AUDIT | Check for existing certificate expiry"
-  community.crypto.x509_certificate_info:
-    path: "/etc/letsencrypt/live/{{ inventory_hostname }}/cert.pem"
-  register: radius_certs_existing_cert
-  ignore_errors: true
-  become: true
-
- name: "Radius Certificates | AUDIT | Calculate days until expiry"
-  ansible.builtin.set_fact:
-    radius_certs_days_until_expiry: "{{ ((radius_certs_existing_cert.not_after | to_datetime('%Y%m%d%H%M%SZ')) - now()).days }}"
-  when: radius_certs_existing_cert.not_after is defined
-  become: true
-
- name: "Radius Certificates | AUDIT | Print days until expiry"
-  ansible.builtin.debug:
-    msg: "{{ radius_certs_days_until_expiry }}"
-  when: radius_certs_existing_cert.not_after is defined
-  become: true
-
- name: "Radius Certificates | PATCH | Request a new or renewed certificate"
-  when: (radius_certs_existing_cert.failed) or (radius_certs_days_until_expiry | int < 30)
-  become: true
-  block:
-    - name: "Radius Certificates | AUDIT | Check httpd"
-      ansible.builtin.systemd_service:
-        name: httpd
-      register: radius_certs_httpd_status
-
-    - name: "Radius Certificates | PATCH | Stop httpd"
-      ansible.builtin.systemd_service:
-        name: httpd
-        state: stopped
-      when: radius_certs_httpd_status.status.ActiveState == "active"
-
-    - name: "Radius Certificates | PATCH | Add http service to firewall"
-      ansible.posix.firewalld:
-        service: http
-        state: enabled
-
-    - name: "Radius Certificates | PATCH | Request new certificate"
-      ansible.builtin.command:
-        cmd: certbot certonly --standalone --preferred-challenges http --agree-tos -n -d {{ inventory_hostname }} --register-unsafely-without-email
-      when: radius_certs_existing_cert.failed
-
-    - name: "Radius Certificates | PATCH | Renew existing certificate"
-      ansible.builtin.command:
-        cmd: certbot renew
-      when: not radius_certs_existing_cert.failed
-
-    - name: "Radius Certificates | PATCH | Remove http service from firewall"
-      ansible.posix.firewalld:
-        service: http
-        state: disabled
-
-    - name: "Radius Certificates | PATCH | Start httpd"
-      ansible.builtin.systemd_service:
-        name: httpd
-        state: started
-      when: radius_certs_httpd_status.status.ActiveState == "active"
-
- name: Radius | PATCH | Allow radiusd access to certificates
-  ansible.builtin.copy:
-    src: /etc/letsencrypt/live/{{ inventory_hostname }}/{{ item }}.pem
-    dest: /etc/raddb/{{ item }}.pem
-    remote_src: true
-    owner: radiusd
-    group: radiusd
-    mode: "0640"
-  become: true
-  notify: Restart radiusd
-  with_items:
-    - privkey
-    - cert
-    - chain
--- a/roles/radius/tasks/main.yml
+++ b/roles/radius/tasks/main.yml
@ -1,40 +0,0 @@
---
- name: Radius | PATCH | Obtain or freshen certificates
-  ansible.builtin.include_tasks:
-    file: certs.yml
-
- name: Radius | PATCH | Install required packages
-  ansible.builtin.dnf:
-    name: freeradius
-    state: present
-  become: true
-
- name: Radius | PATCH | Install FreeRADIUS configuration files
-  ansible.builtin.template:
-    src: etc/raddb/{{ item }}
-    dest: /etc/raddb/{{ item }}
-    owner: root
-    group: radiusd
-    mode: 0640
-  become: true
-  with_items:
-    - mods-available/eap
-    - mods-available/linelog
-    - sites-available/default
-    - mods-available/inner-eap
-    - sites-available/inner-tunnel
-    - clients.conf
-    - proxy.conf
-  notify:
-    - Restart radiusd
-
- name: Radius | PATCH | Install rsyslog configuration
-  ansible.builtin.template:
-    src: etc/rsyslog.d/radiusd.conf
-    dest: /etc/rsyslog.d/radiusd.conf
-    owner: root
-    group: root
-    mode: 0644
-  become: true
-  notify:
-    - Reload rsyslog
--- a/roles/radius/templates/etc/raddb/clients.conf
+++ b/roles/radius/templates/etc/raddb/clients.conf
@ -1,22 +0,0 @@
-client eduroam_roaming0 {
-    ipaddr = roaming0.ja.net
-    secret = {{ radius_roaming0_secret }}
-    nastype = 'eduroam_flr'
-}
-
-client eduroam_roaming1 {
-    ipaddr = roaming1.ja.net
-    secret = {{ radius_roaming1_secret }}
-    nastype = 'eduroam_flr'
-}
-
-client eduroam_roaming2 {
-    ipaddr = roaming2.ja.net
-    secret = {{ radius_roaming2_secret }}
-    nastype = 'eduroam_flr'
-}
-
-client wireless_access_points_mgmt {
-	ipaddr = {{ radius_wap_ipaddr }}
-	secret = {{ radius_wap_secret }}
-}
--- a/roles/radius/templates/etc/raddb/mods-available/eap
+++ b/roles/radius/templates/etc/raddb/mods-available/eap
@ -1,52 +0,0 @@
-eap {
-	# The initial EAP type requested.  Change this to peap if you're
-	# using peap, or tls if you're using EAP-TLS.
-	default_eap_type = ttls
-
-	# The maximum time an EAP-Session can continue for
-	timer_expire = 60
-
-	# The maximum number of ongoing EAP sessions
-	max_sessions = ${max_requests}
-
-	tls-config tls-common {
-		# The public certificate that your server will present
-		certificate_file = /etc/raddb/cert.pem
-
-		# The private key for the public certificate
-		private_key_file = /etc/raddb/privkey.pem
-
-		# The password to decrypt 'private_key_file'
-		#private_key_password = ""
-
-		# The certificate of the authority that issued 'certificate_file'
-		ca_file = /etc/raddb/chain.pem
-
-		# If your AP drops packets towards the client, try reducing this.
-		fragment_size = 1024
-
-		# When issuing client certificates embed the OCSP URL in the
-		# certificate if you want to be able to revoke them later.
-		ocsp {
-			enable = yes
-			override_cert_url = no
-			use_nonce = yes
-		}
-	}
-
-	tls {
-		tls = tls-common
-	}
-
-	ttls {
-		tls = tls-common
-		default_eap_type = mschapv2
-		virtual_server = "eduroam-inner"
-	}
-
-	peap {
-		tls = tls-common
-		default_eap_type = mschapv2
-		virtual_server = "eduroam-inner"
-	}
-}
--- a/roles/radius/templates/etc/raddb/mods-available/inner-eap
+++ b/roles/radius/templates/etc/raddb/mods-available/inner-eap
@ -1,9 +0,0 @@
-eap inner-eap {
-	default_eap_type = mschapv2
-	timer_expire = 60
-	max_sessions = ${max_requests}
-
-	mschapv2 {
-		send_error = yes
-	}
-}
--- a/roles/radius/templates/etc/raddb/mods-available/linelog
+++ b/roles/radius/templates/etc/raddb/mods-available/linelog
@ -1,39 +0,0 @@
-linelog linelog_recv_request {
-	filename = syslog
-	syslog_facility = local0
-	syslog_severity = debug
-	format = "action = Recv-Request, %{pairs:request:}"
-}
-
-linelog linelog_send_accept {
-	filename = syslog
-	syslog_facility = local0
-	syslog_severity = debug
-	format = "action = Send-Accept, %{pairs:request:}"
-}
-
-linelog linelog_send_reject {
-	filename = syslog
-	syslog_facility = local0
-	syslog_severity = debug
-	format = "action = Send-Reject, %{pairs:request:}"
-}
-
-linelog linelog_send_proxy_request {
-	filename = syslog
-	syslog_facility = local0
-	syslog_severity = debug
-	format = "action = Send-Proxy-Request, %{pairs:proxy-request:}"
-}
-
-linelog linelog_recv_proxy_response {
-	filename = syslog
-	syslog_facility = local0
-	syslog_severity = debug
-	reference = "messages.%{proxy-reply:Response-Packet-Type}"
-	messages {
-		Access-Accept = "action = Recv-Proxy-Accept, User-Name = %{User-Name}, Calling-Station-Id = %{Calling-Station-Id}, %{pairs:proxy-reply:}"
-		Access-Reject = "action = Recv-Proxy-Reject, User-Name = %{User-Name}, Calling-Station-Id = %{Calling-Station-Id}, %{pairs:proxy-reply:}"
-		Access-Challenge = "action = Recv-Proxy-Challenge, User-Name = %{User-Name}, Calling-Station-ID = %{Calling-Station-Id}, %{pairs:proxy-reply:}"
-	}
-}
--- a/roles/radius/templates/etc/raddb/proxy.conf
+++ b/roles/radius/templates/etc/raddb/proxy.conf
@ -1,38 +0,0 @@
-# Upstream servers at roaming0/1/2.ja.net. Each has its own shared secret
-# supplied by a template variable and is health-checked with status-server.
-# NOTE(review): the secrets are rendered unquoted; presumably they never
-# contain whitespace or '#' - confirm, or quote them.
-home_server eduroam_roaming0 {
-    ipaddr = roaming0.ja.net
-    secret = {{ radius_roaming0_secret }}
-    status_check = status-server
-    response_window = 5
-    check_interval = 10
-    check_timeout = 5
-}
-
-home_server eduroam_roaming1 {
-    ipaddr = roaming1.ja.net
-    secret = {{ radius_roaming1_secret }}
-    status_check = status-server
-    response_window = 5
-    check_interval = 10
-    check_timeout = 5
-}
-
-home_server eduroam_roaming2 {
-    ipaddr = roaming2.ja.net
-    secret = {{ radius_roaming2_secret }}
-    status_check = status-server
-    response_window = 5
-    check_interval = 10
-    check_timeout = 5
-}
-
-# Pool of the three home servers above, using the keyed-balance type.
-home_server_pool eduroam_flr_pool {
-    type = keyed-balance
-    home_server = eduroam_roaming0
-    home_server = eduroam_roaming1
-    home_server = eduroam_roaming2
-}
-
-# Realm that sends authentication requests to the pool above.
-realm eduroam_flr {
-    auth_pool = eduroam_flr_pool
-    nostrip
-}
--- a/roles/radius/templates/etc/raddb/sites-available/default
+++ b/roles/radius/templates/etc/raddb/sites-available/default
@ -1,112 +0,0 @@
-# The domain users will add to their username to have their credentials
-# routed to your institution.  You will also need to register this
-# and your RADIUS server addresses with your NRO.
-operator_name = "{{ radius_domain }}"
-
-# The VLAN to assign eduroam visitors
-eduroam_default_guest_vlan = "{{ radius_guest_vlan }}"
-
-# The VLAN to assign your students/staff
-eduroam_default_local_vlan = "{{ radius_local_vlan }}"
-
-# Outer eduroam virtual server: requests for the local domain are handled
-# by the eap module, all other domains are proxied to the eduroam_flr realm.
-server eduroam {
-	# NOTE(review): both ipv4addr and ipv6addr are set in this one listen
-	# section - confirm the server really binds both address families.
-	listen {
-		type = auth
-		ipv4addr = *
-		ipv6addr = *
-		port = 1812
-	}
-
-	authorize {
-		# Log requests before we change them
-		linelog_recv_request
-
-		# split_username_nai is a policy in the default distribution to
-		# split a username into username and domain.  We reject user-name
-		# strings without domains, as they're not routable.
-		split_username_nai
-		if (noop || !&Stripped-User-Domain) {
-			reject
-		}
-
-		# Send the request to the NRO for your region.
-		# The details of the FLRs (Federation Level RADIUS servers)
-		# are in proxy.conf.
-		# You can make this condition as complex as you like, to
-		# include additional subdomains just concatenate the conditions
-		# with &&.
-		if (&Stripped-User-Domain != "${operator_name}") {
-			update {
-				control:Load-Balance-Key := &Calling-Station-ID
-				control:Proxy-To-Realm := 'eduroam_flr'
-
-				# Operator name (RFC 5580) identifies the network the
-				# request originated from. It's not absolutely necessary
-				# but it helps with debugging.
-				request:Operator-Name := "1${operator_name}"
-			}
-			return
-		}
-
-		# If the EAP module returns 'ok' or 'updated', it means it has handled
-		# the request and we don't need to call any other modules in this
-		# section.
-		eap {
-			ok = return
-			updated = return
-		}
-	}
-
-	# Apply the pre-proxy attribute filter, then log the outgoing
-	# proxied request.
-	pre-proxy {
-		attr_filter.pre-proxy
-		linelog_send_proxy_request
-	}
-
-	# Apply the post-proxy attribute filter, then log the home server's
-	# response.
-	post-proxy {
-		attr_filter.post-proxy
-		linelog_recv_proxy_response
-	}
-
-	authenticate {
-		eap
-	}
-
-	post-auth {
-		# To implement eduroam you must:
-		# - Use wireless access points or a controller which supports
-		#   dynamic VLAN assignments.
-		# - Have that feature enabled.
-		# - Have the guest_vlan/local_vlan available to the controller,
-		#   or to all your access points.
-		# eduroam user traffic *MUST* be segregated, this is *NOT* optional.
-		update reply {
-			Tunnel-Type := VLAN
-			Tunnel-Medium-Type := IEEE-802
-		}
-		# Proxy-To-Realm is only set in authorize for non-local domains, so
-		# its presence marks a visitor (guest VLAN); otherwise the user is
-		# one of ours (local VLAN).
-		if (&control:Proxy-To-Realm) {
-			update reply {
-				Tunnel-Private-Group-ID = ${eduroam_default_guest_vlan}
-			}
-		}
-		else {
-			update reply {
-				Tunnel-Private-Group-ID = ${eduroam_default_local_vlan}
-			}
-		}
-
-		# We're sending a response to one of OUR network devices for one of
-		# OUR users so provide it with the real user-identity.
-		if (&session-state:Stripped-User-Name) {
-			update reply {
-				User-Name := "%{session-state:Stripped-User-Name}@%{Stripped-User-Domain}"
-			}
-		}
-
-		linelog_send_accept
-
-		# Rejected requests: filter the reply attributes and log the reject.
-		Post-Auth-Type REJECT {
-			attr_filter.access_reject
-			linelog_send_reject
-		}
-	}
-}
--- a/roles/radius/templates/etc/raddb/sites-available/inner-tunnel
+++ b/roles/radius/templates/etc/raddb/sites-available/inner-tunnel
@ -1,73 +0,0 @@
-# Inner-tunnel virtual server: handles the identity and credentials carried
-# inside the EAP tunnel.
-server eduroam-inner {
-	# NOTE(review): both ipaddr and ipv6addr are set in this one listen
-	# section - confirm the server really binds both address families.
-	listen {
-		type = auth
-		ipaddr = *
-		ipv6addr = *
-		port = 18120 # Used for testing only.  Requests proxied internally.
-	}
-
-	authorize {
-		# The outer username is considered garbage for autz purposes, but
-		# the domain portion of the outer and inner identities must match.
-		split_username_nai
-		if (noop || (&Stripped-User-Domain && \
-		    (&outer.Stripped-User-Domain != &Stripped-User-Domain))) {
-			reject
-		}
-
-		# Make the user's real identity available to anything that needs
-		# it in the outer server.
-		if (&outer.session-state:) {
-			update {
-				&outer.session-state:Stripped-User-Name := &Stripped-User-Name
-			}
-		}
-
-		# EAP for PEAPv0 (EAP-MSCHAPv2)
-		inner-eap {
-			ok = return
-		}
-
-		# THIS IS SITE SPECIFIC
-		#
-		# The files module is *ONLY* used for testing.  It lets you define
-		# credentials in a flat file, IT WILL NOT SCALE.
-		#
-		# - If you use OpenLDAP with salted password hashes you should
-		#   call the 'ldap' module here and use EAP-TTLS-PAP as your EAP method.
-		# - If you use OpenLDAP with cleartext passwords you should
-		#   call the 'ldap' module here and use EAP-TTLS or PEAPv0.
-		# - If you use an SQL DB with salted password hashes you should call
-		#   the 'sql' module here and use EAP-TTLS-PAP as your EAP method.
-		# - If you use an SQL DB with cleartext passwords you should call
-		#   the 'sql' module here and use EAP-TTLS or PEAPv0.
-		# - If you use Novell you should call the 'ldap' module here and
-		#   set ``edir = yes`` in ``mods-available/ldap`` and use EAP-TTLS or
-		#   PEAPv0.
-		# - If you use Active Directory, you don't need anything here (remove
-		#   the call to files) but you'll need to follow this
-		#   [guide](freeradius-active-directory-integration-howto) and use
-		#   EAP-TTLS-PAP or PEAPv0.
-		# - If you're using EAP-TLS (i'm impressed!) remove the call to files.
-		#
-		# EAP-TTLS-PAP and PEAPv0 are equally secure/insecure depending on how the
-		# supplicant is configured. PEAPv0 has a slight edge in that you need to
-		# crack MSCHAPv2 to get the user's password (but this is not hard).
-		files
-
-		pap
-		mschap
-	}
-
-	authenticate {
-		inner-eap
-		mschap
-		pap
-
-		# Comment pap above and uncomment the stanza below if you're using
-		# Active Directory; this will allow it to work with EAP-TTLS/PAP.
-		#Auth-Type pap {
-		#	ntlm_auth
-		#}
-	}
-}
--- a/roles/radius/templates/etc/rsyslog.d/radiusd.conf
+++ b/roles/radius/templates/etc/rsyslog.d/radiusd.conf
@ -1 +0,0 @@
-local0.debug					/var/log/radius_auth.log
				`@ -1 +0,0 @@`
				`{{ podman_prometheus_alertmanager_config \| to_nice_yaml( width=50, explicit_start=True, explicit_end=True) }}`