23 changed files with 12 additions and 935 deletions
--- a/playbooks/services.yml
+++ b/playbooks/services.yml
@ -23,6 +23,8 @@
    rhel9cis_rule_5_4_3_3: false
  roles:
    - role: sr2c.core.baseline
      baseline_epel_packages_allowed:
        - node-exporter
      tags: bootstrap
    - role: sr2c.core.freeipa
      become: true
@ -75,31 +77,3 @@
      tags: bootstrap
    - role: sr2c.core.node_exporter
      tags: prometheus
 - name: Deploy and update Radius server
  hosts:
    - radius
  roles:
    - role: sr2c.core.baseline
      vars:
        baseline_epel_packages_allowed:
          - certbot
          - python3-certbot
          - python3-pyrfc3339
          - python3-parsedatetime
          - python3-josepy
          - python3-importlib-metadata
          - python3-configargparse
          - python3-acme
          - python3-zipp
          - python3-pyOpenSSL
          - node-exporter
      tags: bootstrap
    - role: freeipa.ansible_freeipa.ipaclient
      become: true
      state: present
      tags: bootstrap
    - role: sr2c.core.node_exporter
      tags: prometheus
    - role: sr2c.core.radius
      tags: radius
--- a/roles/podman_prometheus/files/home/podman/alert.rules.yml
+++ b/roles/podman_prometheus/files/home/podman/alert.rules.yml
@ -1,302 +0,0 @@
 groups:
 - name: node_exporter_alerts
  rules:
  - alert: Node down
    expr: up == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      title: Node {{ $labels.instance }} is down
      description: Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down.
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of memory (instance {{ $labels.instance }})
      description: Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}
  - alert: HostMemoryUnderMemoryPressure
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host memory under memory pressure (instance {{ $labels.instance }})
      description: The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}
  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput in (instance {{ $labels.instance }})
      description: Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}
  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput out (instance {{ $labels.instance }})
      description: Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}
  - alert: HostUnusualDiskReadRate
    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk read rate (instance {{ $labels.instance }})
      description: Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}
  # TODO: Debug and reduce limit to 50
  # Fires when the per-instance sum of disk write rates (bytes/s divided by
  # 1024 twice) stays above 65 for 2 minutes. The description must quote the
  # same number as `expr`; update both together when the TODO is resolved.
  - alert: HostUnusualDiskWriteRate
    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 65
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk write rate (instance {{ $labels.instance }})
      description: Disk is probably writing too much data (> 65 MB/s)\n  VALUE = {{ $value }}
  # Please add ignored mountpoints in node_exporter parameters like
  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of disk space (instance {{ $labels.instance }})
      description: Disk is almost full (< 10% left)\n  VALUE = {{ $value }}
  # Please add ignored mountpoints in node_exporter parameters like
  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
  - alert: HostDiskWillFillIn24Hours
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
      description: Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ $value }}
  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of inodes (instance {{ $labels.instance }})
      description: Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}
  - alert: HostInodesWillFillIn24Hours
    expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
      description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}
  - alert: HostUnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk read latency (instance {{ $labels.instance }})
      description: Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}
  # Fires when the average time per completed write exceeds 0.1 s (100 ms)
  # for 2 minutes. The trailing `> 0` clause skips devices with no completed
  # writes in the window (avoids a division by zero producing NaN/Inf).
  # Devices whose name matches `mmcblk.+` are excluded from all three selectors.
  - alert: HostUnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk write latency (instance {{ $labels.instance }})
      description: Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}
  - alert: HostHighCpuLoad
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host high CPU load (instance {{ $labels.instance }})
      description: CPU load is > 80%\n  VALUE = {{ $value }}
  - alert: HostCpuStealNoisyNeighbor
    expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
      description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}
 #  TODO: Increase size of monitor instance
 #  # 1000 context switches is an arbitrary number.
 #  # Alert threshold depends on nature of application.
 #  # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
 #  - alert: HostContextSwitching
 #    expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
 #    for: 0m
 #    labels:
 #      severity: warning
 #    annotations:
 #      summary: Host context switching (instance {{ $labels.instance }})
 #      description: Context switching is growing on node (> 1000 / s)\n  VALUE = {{ $value }}
  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host swap is filling up (instance {{ $labels.instance }})
      description: Swap is filling up (>80%)\n  VALUE = {{ $value }}
  - alert: HostSystemdServiceCrashed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host SystemD service crashed (instance {{ $labels.instance }})
      description: SystemD service crashed\n  VALUE = {{ $value }}
  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 75
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host physical component too hot (instance {{ $labels.instance }})
      description: Physical hardware component too hot\n  VALUE = {{ $value }}
  - alert: HostNodeOvertemperatureAlarm
    expr: node_hwmon_temp_crit_alarm_celsius == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host node overtemperature alarm (instance {{ $labels.instance }})
      description: Physical node temperature alarm triggered\n  VALUE = {{ $value }}
  - alert: HostRaidArrayGotInactive
    expr: node_md_state{state="inactive"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host RAID array got inactive (instance {{ $labels.instance }})
      description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n  VALUE = {{ $value }}
  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host RAID disk failure (instance {{ $labels.instance }})
      description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}
 #  TODO: We have mix of Debian/Rocky/Alma systems
 #  - alert: HostKernelVersionDeviations
 #    expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
 #    for: 6h
 #    labels:
 #      severity: warning
 #    annotations:
 #      summary: Host kernel version deviations (instance {{ $labels.instance }})
 #      description: Different kernel versions are running\n  VALUE = {{ $value }}
  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host OOM kill detected (instance {{ $labels.instance }})
      description: OOM kill detected\n  VALUE = {{ $value }}
  # Fires immediately (for: 0m) when the correctable-error counter increased
  # during the last 1-minute window; the description quotes that same window.
  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
      description: Instance has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.\n  VALUE = {{ $value }}
  - alert: HostEdacUncorrectableErrorsDetected
    expr: node_edac_uncorrectable_errors_total > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
      description: Instance has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}
  - alert: HostNetworkReceiveErrors
    expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }})
      description: Instance interface has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.\n  VALUE = {{ $value }}
  - alert: HostNetworkTransmitErrors
    expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }})
      description: Instance has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.\n  VALUE = {{ $value }}
  # Fires when receive + transmit throughput exceeds 80% of the reported
  # interface speed for 1 minute. Devices matching `^tap.*` are excluded.
  # The series are selected by their `device` label, so the summary must
  # reference `$labels.device` to name the affected interface.
  - alert: HostNetworkInterfaceSaturated
    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.device }})
      description: The network interface is getting overloaded.\n  VALUE = {{ $value }}
  # Fires when the number of tracked connections stays above 80% of the
  # conntrack table limit for 5 minutes.
  - alert: HostConntrackLimit
    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host conntrack limit (instance {{ $labels.instance }})
      description: The number of conntrack entries is approaching the limit\n  VALUE = {{ $value }}
  - alert: HostClockSkew
    expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock skew (instance {{ $labels.instance }})
      description: Clock skew detected. Clock is out of sync.\n  VALUE = {{ $value }}
  - alert: HostClockNotSynchronising
    expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock not synchronising (instance {{ $labels.instance }})
      description: Clock not synchronising.\n  VALUE = {{ $value }}
--- a/roles/podman_prometheus/handlers/main.yml
+++ b/roles/podman_prometheus/handlers/main.yml
@ -1,20 +1,4 @@
 ---
 - name: Restart Alertmanager
  # Restarts the rootless (user-scope) systemd unit of the Alertmanager
  # container. The unit name must be `alertmanager`, matching the
  # alertmanager.container quadlet; it previously pointed at `grafana`,
  # so Alertmanager config changes never restarted the right service.
  ansible.builtin.systemd_service:
    name: alertmanager
    scope: user
    state: restarted
  become: true
  become_user: "{{ podman_prometheus_podman_rootless_user }}"
 - name: Restart Grafana
  ansible.builtin.systemd_service:
    name: grafana
    scope: user
    state: restarted
  become: true
  become_user: "{{ podman_prometheus_podman_rootless_user }}"
 - name: Restart Prometheus
  ansible.builtin.systemd_service:
    name: prometheus
@ -22,12 +6,3 @@
    state: restarted
  become: true
  become_user: "{{ podman_prometheus_podman_rootless_user }}"
 - name: Restart nginx
  ansible.builtin.systemd_service:
    name: nginx
    state: restarted
    scope: user
    daemon_reload: true
  become: true
  become_user: "{{ podman_prometheus_podman_rootless_user }}"
--- a/roles/podman_prometheus/tasks/main.yml
+++ b/roles/podman_prometheus/tasks/main.yml
@ -55,52 +55,15 @@
 # Prometheus runs with UID/GID 65534 inside the container
 - name: Podman Prometheus | PATCH | Install Prometheus configuration
  ansible.builtin.template:
-    src: "home/podman/{{ item }}"
+    src: home/podman/prometheus.yml
-    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/{{ item }}"
+    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/prometheus.yml"
    mode: "0400"
    owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}"
    group: "{{ _podman_prometheus_user_subgid_start + 65533 }}"
  become: true
  with_items:
    - prometheus.yml
  notify:
    - Restart Prometheus
 - name: Podman Prometheus | PATCH | Install Prometheus alert rules
  ansible.builtin.copy:
    src: "home/podman/{{ item }}"
    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/{{ item }}"
    mode: "0400"
    owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}"
    group: "{{ _podman_prometheus_user_subgid_start + 65533 }}"
  become: true
  with_items:
    - alert.rules.yml
  notify:
    - Restart Prometheus
 # Alertmanager runs with UID/GID 65534 inside the container
 - name: Podman Prometheus | PATCH | Install Alertmanager configuration
  ansible.builtin.template:
    src: home/podman/alertmanager.yml
    dest: "/home/{{ podman_prometheus_podman_rootless_user }}/alertmanager.yml"
    mode: "0400"
    owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}"
    group: "{{ _podman_prometheus_user_subgid_start + 65533 }}"
  become: true
  notify:
    - Restart Alertmanager
 # Grafana runs with UID/GID 472 inside the container
 - name: Podman Prometheus | PATCH | Create data directory for Grafana
  ansible.builtin.file:
    path: "/home/{{ podman_prometheus_podman_rootless_user }}/grafana-data"
    owner: "{{ _podman_prometheus_user_subuid_start + 471 }}"
    group: "{{ _podman_prometheus_user_subgid_start + 471 }}"
    mode: "0700"
    state: "directory"
  become: true
 - name: Podman Prometheus | PATCH | Install container quadlets
  ansible.builtin.template:
    src: "home/podman/config/containers/systemd/{{ item }}"
@ -108,12 +71,9 @@
    owner: "{{ podman_prometheus_podman_rootless_user }}"
    mode: "0400"
  with_items:
    - alertmanager.container
    - grafana.container
    - prometheus.container
  become: true
  notify:
    - Restart Grafana
    - Restart Prometheus
 - name: Podman Prometheus | PATCH | Install network quadlets
@ -124,11 +84,8 @@
    mode: "0400"
  with_items:
    - frontend.network
    - monitor.network
  become: true
  notify:
    - Restart Alertmanager
    - Restart Grafana
    - Restart Prometheus
    - Restart nginx
@ -165,7 +122,7 @@
  notify:
    - Restart nginx
- name: Podman Prometheus | PATCH | Make sure Prometheus, Grafana and Nginx are running now and started on boot
+- name: Podman Prometheus | PATCH | Make sure Prometheus and Nginx are running now and started on boot
  ansible.builtin.systemd_service:
    name: "{{ item }}.service"
    enabled: true
@ -174,8 +131,6 @@
    daemon_reload: true
    scope: user
  with_items:
    - alertmanager
    - grafana
    - nginx
    - prometheus
  become: true
--- a/roles/podman_prometheus/templates/home/podman/alertmanager.yml
+++ b/roles/podman_prometheus/templates/home/podman/alertmanager.yml
@ -1 +0,0 @@
 {{ podman_prometheus_alertmanager_config | to_nice_yaml( width=50, explicit_start=True, explicit_end=True) }}
--- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/alertmanager.container
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/alertmanager.container
@ -1,11 +0,0 @@
 [Container]
 ContainerName=alertmanager
 Image=quay.io/prometheus/alertmanager:v0.31.1
 Network=monitor.network
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro,Z
 [Service]
 Restart=on-failure
 [Install]
 WantedBy=default.target
--- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/grafana.container
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/grafana.container
@ -1,14 +0,0 @@
 [Container]
 ContainerName=grafana
 Image=docker.io/grafana/grafana
 Environment=GF_SERVER_DOMAIN={{ inventory_hostname }}
 Environment=GF_SERVER_ROOT_URL=https://%%(domain)s/
 Network=frontend.network
 Network=monitor.network
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/grafana-data:/var/lib/grafana:rw,Z
 [Service]
 Restart=on-failure
 [Install]
 WantedBy=default.target
--- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/monitor.network
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/monitor.network
@ -1,2 +0,0 @@
 [Network]
 NetworkName=network
--- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container
+++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container
@ -1,11 +1,10 @@
 [Container]
 ContainerName=prometheus
-Image=quay.io/prometheus/prometheus:v3.9.1
+Image=quay.io/prometheus/prometheus:v3.8.1
-Network=monitor.network
+Network=frontend.network
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/prometheus-data:/prometheus:rw,Z
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/file-configs:/file-configs:ro,Z
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/prometheus.yml:/etc/prometheus/prometheus.yml:ro,Z
 Volume=/home/{{ podman_prometheus_podman_rootless_user }}/alert.rules.yml:/etc/alertmanager/alert.rules.yml:ro,Z
 [Service]
 Restart=on-failure
--- a/roles/podman_prometheus/templates/home/podman/nginx.conf
+++ b/roles/podman_prometheus/templates/home/podman/nginx.conf
@ -21,9 +21,9 @@ server {
    }
 }
-upstream grafana {
+upstream prometheus {
-    zone grafana_upstream 64k;
+    zone prometheus_upstream 64k;
-	server grafana:3000 resolve;
+	server prometheus:9090 resolve;
 }
 server {
@ -37,7 +37,7 @@ server {
    ssl_certificate_key /etc/letsencrypt/live/{{ inventory_hostname }}/privkey.pem;
 	add_header Strict-Transport-Security "max-age=31536000" always;
-	add_header Referrer-Policy origin always;  # make sure outgoing links don't show the URL
+	add_header Referrer-Policy origin always;  # make sure outgoing links don't show the URL to the Prometheus instance
    add_header X-Content-Type-Options "nosniff" always;
    add_header X-XSS-Protection "1; mode=block" always;
@ -47,6 +47,6 @@ server {
    proxy_set_header X-Forwarded-Proto $scheme;
    proxy_read_timeout 180;
-    proxy_pass http://grafana;
+    proxy_pass http://prometheus;
  }
 }
--- a/roles/podman_prometheus/templates/home/podman/prometheus.yml
+++ b/roles/podman_prometheus/templates/home/podman/prometheus.yml
@ -8,9 +8,6 @@ scrape_configs:
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager:9093']
  - job_name: 'node'
    scrape_interval: 5s
    scheme: https
@ -28,22 +25,9 @@ scrape_configs:
 {% for host in groups['keycloak'] %}
        - '{{ host }}:9100'
 {% endfor %}
 {% for host in groups['radius'] %}
        - '{{ host }}:9100'
 {% endfor %}
 {% for host in groups['generic'] %}
        - '{{ host }}:9100'
 {% endfor %}
    file_sd_configs:
      - files:
          - "/file-configs/*.yml"
 alerting:
  alertmanagers:
  - scheme: http
    static_configs:
    - targets:
      - "alertmanager:9093"
 rule_files:
  - "/etc/alertmanager/alert.rules.yml"
--- a/roles/radius/defaults/main.yml
+++ b/roles/radius/defaults/main.yml
@ -1,6 +0,0 @@
 ---
# Default variables for the radius role.
# Required, no default: the operator/realm domain, read by
# templates/etc/raddb/sites-available/default as `radius_domain`.
# radius_domain:
# Address or CIDR of the wireless access points allowed as RADIUS clients
# (rendered into clients.conf). NOTE(review): 0.0.0.0/0 accepts any source
# address - presumably meant to be narrowed per site; confirm.
 radius_wap_ipaddr: 0.0.0.0/0
# Required, no default: shared secret for the access-point client entry.
# The templates also read radius_roaming0_secret, radius_roaming1_secret and
# radius_roaming2_secret, which have no defaults here either.
# radius_wap_secret:
# VLAN rendered as eduroam_default_local_vlan in sites-available/default.
 radius_local_vlan: 1
# VLAN rendered as eduroam_default_guest_vlan in sites-available/default.
 radius_guest_vlan: 3
--- a/roles/radius/handlers/main.yml
+++ b/roles/radius/handlers/main.yml
@ -1,6 +0,0 @@
 ---
 - name: Restart radiusd
  ansible.builtin.systemd_service:
    name: radiusd
    state: restarted
  become: true
--- a/roles/radius/tasks/certs.yml
+++ b/roles/radius/tasks/certs.yml
@ -1,82 +0,0 @@
 ---
 - name: "Radius Certificates | PATCH | Install latest certbot"
  ansible.builtin.dnf:
    name: certbot
    state: latest
    update_cache: true
  become: true
 - name: "Radius Certificates | AUDIT | Check for existing certificate expiry"
  community.crypto.x509_certificate_info:
    path: "/etc/letsencrypt/live/{{ inventory_hostname }}/cert.pem"
  register: radius_certs_existing_cert
  ignore_errors: true
  become: true
 - name: "Radius Certificates | AUDIT | Calculate days until expiry"
  ansible.builtin.set_fact:
    radius_certs_days_until_expiry: "{{ ((radius_certs_existing_cert.not_after | to_datetime('%Y%m%d%H%M%SZ')) - now()).days }}"
  when: radius_certs_existing_cert.not_after is defined
  become: true
 - name: "Radius Certificates | AUDIT | Print days until expiry"
  ansible.builtin.debug:
    msg: "{{ radius_certs_days_until_expiry }}"
  when: radius_certs_existing_cert.not_after is defined
  become: true
 - name: "Radius Certificates | PATCH | Request a new or renewed certificate"
  when: (radius_certs_existing_cert.failed) or (radius_certs_days_until_expiry | int < 30)
  become: true
  block:
    - name: "Radius Certificates | AUDIT | Check httpd"
      ansible.builtin.systemd_service:
        name: httpd
      register: radius_certs_httpd_status
    - name: "Radius Certificates | PATCH | Stop httpd"
      ansible.builtin.systemd_service:
        name: httpd
        state: stopped
      when: radius_certs_httpd_status.status.ActiveState == "active"
    - name: "Radius Certificates | PATCH | Add http service to firewall"
      ansible.posix.firewalld:
        service: http
        state: enabled
    - name: "Radius Certificates | PATCH | Request new certificate"
      ansible.builtin.command:
        cmd: certbot certonly --standalone --preferred-challenges http --agree-tos -n -d {{ inventory_hostname }} --register-unsafely-without-email
      when: radius_certs_existing_cert.failed
    - name: "Radius Certificates | PATCH | Renew existing certificate"
      ansible.builtin.command:
        cmd: certbot renew
      when: not radius_certs_existing_cert.failed
    - name: "Radius Certificates | PATCH | Remove http service from firewall"
      ansible.posix.firewalld:
        service: http
        state: disabled
    - name: "Radius Certificates | PATCH | Start httpd"
      ansible.builtin.systemd_service:
        name: httpd
        state: started
      when: radius_certs_httpd_status.status.ActiveState == "active"
 - name: Radius | PATCH | Allow radiusd access to certificates
  ansible.builtin.copy:
    src: /etc/letsencrypt/live/{{ inventory_hostname }}/{{ item }}.pem
    dest: /etc/raddb/{{ item }}.pem
    remote_src: true
    owner: radiusd
    group: radiusd
    mode: "0640"
  become: true
  notify: Restart radiusd
  with_items:
    - privkey
    - cert
    - chain
--- a/roles/radius/tasks/main.yml
+++ b/roles/radius/tasks/main.yml
@ -1,40 +0,0 @@
 ---
 - name: Radius | PATCH | Obtain or freshen certificates
  ansible.builtin.include_tasks:
    file: certs.yml
 - name: Radius | PATCH | Install required packages
  ansible.builtin.dnf:
    name: freeradius
    state: present
  become: true
 - name: Radius | PATCH | Install FreeRADIUS configuration files
  # Renders each listed template from templates/etc/raddb/ to the same
  # relative path under /etc/raddb and restarts radiusd on change.
  # The mode is quoted so it is always passed as the octal string "0640",
  # regardless of how the YAML loader types bare numbers, and to match the
  # quoted modes used by the other tasks in this role.
  ansible.builtin.template:
    src: etc/raddb/{{ item }}
    dest: /etc/raddb/{{ item }}
    owner: root
    group: radiusd
    mode: "0640"
  become: true
  with_items:
    - mods-available/eap
    - mods-available/linelog
    - sites-available/default
    - mods-available/inner-eap
    - sites-available/inner-tunnel
    - clients.conf
    - proxy.conf
  notify:
    - Restart radiusd
 - name: Radius | PATCH | Install rsyslog configuration
  # Renders the rsyslog drop-in for radiusd logging.
  # The mode is quoted so it is always passed as the octal string "0644".
  # NOTE(review): the "Reload rsyslog" handler is not defined in this role's
  # handlers/main.yml (only "Restart radiusd" is) - presumably provided by
  # another role in the same play; confirm, otherwise the notify will fail.
  ansible.builtin.template:
    src: etc/rsyslog.d/radiusd.conf
    dest: /etc/rsyslog.d/radiusd.conf
    owner: root
    group: root
    mode: "0644"
  become: true
  notify:
    - Reload rsyslog
--- a/roles/radius/templates/etc/raddb/clients.conf
+++ b/roles/radius/templates/etc/raddb/clients.conf
@ -1,22 +0,0 @@
 client eduroam_roaming0 {
    ipaddr = roaming0.ja.net
    secret = {{ radius_roaming0_secret }}
    nastype = 'eduroam_flr'
 }
 client eduroam_roaming1 {
    ipaddr = roaming1.ja.net
    secret = {{ radius_roaming1_secret }}
    nastype = 'eduroam_flr'
 }
 client eduroam_roaming2 {
    ipaddr = roaming2.ja.net
    secret = {{ radius_roaming2_secret }}
    nastype = 'eduroam_flr'
 }
 client wireless_access_points_mgmt {
 	ipaddr = {{ radius_wap_ipaddr }}
 	secret = {{ radius_wap_secret }}
 }
--- a/roles/radius/templates/etc/raddb/mods-available/eap
+++ b/roles/radius/templates/etc/raddb/mods-available/eap
@ -1,52 +0,0 @@
 eap {
 	# The initial EAP type requested.  Change this to peap if you're
 	# using peap, or tls if you're using EAP-TLS.
 	default_eap_type = ttls
 	# The maximum time an EAP-Session can continue for
 	timer_expire = 60
 	# The maximum number of ongoing EAP sessions
 	max_sessions = ${max_requests}
 	tls-config tls-common {
 		# The public certificate that your server will present
 		certificate_file = /etc/raddb/cert.pem
 		# The private key for the public certificate
 		private_key_file = /etc/raddb/privkey.pem
 		# The password to decrypt 'private_key_file'
 		#private_key_password = ""
 		# The certificate of the authority that issued 'certificate_file'
 		ca_file = /etc/raddb/chain.pem
 		# If your AP drops packets towards the client, try reducing this.
 		fragment_size = 1024
 		# When issuing client certificates embed the OCSP URL in the
 		# certificate if you want to be able to revoke them later.
 		ocsp {
 			enable = yes
 			override_cert_url = no
 			use_nonce = yes
 		}
 	}
 	tls {
 		tls = tls-common
 	}
 	ttls {
 		tls = tls-common
 		default_eap_type = mschapv2
 		virtual_server = "eduroam-inner"
 	}
 	peap {
 		tls = tls-common
 		default_eap_type = mschapv2
 		virtual_server = "eduroam-inner"
 	}
 }
--- a/roles/radius/templates/etc/raddb/mods-available/inner-eap
+++ b/roles/radius/templates/etc/raddb/mods-available/inner-eap
@ -1,9 +0,0 @@
 eap inner-eap {
 	default_eap_type = mschapv2
 	timer_expire = 60
 	max_sessions = ${max_requests}
 	mschapv2 {
 		send_error = yes
 	}
 }
--- a/roles/radius/templates/etc/raddb/mods-available/linelog
+++ b/roles/radius/templates/etc/raddb/mods-available/linelog
@ -1,39 +0,0 @@
 linelog linelog_recv_request {
 	filename = syslog
 	syslog_facility = local0
 	syslog_severity = debug
 	format = "action = Recv-Request, %{pairs:request:}"
 }
 linelog linelog_send_accept {
 	filename = syslog
 	syslog_facility = local0
 	syslog_severity = debug
 	format = "action = Send-Accept, %{pairs:request:}"
 }
 linelog linelog_send_reject {
 	filename = syslog
 	syslog_facility = local0
 	syslog_severity = debug
 	format = "action = Send-Reject, %{pairs:request:}"
 }
 linelog linelog_send_proxy_request {
 	filename = syslog
 	syslog_facility = local0
 	syslog_severity = debug
 	format = "action = Send-Proxy-Request, %{pairs:proxy-request:}"
 }
 linelog linelog_recv_proxy_response {
 	filename = syslog
 	syslog_facility = local0
 	syslog_severity = debug
 	reference = "messages.%{proxy-reply:Response-Packet-Type}"
 	messages {
 		Access-Accept = "action = Recv-Proxy-Accept, User-Name = %{User-Name}, Calling-Station-Id = %{Calling-Station-Id}, %{pairs:proxy-reply:}"
 		Access-Reject = "action = Recv-Proxy-Reject, User-Name = %{User-Name}, Calling-Station-Id = %{Calling-Station-Id}, %{pairs:proxy-reply:}"
 		Access-Challenge = "action = Recv-Proxy-Challenge, User-Name = %{User-Name}, Calling-Station-ID = %{Calling-Station-Id}, %{pairs:proxy-reply:}"
 	}
 }
--- a/roles/radius/templates/etc/raddb/proxy.conf
+++ b/roles/radius/templates/etc/raddb/proxy.conf
@ -1,38 +0,0 @@
 home_server eduroam_roaming0 {
    ipaddr = roaming0.ja.net
    secret = {{ radius_roaming0_secret }}
    status_check = status-server
    response_window = 5
    check_interval = 10
    check_timeout = 5
 }
 home_server eduroam_roaming1 {
    ipaddr = roaming1.ja.net
    secret = {{ radius_roaming1_secret }}
    status_check = status-server
    response_window = 5
    check_interval = 10
    check_timeout = 5
 }
 home_server eduroam_roaming2 {
    ipaddr = roaming2.ja.net
    secret = {{ radius_roaming2_secret }}
    status_check = status-server
    response_window = 5
    check_interval = 10
    check_timeout = 5
 }
 home_server_pool eduroam_flr_pool {
    type = keyed-balance
    home_server = eduroam_roaming0
    home_server = eduroam_roaming1
    home_server = eduroam_roaming2
 }
 realm eduroam_flr {
    auth_pool = eduroam_flr_pool
    nostrip
 }
--- a/roles/radius/templates/etc/raddb/sites-available/default
+++ b/roles/radius/templates/etc/raddb/sites-available/default
@ -1,112 +0,0 @@
 # The domain users will add to their username to have their credentials
 # routed to your institution.  You will also need to register this
 # and your RADIUS server addresses with your NRO (National Roaming
 # Operator).  Referenced below as ${operator_name}.
 operator_name = "{{ radius_domain }}"
 # The VLAN to assign eduroam visitors, i.e. users whose requests were
 # proxied.  Referenced below as ${eduroam_default_guest_vlan}.
 eduroam_default_guest_vlan = "{{ radius_guest_vlan }}"
 # The VLAN to assign your students/staff, i.e. users authenticated
 # locally.  Referenced below as ${eduroam_default_local_vlan}.
 eduroam_default_local_vlan = "{{ radius_local_vlan }}"
 # Outer eduroam virtual server: accepts authentication requests on port
 # 1812, proxies users from foreign domains to the eduroam_flr realm and
 # handles users of the local domain with EAP.
 server eduroam {
 	listen {
 		type = auth
 		# NOTE(review): a single listen section may honour only one address
 		# directive, in which case ipv6addr is ignored here - confirm, and
 		# use a second listen section if IPv6 is required.
 		ipv4addr = *
 		ipv6addr = *
 		port = 1812
 	}
 	authorize {
 		# Log requests before we change them
 		linelog_recv_request
 		# split_username_nai is a policy in the default distribution to
 		# split a username into username and domain.  We reject user-name
 		# strings without domains, as they're not routable.
 		split_username_nai
 		if (noop || !&Stripped-User-Domain) {
 			reject
 		}
 		# Send the request to the NRO for your region.
 		# The details of the FLRs (Federation Level RADIUS servers)
 		# are in proxy.conf.
 		# You can make this condition as complex as you like, to
 		# include additional subdomains just concatenate the conditions
 		# with &&.
 		if (&Stripped-User-Domain != "${operator_name}") {
 			update {
 				# Key used by the keyed-balance pool in proxy.conf to
 				# choose a home server.
 				control:Load-Balance-Key := &Calling-Station-ID
 				control:Proxy-To-Realm := 'eduroam_flr'
 				# Operator name (RFC 5580) identifies the network the
 				# request originated from. It's not absolutely necessary
 				# but it helps with debugging.
 				request:Operator-Name := "1${operator_name}"
 			}
 			return
 		}
 		# If the EAP module returns 'ok' or 'updated', it means it has handled
 		# the request and we don't need to call any other modules in this
 		# section.
 		eap {
 			ok = return
 			updated = return
 		}
 	}
 	pre-proxy {
 		attr_filter.pre-proxy
 		# Log the request as it will be sent to the home server.
 		linelog_send_proxy_request
 	}
 	post-proxy {
 		attr_filter.post-proxy
 		# Log the response received from the home server.
 		linelog_recv_proxy_response
 	}
 	authenticate {
 		eap
 	}
 	post-auth {
 		# To implement eduroam you must:
 		# - Use wireless access points or a controller which supports
 		#   dynamic VLAN assignments.
 		# - Have that feature enabled.
 		# - Have the guest_vlan/local_vlan available to the controller,
 		#   or to all your access points.
 		# eduroam user traffic *MUST* be segregated, this is *NOT* optional.
 		update reply {
 			Tunnel-Type := VLAN
 			Tunnel-Medium-Type := IEEE-802
 		}
 		# Proxied (visiting) users get the guest VLAN, local users the
 		# local VLAN.
 		if (&control:Proxy-To-Realm) {
 			update reply {
 				Tunnel-Private-Group-ID = ${eduroam_default_guest_vlan}
 			}
 		}
 		else {
 			update reply {
 				Tunnel-Private-Group-ID = ${eduroam_default_local_vlan}
 			}
 		}
 		# We're sending a response to one of OUR network devices for one of
 		# OUR users so provide it with the real user-identity.
 		if (&session-state:Stripped-User-Name) {
 			update reply {
 				User-Name := "%{session-state:Stripped-User-Name}@%{Stripped-User-Domain}"
 			}
 		}
 		linelog_send_accept
 		Post-Auth-Type REJECT {
 			attr_filter.access_reject
 			linelog_send_reject
 		}
 	}
 }
--- a/roles/radius/templates/etc/raddb/sites-available/inner-tunnel
+++ b/roles/radius/templates/etc/raddb/sites-available/inner-tunnel
@ -1,73 +0,0 @@
 # Inner-tunnel virtual server: authenticates the identity carried inside
 # the EAP tunnel and passes the real user name back to the outer server
 # through outer.session-state.
 server eduroam-inner {
 	listen {
 		type = auth
 		# NOTE(review): this test listener is bound to all addresses, and a
 		# single listen section may honour only one address directive -
 		# confirm whether binding to loopback only was intended.
 		ipaddr = *
 		ipv6addr = *
 		port = 18120 # Used for testing only.  Requests proxied internally.
 	}
 	authorize {
 		# The outer username is considered garbage for autz purposes, but
 		# the domain portion of the outer and inner identities must match.
 		split_username_nai
 		if (noop || (&Stripped-User-Domain && \
 		    (&outer.Stripped-User-Domain != &Stripped-User-Domain))) {
 			reject
 		}
 		# Make the user's real identity available to anything that needs
 		# it in the outer server.
 		if (&outer.session-state:) {
 			update {
 				&outer.session-state:Stripped-User-Name := &Stripped-User-Name
 			}
 		}
 		# EAP for PEAPv0 (EAP-MSCHAPv2)
 		inner-eap {
 			ok = return
 		}
 		# THIS IS SITE SPECIFIC
 		#
 		# The files module is *ONLY* used for testing.  It lets you define
 		# credentials in a flat file, IT WILL NOT SCALE.
 		#
 		# - If you use OpenLDAP with salted password hashes you should
 		#   call the 'ldap' module here and use EAP-TTLS-PAP as your EAP method.
 		# - If you use OpenLDAP with cleartext passwords you should
 		#   call the 'ldap' module here and use EAP-TTLS or PEAPv0.
 		# - If you use an SQL DB with salted password hashes you should call
 		#   the 'sql' module here and use EAP-TTLS-PAP as your EAP method.
 		# - If you use an SQL DB with cleartext passwords you should call
 		#   the 'sql' module here and use EAP-TTLS or PEAPv0.
 		# - If you use Novell you should call the 'ldap' module here and
 		#   set ``edir = yes`` in ``mods-available/ldap`` and use EAP-TTLS or
 		#   PEAPv0.
 		# - If you use Active Directory, you don't need anything here (remove
 		#   the call to files) but you'll need to follow this
 		#   [guide](freeradius-active-directory-integration-howto) and use
 		#   EAP-TTLS-PAP or PEAPv0.
 		# - If you're using EAP-TLS (i'm impressed!) remove the call to files.
 		#
 		# EAP-TTLS-PAP and PEAPv0 are equally secure/insecure depending on how the
 		# supplicant is configured. PEAPv0 has a slight edge in that you need to
 		# crack MSCHAPv2 to get the user's password (but this is not hard).
 		files
 		pap
 		mschap
 	}
 	authenticate {
 		inner-eap
 		mschap
 		pap
 		# Comment pap above and uncomment the stanza below if you're using
 		# Active Directory; this will allow it to work with EAP-TTLS/PAP.
 		#Auth-Type pap {
 		#	ntlm_auth
 		#}
 	}
 }
--- a/roles/radius/templates/etc/rsyslog.d/radiusd.conf
+++ b/roles/radius/templates/etc/rsyslog.d/radiusd.conf
@ -1 +0,0 @@
 # Write syslog facility local0 (used by the RADIUS linelog modules) at
 # severity debug and above to a dedicated authentication log file.
 local0.debug					/var/log/radius_auth.log
@ -1 +0,0 @@
 {{ podman_prometheus_alertmanager_config | to_nice_yaml( width=50, explicit_start=True, explicit_end=True) }}