diff --git a/playbooks/services.yml b/playbooks/services.yml index 22dabd7..c06a62f 100644 --- a/playbooks/services.yml +++ b/playbooks/services.yml @@ -23,6 +23,8 @@ rhel9cis_rule_5_4_3_3: false roles: - role: sr2c.core.baseline + baseline_epel_packages_allowed: + - node-exporter tags: bootstrap - role: sr2c.core.freeipa become: true @@ -75,31 +77,3 @@ tags: bootstrap - role: sr2c.core.node_exporter tags: prometheus - -- name: Deploy and update Radius server - hosts: - - radius - roles: - - role: sr2c.core.baseline - vars: - baseline_epel_packages_allowed: - - certbot - - python3-certbot - - python3-pyrfc3339 - - python3-parsedatetime - - python3-josepy - - python3-importlib-metadata - - python3-configargparse - - python3-acme - - python3-zipp - - python3-pyOpenSSL - - node-exporter - tags: bootstrap - - role: freeipa.ansible_freeipa.ipaclient - become: true - state: present - tags: bootstrap - - role: sr2c.core.node_exporter - tags: prometheus - - role: sr2c.core.radius - tags: radius diff --git a/roles/podman_prometheus/files/home/podman/alert.rules.yml b/roles/podman_prometheus/files/home/podman/alert.rules.yml deleted file mode 100644 index 381af39..0000000 --- a/roles/podman_prometheus/files/home/podman/alert.rules.yml +++ /dev/null @@ -1,302 +0,0 @@ -groups: -- name: node_exporter_alerts - rules: - - alert: Node down - expr: up == 0 - for: 2m - labels: - severity: warning - annotations: - title: Node {{ $labels.instance }} is down - description: Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down. - - - alert: HostOutOfMemory - expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 - for: 2m - labels: - severity: warning - annotations: - summary: Host out of memory (instance {{ $labels.instance }}) - description: Node memory is filling up (< 10% left)\n VALUE = {{ $value }} - - - alert: HostMemoryUnderMemoryPressure - expr: rate(node_vmstat_pgmajfault[1m]) > 1000 - for: 2m - labels: - severity: warning - annotations: - summary: Host memory under memory pressure (instance {{ $labels.instance }}) - description: The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }} - - - alert: HostUnusualNetworkThroughputIn - expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100 - for: 5m - labels: - severity: warning - annotations: - summary: Host unusual network throughput in (instance {{ $labels.instance }}) - description: Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }} - - - alert: HostUnusualNetworkThroughputOut - expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100 - for: 5m - labels: - severity: warning - annotations: - summary: Host unusual network throughput out (instance {{ $labels.instance }}) - description: Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }} - - - alert: HostUnusualDiskReadRate - expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50 - for: 5m - labels: - severity: warning - annotations: - summary: Host unusual disk read rate (instance {{ $labels.instance }}) - description: Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }} - - # TODO: Debug and reduce limit to 50 - - alert: HostUnusualDiskWriteRate - expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 65 - for: 2m - labels: - severity: warning - annotations: - summary: Host unusual disk write rate (instance {{ $labels.instance }}) - description: Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }} - - # Please add ignored mountpoints in node_exporter parameters like - # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". - # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. - - alert: HostOutOfDiskSpace - expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host out of disk space (instance {{ $labels.instance }}) - description: Disk is almost full (< 10% left)\n VALUE = {{ $value }} - - # Please add ignored mountpoints in node_exporter parameters like - # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". - # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. - - alert: HostDiskWillFillIn24Hours - expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) - description: Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }} - - - alert: HostOutOfInodes - expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host out of inodes (instance {{ $labels.instance }}) - description: Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }} - - - alert: HostInodesWillFillIn24Hours - expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) - description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }} - - - alert: HostUnusualDiskReadLatency - expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host unusual disk read latency (instance {{ $labels.instance }}) - description: Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }} - - - alert: HostUnusualDiskWriteLatency - expr: rate(node_disk_write_time_seconds_totali{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host unusual disk write latency (instance {{ $labels.instance }}) - description: Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }} - - - alert: HostHighCpuLoad - expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80 - for: 0m - labels: - severity: warning - annotations: - summary: Host high CPU load (instance {{ $labels.instance }}) - description: CPU load is > 80%\n VALUE = {{ $value }} - - - alert: HostCpuStealNoisyNeighbor - expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 - for: 0m - labels: - severity: warning - annotations: - summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) - description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }} - -# TODO: Increase size of monitor instance -# # 1000 context switches is an arbitrary number. -# # Alert threshold depends on nature of application. -# # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 -# - alert: HostContextSwitching -# expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000 -# for: 0m -# labels: -# severity: warning -# annotations: -# summary: Host context switching (instance {{ $labels.instance }}) -# description: Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }} - - - alert: HostSwapIsFillingUp - expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80 - for: 2m - labels: - severity: warning - annotations: - summary: Host swap is filling up (instance {{ $labels.instance }}) - description: Swap is filling up (>80%)\n VALUE = {{ $value }} - - - alert: HostSystemdServiceCrashed - expr: node_systemd_unit_state{state="failed"} == 1 - for: 0m - labels: - severity: warning - annotations: - summary: Host SystemD service crashed (instance {{ $labels.instance }}) - description: SystemD service crashed\n VALUE = {{ $value }} - - - alert: HostPhysicalComponentTooHot - expr: node_hwmon_temp_celsius > 75 - for: 5m - labels: - severity: warning - annotations: - summary: Host physical component too hot (instance {{ $labels.instance }}) - description: Physical hardware component too hot\n VALUE = {{ $value }} - - - alert: HostNodeOvertemperatureAlarm - expr: node_hwmon_temp_crit_alarm_celsius == 1 - for: 0m - labels: - severity: critical - annotations: - summary: Host node overtemperature alarm (instance {{ $labels.instance }}) - description: Physical node temperature alarm triggered\n VALUE = {{ $value }} - - - alert: HostRaidArrayGotInactive - expr: node_md_state{state="inactive"} > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Host RAID array got inactive (instance {{ $labels.instance }}) - description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }} - - - alert: HostRaidDiskFailure - expr: node_md_disks{state="failed"} > 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host RAID disk failure (instance {{ $labels.instance }}) - description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }} - -# TODO: We have mix of Debian/Rocky/Alma systems -# - alert: HostKernelVersionDeviations -# expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1 -# for: 6h -# labels: -# severity: warning -# annotations: -# summary: Host kernel version deviations (instance {{ $labels.instance }}) -# description: Different kernel versions are running\n VALUE = {{ $value }} - - - alert: HostOomKillDetected - expr: increase(node_vmstat_oom_kill[1m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Host OOM kill detected (instance {{ $labels.instance }}) - description: OOM kill detected\n VALUE = {{ $value }} - - - alert: HostEdacCorrectableErrorsDetected - expr: increase(node_edac_correctable_errors_total[1m]) > 0 - for: 0m - labels: - severity: info - annotations: - summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) - description: Instance has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }} - - - alert: HostEdacUncorrectableErrorsDetected - expr: node_edac_uncorrectable_errors_total > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) - description: Instance has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }} - - - alert: HostNetworkReceiveErrors - expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 - for: 2m - labels: - severity: warning - annotations: - summary: Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }}) - description: Instance interface has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.\n VALUE = {{ $value }} - - - alert: HostNetworkTransmitErrors - expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 - for: 2m - labels: - severity: warning - annotations: - summary: Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }}) - description: Instance has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }} - - - alert: HostNetworkInterfaceSaturated - expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 - for: 1m - labels: - severity: warning - annotations: - summary: Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.interface }}) - description: The network interface is getting overloaded.\n VALUE = {{ $value }} - - - alert: HostConntrackLimit - expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 - for: 5m - labels: - severity: warning - annotations: - summary: Host conntrack limit (instance {{ $labels.instance }}) - description: The number of conntrack is approching limit\n VALUE = {{ $value }} - - - alert: HostClockSkew - expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) - for: 2m - labels: - severity: warning - annotations: - summary: Host clock skew (instance {{ $labels.instance }}) - description: Clock skew detected. Clock is out of sync.\n VALUE = {{ $value }} - - - alert: HostClockNotSynchronising - expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16 - for: 2m - labels: - severity: warning - annotations: - summary: Host clock not synchronising (instance {{ $labels.instance }}) - description: Clock not synchronising.\n VALUE = {{ $value }} diff --git a/roles/podman_prometheus/handlers/main.yml b/roles/podman_prometheus/handlers/main.yml index 91c6ade..7165847 100644 --- a/roles/podman_prometheus/handlers/main.yml +++ b/roles/podman_prometheus/handlers/main.yml @@ -1,20 +1,4 @@ --- -- name: Restart Alertmanager - ansible.builtin.systemd_service: - name: grafana - scope: user - state: restarted - become: true - become_user: "{{ podman_prometheus_podman_rootless_user }}" - -- name: Restart Grafana - ansible.builtin.systemd_service: - name: grafana - scope: user - state: restarted - become: true - become_user: "{{ podman_prometheus_podman_rootless_user }}" - - name: Restart Prometheus ansible.builtin.systemd_service: name: prometheus @@ -22,12 +6,3 @@ state: restarted become: true become_user: "{{ podman_prometheus_podman_rootless_user }}" - -- name: Restart nginx - ansible.builtin.systemd_service: - name: nginx - state: restarted - scope: user - daemon_reload: true - become: true - become_user: "{{ podman_prometheus_podman_rootless_user }}" diff --git a/roles/podman_prometheus/tasks/main.yml b/roles/podman_prometheus/tasks/main.yml index d3c60e1..962b535 100644 --- a/roles/podman_prometheus/tasks/main.yml +++ b/roles/podman_prometheus/tasks/main.yml @@ -55,52 +55,15 @@ # Prometheus runs with UID/GID 65534 inside the container - name: Podman Prometheus | PATCH | Install Prometheus configuration ansible.builtin.template: - src: "home/podman/{{ item }}" - dest: "/home/{{ podman_prometheus_podman_rootless_user }}/{{ item }}" + src: home/podman/prometheus.yml + dest: "/home/{{ podman_prometheus_podman_rootless_user }}/prometheus.yml" mode: "0400" owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}" group: "{{ _podman_prometheus_user_subgid_start + 65533 }}" become: true - with_items: - - prometheus.yml notify: - Restart Prometheus -- name: Podman Prometheus | PATCH | Install Prometheus alert rules - ansible.builtin.copy: - src: "home/podman/{{ item }}" - dest: "/home/{{ podman_prometheus_podman_rootless_user }}/{{ item }}" - mode: "0400" - owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}" - group: "{{ _podman_prometheus_user_subgid_start + 65533 }}" - become: true - with_items: - - alert.rules.yml - notify: - - Restart Prometheus - -# Alertmanager runs with UID/GID 65534 inside the container -- name: Podman Prometheus | PATCH | Install Alertmanager configuration - ansible.builtin.template: - src: home/podman/alertmanager.yml - dest: "/home/{{ podman_prometheus_podman_rootless_user }}/alertmanager.yml" - mode: "0400" - owner: "{{ _podman_prometheus_user_subuid_start + 65533 }}" - group: "{{ _podman_prometheus_user_subgid_start + 65533 }}" - become: true - notify: - - Restart Alertmanager - -# Grafana runs with UID/GID 472 inside the container -- name: Podman Prometheus | PATCH | Create data directory for Grafana - ansible.builtin.file: - path: "/home/{{ podman_prometheus_podman_rootless_user }}/grafana-data" - owner: "{{ _podman_prometheus_user_subuid_start + 471 }}" - group: "{{ _podman_prometheus_user_subgid_start + 471 }}" - mode: "0700" - state: "directory" - become: true - - name: Podman Prometheus | PATCH | Install container quadlets ansible.builtin.template: src: "home/podman/config/containers/systemd/{{ item }}" @@ -108,12 +71,9 @@ owner: "{{ podman_prometheus_podman_rootless_user }}" mode: "0400" with_items: - - alertmanager.container - - grafana.container - prometheus.container become: true notify: - - Restart Grafana - Restart Prometheus - name: Podman Prometheus | PATCH | Install network quadlets @@ -124,11 +84,8 @@ mode: "0400" with_items: - frontend.network - - monitor.network become: true notify: - - Restart Alertmanager - - Restart Grafana - Restart Prometheus - Restart nginx @@ -165,7 +122,7 @@ notify: - Restart nginx -- name: Podman Prometheus | PATCH | Make sure Prometheus, Grafana and Nginx are running now and started on boot +- name: Podman Prometheus | PATCH | Make sure Prometheus and Nginx are running now and started on boot ansible.builtin.systemd_service: name: "{{ item }}.service" enabled: true @@ -174,8 +131,6 @@ daemon_reload: true scope: user with_items: - - alertmanager - - grafana - nginx - prometheus become: true diff --git a/roles/podman_prometheus/templates/home/podman/alertmanager.yml b/roles/podman_prometheus/templates/home/podman/alertmanager.yml deleted file mode 100644 index cd7aa1d..0000000 --- a/roles/podman_prometheus/templates/home/podman/alertmanager.yml +++ /dev/null @@ -1 +0,0 @@ -{{ podman_prometheus_alertmanager_config | to_nice_yaml( width=50, explicit_start=True, explicit_end=True) }} diff --git a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/alertmanager.container b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/alertmanager.container deleted file mode 100644 index d9fb543..0000000 --- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/alertmanager.container +++ /dev/null @@ -1,11 +0,0 @@ -[Container] -ContainerName=alertmanager -Image=quay.io/prometheus/alertmanager:v0.31.1 -Network=monitor.network -Volume=/home/{{ podman_prometheus_podman_rootless_user }}/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro,Z - -[Service] -Restart=on-failure - -[Install] -WantedBy=default.target diff --git a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/grafana.container b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/grafana.container deleted file mode 100644 index 3a83bfe..0000000 --- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/grafana.container +++ /dev/null @@ -1,14 +0,0 @@ -[Container] -ContainerName=grafana -Image=docker.io/grafana/grafana -Environment=GF_SERVER_DOMAIN={{ inventory_hostname }} -Environment=GF_SERVER_ROOT_URL=https://%%(domain)s/ -Network=frontend.network -Network=monitor.network -Volume=/home/{{ podman_prometheus_podman_rootless_user }}/grafana-data:/var/lib/grafana:rw,Z - -[Service] -Restart=on-failure - -[Install] -WantedBy=default.target diff --git a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/monitor.network b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/monitor.network deleted file mode 100644 index 4029eb4..0000000 --- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/monitor.network +++ /dev/null @@ -1,2 +0,0 @@ -[Network] -NetworkName=network diff --git a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container index f5873f1..b10b545 100644 --- a/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container +++ b/roles/podman_prometheus/templates/home/podman/config/containers/systemd/prometheus.container @@ -1,11 +1,10 @@ [Container] ContainerName=prometheus -Image=quay.io/prometheus/prometheus:v3.9.1 -Network=monitor.network +Image=quay.io/prometheus/prometheus:v3.8.1 +Network=frontend.network Volume=/home/{{ podman_prometheus_podman_rootless_user }}/prometheus-data:/prometheus:rw,Z Volume=/home/{{ podman_prometheus_podman_rootless_user }}/file-configs:/file-configs:ro,Z Volume=/home/{{ podman_prometheus_podman_rootless_user }}/prometheus.yml:/etc/prometheus/prometheus.yml:ro,Z -Volume=/home/{{ podman_prometheus_podman_rootless_user }}/alert.rules.yml:/etc/alertmanager/alert.rules.yml:ro,Z [Service] Restart=on-failure diff --git a/roles/podman_prometheus/templates/home/podman/nginx.conf b/roles/podman_prometheus/templates/home/podman/nginx.conf index 1d1c9ae..a1a168f 100644 --- a/roles/podman_prometheus/templates/home/podman/nginx.conf +++ b/roles/podman_prometheus/templates/home/podman/nginx.conf @@ -21,9 +21,9 @@ server { } } -upstream grafana { - zone grafana_upstream 64k; - server grafana:3000 resolve; +upstream prometheus { + zone prometheus_upstream 64k; + server prometheus:9090 resolve; } server { @@ -37,7 +37,7 @@ server { ssl_certificate_key /etc/letsencrypt/live/{{ inventory_hostname }}/privkey.pem; add_header Strict-Transport-Security "max-age=31536000" always; - add_header Referrer-Policy origin always; # make sure outgoing links don't show the URL + add_header Referrer-Policy origin always; # make sure outgoing links don't show the URL to the Prometheus instance add_header X-Content-Type-Options "nosniff" always; add_header X-XSS-Protection "1; mode=block" always; @@ -47,6 +47,6 @@ server { proxy_set_header X-Forwarded-Proto $scheme; proxy_read_timeout 180; - proxy_pass http://grafana; + proxy_pass http://prometheus; } } diff --git a/roles/podman_prometheus/templates/home/podman/prometheus.yml b/roles/podman_prometheus/templates/home/podman/prometheus.yml index 4870f78..422646d 100644 --- a/roles/podman_prometheus/templates/home/podman/prometheus.yml +++ b/roles/podman_prometheus/templates/home/podman/prometheus.yml @@ -8,9 +8,6 @@ scrape_configs: scrape_interval: 5s static_configs: - targets: ['localhost:9090'] - - job_name: 'alertmanager' - static_configs: - - targets: ['alertmanager:9093'] - job_name: 'node' scrape_interval: 5s scheme: https @@ -28,22 +25,9 @@ scrape_configs: {% for host in groups['keycloak'] %} - '{{ host }}:9100' {% endfor %} -{% for host in groups['radius'] %} - - '{{ host }}:9100' -{% endfor %} {% for host in groups['generic'] %} - '{{ host }}:9100' {% endfor %} file_sd_configs: - files: - "/file-configs/*.yml" - -alerting: - alertmanagers: - - scheme: http - static_configs: - - targets: - - "alertmanager:9093" - -rule_files: - - "/etc/alertmanager/alert.rules.yml" diff --git a/roles/radius/defaults/main.yml b/roles/radius/defaults/main.yml deleted file mode 100644 index a9a6870..0000000 --- a/roles/radius/defaults/main.yml +++ /dev/null @@ -1,6 +0,0 @@ ---- -#radius_domain_name: -radius_wap_ipaddr: 0.0.0.0/0 -#radius_wap_secret: -radius_local_vlan: 1 -radius_guest_vlan: 3 diff --git a/roles/radius/handlers/main.yml b/roles/radius/handlers/main.yml deleted file mode 100644 index 515c4e1..0000000 --- a/roles/radius/handlers/main.yml +++ /dev/null @@ -1,6 +0,0 @@ ---- -- name: Restart radiusd - ansible.builtin.systemd_service: - name: radiusd - state: restarted - become: true diff --git a/roles/radius/tasks/certs.yml b/roles/radius/tasks/certs.yml deleted file mode 100644 index bc8d26f..0000000 --- a/roles/radius/tasks/certs.yml +++ /dev/null @@ -1,82 +0,0 @@ ---- -- name: "Radius Certificates | PATCH | Install latest certbot" - ansible.builtin.dnf: - name: certbot - state: latest - update_cache: true - become: true - -- name: "Radius Certificates | AUDIT | Check for existing certificate expiry" - community.crypto.x509_certificate_info: - path: "/etc/letsencrypt/live/{{ inventory_hostname }}/cert.pem" - register: radius_certs_existing_cert - ignore_errors: true - become: true - -- name: "Radius Certificates | AUDIT | Calculate days until expiry" - ansible.builtin.set_fact: - radius_certs_days_until_expiry: "{{ ((radius_certs_existing_cert.not_after | to_datetime('%Y%m%d%H%M%SZ')) - now()).days }}" - when: radius_certs_existing_cert.not_after is defined - become: true - -- name: "Radius Certificates | AUDIT | Print days until expiry" - ansible.builtin.debug: - msg: "{{ radius_certs_days_until_expiry }}" - when: radius_certs_existing_cert.not_after is defined - become: true - -- name: "Radius Certificates | PATCH | Request a new or renewed certificate" - when: (radius_certs_existing_cert.failed) or (radius_certs_days_until_expiry | int < 30) - become: true - block: - - name: "Radius Certificates | AUDIT | Check httpd" - ansible.builtin.systemd_service: - name: httpd - register: radius_certs_httpd_status - - - name: "Radius Certificates | PATCH | Stop httpd" - ansible.builtin.systemd_service: - name: httpd - state: stopped - when: radius_certs_httpd_status.status.ActiveState == "active" - - - name: "Radius Certificates | PATCH | Add http service to firewall" - ansible.posix.firewalld: - service: http - state: enabled - - - name: "Radius Certificates | PATCH | Request new certificate" - ansible.builtin.command: - cmd: certbot certonly --standalone --preferred-challenges http --agree-tos -n -d {{ inventory_hostname }} --register-unsafely-without-email - when: radius_certs_existing_cert.failed - - - name: "Radius Certificates | PATCH | Renew existing certificate" - ansible.builtin.command: - cmd: certbot renew - when: not radius_certs_existing_cert.failed - - - name: "Radius Certificates | PATCH | Remove http service from firewall" - ansible.posix.firewalld: - service: http - state: disabled - - - name: "Radius Certificates | PATCH | Start httpd" - ansible.builtin.systemd_service: - name: httpd - state: started - when: radius_certs_httpd_status.status.ActiveState == "active" - -- name: Radius | PATCH | Allow radiusd access to certificates - ansible.builtin.copy: - src: /etc/letsencrypt/live/{{ inventory_hostname }}/{{ item }}.pem - dest: /etc/raddb/{{ item }}.pem - remote_src: true - owner: radiusd - group: radiusd - mode: "0640" - become: true - notify: Restart radiusd - with_items: - - privkey - - cert - - chain diff --git a/roles/radius/tasks/main.yml b/roles/radius/tasks/main.yml deleted file mode 100644 index 2a7753d..0000000 --- a/roles/radius/tasks/main.yml +++ /dev/null @@ -1,40 +0,0 @@ ---- -- name: Radius | PATCH | Obtain or freshen certificates - ansible.builtin.include_tasks: - file: certs.yml - -- name: Radius | PATCH | Install required packages - ansible.builtin.dnf: - name: freeradius - state: present - become: true - -- name: Radius | PATCH | Install FreeRADIUS configuration files - ansible.builtin.template: - src: etc/raddb/{{ item }} - dest: /etc/raddb/{{ item }} - owner: root - group: radiusd - mode: 0640 - become: true - with_items: - - mods-available/eap - - mods-available/linelog - - sites-available/default - - mods-available/inner-eap - - sites-available/inner-tunnel - - clients.conf - - proxy.conf - notify: - - Restart radiusd - -- name: Radius | PATCH | Install rsyslog configuration - ansible.builtin.template: - src: etc/rsyslog.d/radiusd.conf - dest: /etc/rsyslog.d/radiusd.conf - owner: root - group: root - mode: 0644 - become: true - notify: - - Reload rsyslog diff --git a/roles/radius/templates/etc/raddb/clients.conf b/roles/radius/templates/etc/raddb/clients.conf deleted file mode 100644 index 8d8b175..0000000 --- a/roles/radius/templates/etc/raddb/clients.conf +++ /dev/null @@ -1,22 +0,0 @@ -client eduroam_roaming0 { - ipaddr = roaming0.ja.net - secret = {{ radius_roaming0_secret }} - nastype = 'eduroam_flr' -} - -client eduroam_roaming1 { - ipaddr = roaming1.ja.net - secret = {{ radius_roaming1_secret }} - nastype = 'eduroam_flr' -} - -client eduroam_roaming2 { - ipaddr = roaming2.ja.net - secret = {{ radius_roaming2_secret }} - nastype = 'eduroam_flr' -} - -client wireless_access_points_mgmt { - ipaddr = {{ radius_wap_ipaddr }} - secret = {{ radius_wap_secret }} -} diff --git a/roles/radius/templates/etc/raddb/mods-available/eap b/roles/radius/templates/etc/raddb/mods-available/eap deleted file mode 100644 index e473f70..0000000 --- a/roles/radius/templates/etc/raddb/mods-available/eap +++ /dev/null @@ -1,52 +0,0 @@ -eap { - # The initial EAP type requested. Change this to peap if you're - # using peap, or tls if you're using EAP-TLS. - default_eap_type = ttls - - # The maximum time an EAP-Session can continue for - timer_expire = 60 - - # The maximum number of ongoing EAP sessions - max_sessions = ${max_requests} - - tls-config tls-common { - # The public certificate that your server will present - certificate_file = /etc/raddb/cert.pem - - # The private key for the public certificate - private_key_file = /etc/raddb/privkey.pem - - # The password to decrypt 'private_key_file' - #private_key_password = "" - - # The certificate of the authority that issued 'certificate_file' - ca_file = /etc/raddb/chain.pem - - # If your AP drops packets towards the client, try reducing this. - fragment_size = 1024 - - # When issuing client certificates embed the OCSP URL in the - # certificate if you want to be able to revoke them later. - ocsp { - enable = yes - override_cert_url = no - use_nonce = yes - } - } - - tls { - tls = tls-common - } - - ttls { - tls = tls-common - default_eap_type = mschapv2 - virtual_server = "eduroam-inner" - } - - peap { - tls = tls-common - default_eap_type = mschapv2 - virtual_server = "eduroam-inner" - } -} diff --git a/roles/radius/templates/etc/raddb/mods-available/inner-eap b/roles/radius/templates/etc/raddb/mods-available/inner-eap deleted file mode 100644 index 51b72f7..0000000 --- a/roles/radius/templates/etc/raddb/mods-available/inner-eap +++ /dev/null @@ -1,9 +0,0 @@ -eap inner-eap { - default_eap_type = mschapv2 - timer_expire = 60 - max_sessions = ${max_requests} - - mschapv2 { - send_error = yes - } -} diff --git a/roles/radius/templates/etc/raddb/mods-available/linelog b/roles/radius/templates/etc/raddb/mods-available/linelog deleted file mode 100644 index 52f862f..0000000 --- a/roles/radius/templates/etc/raddb/mods-available/linelog +++ /dev/null @@ -1,39 +0,0 @@ -linelog linelog_recv_request { - filename = syslog - syslog_facility = local0 - syslog_severity = debug - format = "action = Recv-Request, %{pairs:request:}" -} - -linelog linelog_send_accept { - filename = syslog - syslog_facility = local0 - syslog_severity = debug - format = "action = Send-Accept, %{pairs:request:}" -} - -linelog linelog_send_reject { - filename = syslog - syslog_facility = local0 - syslog_severity = debug - format = "action = Send-Reject, %{pairs:request:}" -} - -linelog linelog_send_proxy_request { - filename = syslog - syslog_facility = local0 - syslog_severity = debug - format = "action = Send-Proxy-Request, %{pairs:proxy-request:}" -} - -linelog linelog_recv_proxy_response { - filename = syslog - syslog_facility = local0 - syslog_severity = debug - reference = "messages.%{proxy-reply:Response-Packet-Type}" - messages { - Access-Accept = "action = Recv-Proxy-Accept, User-Name = %{User-Name}, Calling-Station-Id = %{Calling-Station-Id}, %{pairs:proxy-reply:}" - Access-Reject = "action = Recv-Proxy-Reject, User-Name = %{User-Name}, Calling-Station-Id = %{Calling-Station-Id}, %{pairs:proxy-reply:}" - Access-Challenge = "action = Recv-Proxy-Challenge, User-Name = %{User-Name}, Calling-Station-ID = %{Calling-Station-Id}, %{pairs:proxy-reply:}" - } -} \ No newline at end of file diff --git a/roles/radius/templates/etc/raddb/proxy.conf b/roles/radius/templates/etc/raddb/proxy.conf deleted file mode 100644 index 7a799f7..0000000 --- a/roles/radius/templates/etc/raddb/proxy.conf +++ /dev/null @@ -1,38 +0,0 @@ -home_server eduroam_roaming0 { - ipaddr = roaming0.ja.net - secret = {{ radius_roaming0_secret }} - status_check = status-server - response_window = 5 - check_interval = 10 - check_timeout = 5 -} - -home_server eduroam_roaming1 { - ipaddr = roaming1.ja.net - secret = {{ radius_roaming1_secret }} - status_check = status-server - response_window = 5 - check_interval = 10 - check_timeout = 5 -} - -home_server eduroam_roaming2 { - ipaddr = roaming2.ja.net - secret = {{ radius_roaming2_secret }} - status_check = status-server - response_window = 5 - check_interval = 10 - check_timeout = 5 -} - -home_server_pool eduroam_flr_pool { - type = keyed-balance - home_server = eduroam_roaming0 - home_server = eduroam_roaming1 - home_server = eduroam_roaming2 -} - -realm eduroam_flr { - auth_pool = eduroam_flr_pool - nostrip -} diff --git a/roles/radius/templates/etc/raddb/sites-available/default b/roles/radius/templates/etc/raddb/sites-available/default deleted file mode 100644 index b82568c..0000000 --- a/roles/radius/templates/etc/raddb/sites-available/default +++ /dev/null @@ -1,112 +0,0 @@ -# The domain users will add to their username to have their credentials -# routed to your institution. You will also need to register this -# and your RADIUS server addresses with your NRO. -operator_name = "{{ radius_domain }}" - -# The VLAN to assign eduroam visitors -eduroam_default_guest_vlan = "{{ radius_guest_vlan }}" - -# The VLAN to assign your students/staff -eduroam_default_local_vlan = "{{ radius_local_vlan }}" - -server eduroam { - listen { - type = auth - ipv4addr = * - ipv6addr = * - port = 1812 - } - - authorize { - # Log requests before we change them - linelog_recv_request - - # split_username_nai is a policy in the default distribution to - # split a username into username and domain. We reject user-name - # strings without domains, as they're not routable. - split_username_nai - if (noop || !&Stripped-User-Domain) { - reject - } - - # Send the request to the NRO for your region. - # The details of the FLRs (Federation Level RADIUS servers) - # are in proxy.conf. - # You can make this condition as complex as you like, to - # include additional subdomains just concatenate the conditions - # with &&. - if (&Stripped-User-Domain != "${operator_name}") { - update { - control:Load-Balance-Key := &Calling-Station-ID - control:Proxy-To-Realm := 'eduroam_flr' - - # Operator name (RFC 5580) identifies the network the - # request originated from. It's not absolutely necessary - # but it helps with debugging. - request:Operator-Name := "1${operator_name}" - } - return - } - - # If the EAP module returns 'ok' or 'updated', it means it has handled - # the request and we don't need to call any other modules in this - # section. - eap { - ok = return - updated = return - } - } - - pre-proxy { - attr_filter.pre-proxy - linelog_send_proxy_request - } - - post-proxy { - attr_filter.post-proxy - linelog_recv_proxy_response - } - - authenticate { - eap - } - - post-auth { - # To implement eduroam you must: - # - Use wireless access points or a controller which supports - # dynamic VLAN assignments. - # - Have that feature enabled. - # - Have the guest_vlan/local_vlan available to the controller, - # or to all your access points. - # eduroam user traffic *MUST* be segregated, this is *NOT* optional. - update reply { - Tunnel-Type := VLAN - Tunnel-Medium-Type := IEEE-802 - } - if (&control:Proxy-To-Realm) { - update reply { - Tunnel-Private-Group-ID = ${eduroam_default_guest_vlan} - } - } - else { - update reply { - Tunnel-Private-Group-ID = ${eduroam_default_local_vlan} - } - } - - # We're sending a response to one of OUR network devices for one of - # OUR users so provide it with the real user-identity. - if (&session-state:Stripped-User-Name) { - update reply { - User-Name := "%{session-state:Stripped-User-Name}@%{Stripped-User-Domain}" - } - } - - linelog_send_accept - - Post-Auth-Type REJECT { - attr_filter.access_reject - linelog_send_reject - } - } -} diff --git a/roles/radius/templates/etc/raddb/sites-available/inner-tunnel b/roles/radius/templates/etc/raddb/sites-available/inner-tunnel deleted file mode 100644 index 686bf97..0000000 --- a/roles/radius/templates/etc/raddb/sites-available/inner-tunnel +++ /dev/null @@ -1,73 +0,0 @@ -server eduroam-inner { - listen { - type = auth - ipaddr = * - ipv6addr = * - port = 18120 # Used for testing only. Requests proxied internally. - } - - authorize { - # The outer username is considered garabage for autz purposes, but - # the domain portion of the outer and inner identities must match. - split_username_nai - if (noop || (&Stripped-User-Domain && \ - (&outer.Stripped-User-Domain != &Stripped-User-Domain))) { - reject - } - - # Make the user's real identity available to anything that needs - # it in the outer server. - if (&outer.session-state:) - update { - &outer.session-state:Stripped-User-Name := &Stripped-User-Name - } - } - - # EAP for PEAPv0 (EAP-MSCHAPv2) - inner-eap { - ok = return - } - - # THIS IS SITE SPECIFIC - # - # The files module is *ONLY* used for testing. It lets you define - # credentials in a flat file, IT WILL NOT SCALE. - # - # - If you use OpenLDAP with salted password hashes you should - # call the 'ldap' module here and use EAP-TTLS-PAP as your EAP method. - # - If you use OpenLDAP with cleartext passwords you should - # call the 'ldap' module here and use EAP-TTLS or PEAPv0. - # - If you use an SQL DB with salted password hashes you should call - # the 'sql' module here and use EAP-TTLS-PAP as your EAP method. - # - If you use an SQL DB with cleartext passwords you should call - # the 'sql' module here and use EAP-TTLS or PEAPv0. - # - If you use Novell you should call the 'ldap' module here and - # set ``edir = yes`` in ``mods-available/ldap`` and use EAP-TTLS or - # PEAPv0. - # - If you use Active Directory, you don't need anything here (remove - # the call to files) but you'll need to follow this - # [guide](freeradius-active-directory-integration-howto) and use - # EAP-TTLS-PAP or PEAPv0. - # - If you're using EAP-TLS (i'm impressed!) remove the call to files. - # - # EAP-TTLS-PAP and PEAPv0 are equally secure/insecure depending on how the - # supplicant is configured. PEAPv0 has a slight edge in that you need to - # crack MSCHAPv2 to get the user's password (but this is not hard). - files - - pap - mschap - } - - authenticate { - inner-eap - mschap - pap - - # Comment pap above and uncomment the stanza below if you're using - # Active Directory; this will allow it to work with EAP-TTLS/PAP. - #Auth-Type pap { - # ntlm_auth - #} - } -} diff --git a/roles/radius/templates/etc/rsyslog.d/radiusd.conf b/roles/radius/templates/etc/rsyslog.d/radiusd.conf deleted file mode 100644 index 60e790c..0000000 --- a/roles/radius/templates/etc/rsyslog.d/radiusd.conf +++ /dev/null @@ -1 +0,0 @@ -local0.debug /var/log/radius_auth.log \ No newline at end of file