snix/users/sterni/machines/ingeborg/monitoring.nix
sterni 5dd66ab066 feat(sterni/ingeborg/monitoring): send smartd warnings to IRC
Though netdata integrates smartd data it doesn't generate warnings like
smartd does. It would be nice to have them go to IRC. The NixOS module
for some reason has decided to implement its own very restricted
notifications framework on top of the one that smartd provides
dispatching to either mail (note that this is implemented in the NixOS
module and doesn't use smartd's own support for this), wall(1) or some
systemd mechanism. This is implemented in some shell script that can't
be provided by the user.

Luckily, the module is relatively small otherwise and we can easily
inline the relevant service definitions and use our own script instead.

Change-Id: I1e1ceff3c21a92ac42079c02813366671141b9b4
Reviewed-on: https://cl.tvl.fyi/c/depot/+/12969
Reviewed-by: sterni <sternenseemann@systemli.org>
Autosubmit: sterni <sternenseemann@systemli.org>
Tested-by: BuildkiteCI
2025-01-11 16:11:46 +00:00

189 lines
7.8 KiB
Nix

{ pkgs, lib, config, depot, ... }:
let
# IRC channel that every notification generated in this file is sent to.
# It is also appended to irccat's channel list further down.
ircChannel = "#sterni.lv";
# TCP port (as a string) that irccat listens on, derived from its
# `tcp.listen` setting. Only the part after the last ':' is kept, so that
# ":4722", "127.0.0.1:4722" and "[::1]:4722" all yield "4722". Simply
# deleting every ':' would turn a listen address that includes a host into
# garbage such as "127.0.0.14722".
irccatPort =
  lib.last (lib.splitString ":" config.services.depot.irccat.config.tcp.listen);
# Shell script that sends its first argument as one message to ircChannel:
# it writes the line "<channel> <message>" to irccatPort on localhost using
# OpenBSD netcat (-N shuts the socket down after stdin reaches EOF).
# Because of `set -euo pipefail` the script fails if no argument is given
# or if nc exits non-zero.
send-irc-msg = pkgs.writeShellScript "send-irc-msg" ''
set -euo pipefail
printf '%s %s\n' ${lib.escapeShellArg ircChannel} "$1" | \
${lib.getBin pkgs.netcat-openbsd}/bin/nc -N localhost ${irccatPort}
'';
# Port used for netdata's web interface on localhost; the nginx virtual
# host below proxies to this port.
netdataPort = 19999;
in
{
imports = [
./http/nginx.nix
./irccat.nix
];
config = {
# Add the notification channel to irccat's configured IRC channels so that
# messages addressed to it by send-irc-msg have somewhere to go.
services.depot.irccat.config.irc.channels = [
ircChannel
];
# Since we have irccat we can wire up mdadm --monitor
#
# The PROGRAM line points mdadm at a small wrapper script which forwards
# its arguments to IRC formatted as "mdmonitor: $1($2, $3)". The ", $3"
# part is only emitted when a third argument is set and non-empty.
# NOTE(review): presumably $1 is the event name, $2 the md device and $3
# the affected component device, as described in mdadm(8) — confirm.
boot.swraid.mdadmConf = ''
PROGRAM ${
pkgs.writeShellScript "mdmonitor-to-irc" ''
${send-irc-msg} "mdmonitor: $1($2''${3:+, $3})"
''
}
'';
# Based on nixos/modules/services/monitoring/smartd.nix which has a much
# too specific smartd-notify.sh (and I'm too lazy to propose a redesign)
#
# Runs smartd in the foreground as a notify-type unit with a generated
# smartd.conf. Instead of mailing, smartd executes smartdNotify for every
# warning, which forwards the failure type, device and message to IRC.
systemd.services.smartd = {
  description = "S.M.A.R.T. Daemon";
  wantedBy = [ "multi-user.target" ];
  serviceConfig = {
    Type = "notify";
    # smartd's -A option takes a file name *prefix*, not a directory: the
    # attribute logs are written to "<PREFIX>MODEL-SERIAL.ata.csv". Without
    # a trailing slash they would end up directly in /var/log with names
    # like "smartdMODEL-SERIAL.ata.csv". Let systemd create /var/log/smartd
    # and pass it with a trailing slash so the logs land inside it.
    LogsDirectory = "smartd";
    ExecStart =
      let
        # Invoked by smartd (-M exec); the SMARTD_* variables are provided
        # by smartd in the script's environment.
        smartdNotify = pkgs.writeShellScript "smartd-notify.sh" ''
          ${send-irc-msg} "smartd: $SMARTD_FAILTYPE($SMARTD_DEVICE): $SMARTD_MESSAGE"
        '';
        # The same directives are emitted once for DEFAULT and once for
        # DEVICESCAN. "-m <nomailer>" makes smartd run the -M exec script
        # directly instead of handing it to a mailer.
        smartdConf = pkgs.writeText "smartd.conf" (
          # Short self test every day 03:00
          # Long self test every tuesday 05:00
          lib.concatMapStrings
            (d: ''
              ${d} -m <nomailer> -M exec ${smartdNotify} -a -o on -s (S/../.././03|L/../../2/05)
            '')
            [ "DEFAULT" "DEVICESCAN" ]
        );
      in
      lib.concatStringsSep " " [
        "${pkgs.smartmontools}/sbin/smartd"
        "-A"
        "/var/log/smartd/"
        "--no-fork"
        "--configfile=${smartdConf}"
      ];
  };
};
services = {
# netdata agent. All of its logs go to syslog, the web interface is bound
# to localhost only (nginx proxies to it) and health alarms are handed to a
# custom script that forwards them to IRC instead of netdata's stock
# alarm-notify.sh.
netdata = {
  enable = true;
  config = {
    logs = {
      access = "syslog";
      error = "syslog";
      debug = "syslog";
      health = "syslog";
      collector = "syslog";
    };
    web = {
      "default port" = toString netdataPort;
      "bind to" = "localhost:${toString netdataPort}";
    };
    health = {
      # Receives the alarm details as positional arguments, builds a one
      # line summary, drops disk alarms for anything but sda/sdb and sends
      # the rest to IRC via send-irc-msg.
      "script to execute on alarm" = pkgs.writeShellScript "simple-alarm-notify" ''
        set -euo pipefail
        # Verify that they haven't changed the arg list. This has to happen
        # before the positional parameters are read: with set -u a missing
        # argument aborts the script on the spot, so a check placed after
        # the assignments could never report that too few were passed.
        ARG_COUNT_EXPECTED=30
        if [[ "$#" != "$ARG_COUNT_EXPECTED" ]]; then
          echo "$0: WARNING: unexpected number of arguments: $#. Did netdata add more?" >&2
        fi
        # This humongous list is copied over from netdata's alarm-notify.sh
        roles="''${1}" # the roles that should be notified for this event
        args_host="''${2}" # the host generated this event
        unique_id="''${3}" # the unique id of this event
        alarm_id="''${4}" # the unique id of the alarm that generated this event
        event_id="''${5}" # the incremental id of the event, for this alarm id
        when="''${6}" # the timestamp this event occurred
        name="''${7}" # the name of the alarm, as given in netdata health.d entries
        chart="''${8}" # the name of the chart (type.id)
        status="''${9}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
        old_status="''${10}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
        value="''${11}" # the current value of the alarm
        old_value="''${12}" # the previous value of the alarm
        src="''${13}" # the line number and file the alarm has been configured
        duration="''${14}" # the duration in seconds of the previous alarm state
        non_clear_duration="''${15}" # the total duration in seconds this is/was non-clear
        units="''${16}" # the units of the value
        info="''${17}" # a short description of the alarm
        value_string="''${18}" # friendly value (with units)
        # shellcheck disable=SC2034
        # variable is unused, but https://github.com/netdata/netdata/pull/5164#discussion_r255572947
        old_value_string="''${19}" # friendly old value (with units), previously named "old_value_string"
        calc_expression="''${20}" # contains the expression that was evaluated to trigger the alarm
        calc_param_values="''${21}" # the values of the parameters in the expression, at the time of the evaluation
        total_warnings="''${22}" # Total number of alarms in WARNING state
        total_critical="''${23}" # Total number of alarms in CRITICAL state
        total_warn_alarms="''${24}" # List of alarms in warning state
        total_crit_alarms="''${25}" # List of alarms in critical state
        classification="''${26}" # The class field from .conf files
        edit_command_line="''${27}" # The command to edit the alarm, with the line number
        child_machine_guid="''${28}" # the machine_guid of the child
        transition_id="''${29}" # the transition_id of the alert
        summary="''${30}" # the summary text field of the alert
        MSG="netdata: $status ''${name//_/ } ($chart): ''${summary//_/ } = $value_string"
        # Filter rules by chart name. This is necessary, since the "enabled alarms"
        # filter only allows for filtering alarm types, not specific alarms
        # belonging to that alarm.
        case "$chart" in
          # netdata prefers the automatically assigned names (dm-<n>, md<n>,
          # sd<c>) over ids for alerts, so this configuration assumes that
          # we have two physical disks which we kind of assert using the
          # grub configuration (it is more difficult with the soft raid
          # config).
          # ${assert builtins.length config.boot.loader.grub.devices == 2; ""}
          disk_util.sda | disk_util.sdb | disk_backlog.sda | disk_backlog.sdb)
            ;;
          disk_util.* | disk_backlog.*)
            echo "$0: INFO: DISCARDING message: $MSG" >&2
            exit 0
            ;;
          *)
            ;;
        esac
        echo "$0: INFO: sending message: $MSG" >&2
        ${send-irc-msg} "$MSG"
      '';
    };
  };
};
# Expose the local netdata web interface as monitoring.sterni.lv through
# nginx acting as a reverse proxy, with TLS enforced (certificate via ACME)
# and HTTP basic auth in front of it. Reference for the proxy settings:
# https://learn.netdata.cloud/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/nginx
nginx.virtualHosts."monitoring.sterni.lv" = {
forceSSL = true;
enableACME = true;
# The basic auth user file is the decrypted age secret `netdata-htpasswd`
# (declared under age.secrets in this file). Requests are proxied to
# netdata on 127.0.0.1:netdataPort.
extraConfig = ''
auth_basic "netdata";
auth_basic_user_file ${config.age.secretsDir}/netdata-htpasswd;
location / {
proxy_set_header X-Forwarded-Host $host;
proxy_set_header X-Forwarded-Server $host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_pass http://127.0.0.1:${toString netdataPort};
proxy_http_version 1.1;
proxy_pass_request_headers on;
proxy_set_header Connection "keep-alive";
proxy_store off;
}
'';
};
};
# htpasswd file used by nginx for the basic auth in front of netdata. It is
# owned by the nginx user and group. nginx only ever reads it, so it is
# read-only for the owner and inaccessible to everyone else; a data file
# needs neither the write nor the execute bit.
age.secrets.netdata-htpasswd = {
  file = depot.users.sterni.secrets."netdata-htpasswd.age";
  inherit (config.services.nginx) group;
  owner = config.services.nginx.user;
  mode = "0400";
};
};
}