Though netdata integrates smartd data, it doesn't generate warnings like smartd does. It would be nice to have them go to IRC. The NixOS module has, for some reason, decided to implement its own very restricted notification framework on top of the one that smartd provides, dispatching to either mail (note that this is implemented in the NixOS module and doesn't use smartd's own support for this), wall(1) or some systemd mechanism. This is implemented in a shell script that can't be provided by the user. Luckily, the module is otherwise relatively small, so we can easily inline the relevant service definitions and use our own script instead. Change-Id: I1e1ceff3c21a92ac42079c02813366671141b9b4 Reviewed-on: https://cl.tvl.fyi/c/depot/+/12969 Reviewed-by: sterni <sternenseemann@systemli.org> Autosubmit: sterni <sternenseemann@systemli.org> Tested-by: BuildkiteCI
189 lines
7.8 KiB
Nix
189 lines
7.8 KiB
Nix
{ pkgs, lib, config, depot, ... }:
|
|
|
|
let
|
|
# IRC channel that every notification in this module is posted to. It is
# passed to send-irc-msg and added to irccat's channel list.
ircChannel = "#sterni.lv";
|
|
# Port of irccat's TCP listener, as a string suitable for interpolation into
# shell scripts.
#
# The listen address is assumed to be of the form "[host]:port" (TODO confirm
# against the irccat module). We take whatever follows the last colon: for
# ":4722" this yields "4722" exactly as before, but a value with a host part
# such as "127.0.0.1:4722" no longer has the host digits glued onto the port,
# which simply deleting every ":" would have produced.
irccatPort =
  lib.last (lib.splitString ":" config.services.depot.irccat.config.tcp.listen);
|
|
|
|
# Helper script: posts its first argument to ircChannel by writing a single
# "<channel> <message>" line to the irccat TCP listener on localhost.
send-irc-msg =
  let
    # Absolute path to the OpenBSD netcat binary used to talk to irccat.
    nc = "${lib.getBin pkgs.netcat-openbsd}/bin/nc";
    # Channel name quoted for safe use as a shell word ("#" would otherwise
    # start a comment).
    channelArg = lib.escapeShellArg ircChannel;
  in
  pkgs.writeShellScript "send-irc-msg" ''
    set -euo pipefail
    printf '%s %s\n' ${channelArg} "$1" | \
      ${nc} -N localhost ${irccatPort}
  '';
|
|
|
|
# Port netdata's web server is configured to use (bound to localhost); the
# nginx virtual host proxies to the same port.
netdataPort = 19999;
|
|
in
|
|
|
|
{
|
|
imports = [
|
|
./http/nginx.nix
|
|
./irccat.nix
|
|
];
|
|
|
|
config = {
|
|
# Register the monitoring channel in irccat's IRC channel list.
services.depot.irccat.config.irc.channels = [ ircChannel ];
|
|
|
|
# Since we have irccat we can wire up mdadm --monitor
boot.swraid.mdadmConf =
  let
    # Program mdadm runs for each event. The first two arguments are always
    # included in the IRC message; the third is only appended (after ", ")
    # when it is set and non-empty.
    mdmonitorToIrc = pkgs.writeShellScript "mdmonitor-to-irc" ''
      ${send-irc-msg} "mdmonitor: $1($2''${3:+, $3})"
    '';
  in
  ''
    PROGRAM ${mdmonitorToIrc}
  '';
|
|
|
|
# Based on nixos/modules/services/monitoring/smartd.nix which has a much
|
|
# too specific smartd-notify.sh (and I'm too lazy to propose a redesign)
|
|
# Hand-rolled smartd unit whose notifications are forwarded to IRC through
# send-irc-msg.
systemd.services.smartd =
  let
    # Executed by smartd via `-M exec`; relays the failure type, device and
    # message that smartd exports in the environment.
    notifyScript = pkgs.writeShellScript "smartd-notify.sh" ''
      ${send-irc-msg} "smartd: $SMARTD_FAILTYPE($SMARTD_DEVICE): $SMARTD_MESSAGE"
    '';

    # One smartd.conf line for the given entry keyword.
    #   Short self test every day 03:00
    #   Long self test every tuesday 05:00
    mkEntry = entry: ''
      ${entry} -m <nomailer> -M exec ${notifyScript} -a -o on -s (S/../.././03|L/../../2/05)
    '';

    # The same directives are used for the DEFAULT and the DEVICESCAN entry.
    configFile = pkgs.writeText "smartd.conf"
      (lib.concatStrings (map mkEntry [ "DEFAULT" "DEVICESCAN" ]));
  in
  {
    description = "S.M.A.R.T. Daemon";
    wantedBy = [ "multi-user.target" ];
    serviceConfig = {
      Type = "notify";
      # NOTE(review): smartd's -A takes a file name *prefix* for attribute
      # logs - confirm "/var/log/smartd" (no trailing slash) is intended.
      ExecStart = "${pkgs.smartmontools}/sbin/smartd -A /var/log/smartd --no-fork --configfile=${configFile}";
    };
  };
|
|
|
|
services = {
|
|
# netdata agent: logs to syslog, serves its web UI on localhost only and
# hands alarms to a minimal notification script that posts them to IRC.
netdata =
  let
    # Replacement for netdata's alarm-notify.sh: builds a one-line summary
    # of the alarm, drops disk_util/disk_backlog alarms for anything but
    # sda/sdb and passes the rest to send-irc-msg.
    #
    # NOTE(review): because of `set -u`, an invocation with fewer than 30
    # arguments aborts on the first missing positional parameter, before
    # the argument count warning is reached.
    alarmNotify = pkgs.writeShellScript "simple-alarm-notify" ''
      set -euo pipefail

      # This humongous list is copied over from netdata's alarm-notify.sh
      roles="''${1}" # the roles that should be notified for this event
      args_host="''${2}" # the host generated this event
      unique_id="''${3}" # the unique id of this event
      alarm_id="''${4}" # the unique id of the alarm that generated this event
      event_id="''${5}" # the incremental id of the event, for this alarm id
      when="''${6}" # the timestamp this event occurred
      name="''${7}" # the name of the alarm, as given in netdata health.d entries
      chart="''${8}" # the name of the chart (type.id)
      status="''${9}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
      old_status="''${10}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
      value="''${11}" # the current value of the alarm
      old_value="''${12}" # the previous value of the alarm
      src="''${13}" # the line number and file the alarm has been configured
      duration="''${14}" # the duration in seconds of the previous alarm state
      non_clear_duration="''${15}" # the total duration in seconds this is/was non-clear
      units="''${16}" # the units of the value
      info="''${17}" # a short description of the alarm
      value_string="''${18}" # friendly value (with units)
      # shellcheck disable=SC2034
      # variable is unused, but https://github.com/netdata/netdata/pull/5164#discussion_r255572947
      old_value_string="''${19}" # friendly old value (with units), previously named "old_value_string"
      calc_expression="''${20}" # contains the expression that was evaluated to trigger the alarm
      calc_param_values="''${21}" # the values of the parameters in the expression, at the time of the evaluation
      total_warnings="''${22}" # Total number of alarms in WARNING state
      total_critical="''${23}" # Total number of alarms in CRITICAL state
      total_warn_alarms="''${24}" # List of alarms in warning state
      total_crit_alarms="''${25}" # List of alarms in critical state
      classification="''${26}" # The class field from .conf files
      edit_command_line="''${27}" # The command to edit the alarm, with the line number
      child_machine_guid="''${28}" # the machine_guid of the child
      transition_id="''${29}" # the transition_id of the alert
      summary="''${30}" # the summary text field of the alert

      # Verify that they haven't extended the arg list
      ARG_COUNT_EXPECTED=30

      if [[ "$#" != "$ARG_COUNT_EXPECTED" ]]; then
        echo "$0: WARNING: unexpected number of arguments: $#. Did netdata add more?" >&2
      fi

      MSG="netdata: $status ''${name//_/ } ($chart): ''${summary//_/ } = $value_string"

      # Filter rules by chart name. This is necessary, since the "enabled alarms"
      # filter only allows for filtering alarm types, not specific alarms
      # belonging to that alarm.
      case "$chart" in
        # netdata prefers the automatically assigned names (dm-<n>, md<n>,
        # sd<c>) over ids for alerts, so this configuration assumes that
        # we have two physical disks which we kind of assert using the
        # grub configuration (it is more difficult with the soft raid
        # config).
        # ${assert builtins.length config.boot.loader.grub.devices == 2; ""}
        disk_util.sda | disk_util.sdb | disk_backlog.sda | disk_backlog.sdb)

          ;;
        disk_util.* | disk_backlog.*)
          echo "$0: INFO: DISCARDING message: $MSG" >&2
          exit 0
          ;;
        *)
          ;;
      esac

      echo "$0: INFO: sending message: $MSG" >&2
      ${send-irc-msg} "$MSG"
    '';
  in
  {
    enable = true;
    config = {
      # Send every netdata log category to syslog.
      logs = lib.genAttrs
        [ "access" "error" "debug" "health" "collector" ]
        (_: "syslog");

      # Only listen on localhost; external access goes through nginx.
      web = {
        "default port" = toString netdataPort;
        "bind to" = "localhost:${toString netdataPort}";
      };

      health."script to execute on alarm" = alarmNotify;
    };
  };
|
|
|
|
# TLS virtual host that puts the local netdata web UI behind HTTP basic auth.
# https://learn.netdata.cloud/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/nginx
nginx.virtualHosts."monitoring.sterni.lv" =
  let
    # Where the netdata agent listens locally.
    netdataUpstream = "http://127.0.0.1:${toString netdataPort}";
    # htpasswd file inside the age secrets directory.
    htpasswdFile = "${config.age.secretsDir}/netdata-htpasswd";
  in
  {
    enableACME = true;
    forceSSL = true;
    extraConfig = ''
      auth_basic "netdata";
      auth_basic_user_file ${htpasswdFile};

      location / {
        proxy_set_header X-Forwarded-Host $host;
        proxy_set_header X-Forwarded-Server $host;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_pass ${netdataUpstream};
        proxy_http_version 1.1;
        proxy_pass_request_headers on;
        proxy_set_header Connection "keep-alive";
        proxy_store off;
      }
    '';
  };
|
|
};
|
|
|
|
# htpasswd secret; the nginx virtual host's auth_basic_user_file points at
# "netdata-htpasswd" inside the age secrets directory.
age.secrets.netdata-htpasswd = {
  file = depot.users.sterni.secrets."netdata-htpasswd.age";
  owner = config.services.nginx.user;
  inherit (config.services.nginx) group;
  # The file only needs to be read by its owner: it is plain data, so the
  # write and execute bits of the previous "700" served no purpose. Group
  # and others still get no access, as before.
  mode = "0400";
};
|
|
};
|
|
}
|