feat(*): initialize new Snix infrastructure

Co-Authored-By: edef <edef@edef.eu>
Co-Authored-By: Ryan Lahfa <raito@lix.systems>
Change-Id: Ica1cda177a236814de900f50a8a61d288f58f519
Author: Florian Klink, 2025-01-06 01:06:47 +01:00
Parent: 067eff3427, commit: a52ea3675c
124 changed files with 27723 additions and 1631 deletions

ops/modules/o11y/agent.nix (new file, 132 lines)

{ depot
, config
, lib
, ...
}:
let
cfg = config.infra.monitoring.grafana-agent;
inherit (lib) mkEnableOption mkOption mkIf types;
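  # Escaped so Nix does not interpolate it: the literal ${CREDENTIALS_DIRECTORY}
  # reference is passed through to the agent config and resolved at runtime
  # against the systemd credentials directory.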
passwordAsCredential = "\${CREDENTIALS_DIRECTORY}/password";
in
{
options.infra.monitoring.grafana-agent = {
enable = (mkEnableOption "Grafana Agent") // { default = true; };
exporters = mkOption {
description = ''
Set of additional exporters to scrape.
The attribute name will be used as `job_name`
internally, which ends up exported as `job` label
on all metrics of that exporter.
'';
type = types.attrsOf (types.submodule ({ config, name, ... }: {
options.port = mkOption {
description = "Exporter port";
type = types.int;
};
options.bearerTokenFile = mkOption {
description = "File containing a bearer token";
type = types.nullOr types.path;
default = null;
};
options.scrapeConfig = mkOption {
description = "Prometheus scrape config";
type = types.attrs;
};
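      # Base scrape config derived from the exporter name and port;
      # bearer-token auth is merged in when a token file is set.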
      config.scrapeConfig = lib.mkMerge [
        {
          job_name = name;
          static_configs = [
            { targets = [ "localhost:${toString config.port}" ]; }
          ];
        }
        (lib.mkIf (config.bearerTokenFile != null) {
          authorization.credentials_file = "\${CREDENTIALS_DIRECTORY}/${name}-bearer-token";
        })
      ];
options.secrets = mkOption {
description = "Secrets required for scrape config";
type = types.attrs;
internal = true;
default = { };
};
config.secrets = lib.mkIf (config.bearerTokenFile != null) {
"${name}-bearer-token" = config.bearerTokenFile;
};
}));
default = { };
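    # Usage sketch, mirroring the Grafana exporter registration later in this tree:
    example = lib.literalExpression ''
      {
        grafana.port = 2342;
      }
    '';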
};
};
config = mkIf cfg.enable {
age.secrets.grafana-agent-password.file = depot.ops.secrets."grafana-agent-password.age";
services.grafana-agent = {
enable = true;
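      # Exposed to the agent as systemd credentials: the remote_write password
      # plus any per-exporter bearer tokens declared above.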
credentials = lib.mkMerge ([{ password = config.age.secrets.grafana-agent-password.path; }] ++
lib.mapAttrsToList (name: value: value.secrets) config.infra.monitoring.grafana-agent.exporters);
settings = {
metrics = {
global.remote_write = [
{
url = "https://mimir.snix.dev/api/v1/push";
basic_auth = {
username = "promtail";
password_file = passwordAsCredential;
};
}
];
global.external_labels = {
hostname = config.networking.hostName;
};
configs = [
{
name = config.networking.hostName;
scrape_configs = lib.mapAttrsToList (name: value: value.scrapeConfig) config.infra.monitoring.grafana-agent.exporters;
}
];
};
# logs = {
# global.clients = [
# {
# url = "https://loki.forkos.org/loki/api/v1/push";
# basic_auth = {
# username = "promtail";
# password_file = passwordAsCredential;
# };
# }
# ];
# configs = [
# {
# name = "journald";
# scrape_configs = [
# {
# job_name = "system";
# journal = {
# max_age = "12h";
# labels = {
# job = "systemd-journal";
# host = config.networking.hostName;
# };
# };
# relabel_configs = [
# {
# source_labels = [ "__journal__systemd_unit" ];
# target_label = "unit";
# }
# ];
# }
# ];
# }
# ];
# positions_directory = "\${STATE_DIRECTORY}/positions";
# };
integrations.node_exporter.enable_collectors = [
"processes"
"systemd"
];
};
};
};
}

New file (20 lines)

{ config, depot, ... }: {
imports = [
depot.third_party.alertmanager-irc-relay.module
];
services.alertmanager-irc-relay = {
enable = true;
settings = {
irc_host = "irc.hackint.org";
irc_port = 6697;
irc_nickname = "silentfox";
irc_channels = [
{ name = "#snix"; password = "$CHANNEL_PASSWORD"; }
];
};
environmentFiles = [
config.age.secrets.alertmanager-irc-relay-environment.path
];
};
}

New file (148 lines)

{ depot
, config
, lib
, ...
}:
let
cfg = config.services.depot.grafana;
inherit (lib) mkEnableOption mkIf;
in
{
options.services.depot.grafana.enable = mkEnableOption "Grafana frontend";
config = mkIf cfg.enable {
services = {
grafana = {
enable = true;
settings = {
server = {
domain = "status.snix.dev";
http_addr = "127.0.0.1";
http_port = 2342;
root_url = "https://status.snix.dev/";
};
database = {
type = "postgres";
user = "grafana";
host = "/run/postgresql";
};
"auth.anonymous" = {
enabled = true;
org_name = "Main Org.";
org_role = "Viewer";
};
"auth.generic_oauth" = {
enabled = true;
name = "snix SSO";
client_id = "grafana";
client_secret = "$__file{${config.age.secrets.grafana-oauth-secret.path}}";
auth_url = "https://auth.snix.dev/realms/snix-project/protocol/openid-connect/auth";
token_url = "https://auth.snix.dev/realms/snix-project/protocol/openid-connect/token";
api_url = "https://auth.snix.dev/realms/snix-project/protocol/openid-connect/userinfo";
login_attribute_path = "username";
email_attribute_path = "email";
name_attribute_path = "full_name";
scopes = [
"openid"
"profile"
"email"
"offline_access"
"roles"
];
allow_sign_up = true;
auto_login = true;
allow_assign_grafana_admin = true;
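          # JMESPath over the userinfo claims: a grafana_roles entry of 'Admin' maps to
          # GrafanaAdmin, 'Editor' maps to Editor, everyone else gets Viewer.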
role_attribute_path = "contains(grafana_roles[*], 'Admin') && 'GrafanaAdmin' || contains(grafana_roles[*], 'Editor') && 'Editor' || 'Viewer'";
};
dashboards.default_home_dashboard_path = "${depot.ops.dashboards.node_exporter}";
feature_toggles.enable = "autoMigrateOldPanels newVizTooltips";
security.angular_support_enabled = false;
};
provision = {
dashboards.settings = {
apiVersion = 1;
providers = [
{
name = "default";
options.path = depot.ops.dashboards.all;
}
];
};
datasources.settings = {
apiVersion = 1;
datasources = [
{
name = "Mimir";
type = "prometheus";
uid = "mimir";
access = "proxy";
url = "http://mimir.snix.dev:9009/prometheus";
isDefault = true;
}
{
name = "Loki";
type = "loki";
uid = "loki";
access = "proxy";
url = "http://loki.snix.dev:9090/";
}
{
name = "Tempo";
type = "tempo";
uid = "tempo";
access = "proxy";
url = "http://tempo.snix.dev:9190";
jsonData.streamingEnabled.search = true;
}
{
name = "Mimir Alertmanager";
type = "alertmanager";
uid = "mimir-alertmanager";
access = "proxy";
url = "http://mimir.snix.dev:9009/";
jsonData = {
handleGrafanaManagedAlerts = true;
implementation = "mimir";
};
}
# {
# name = "Pyroscope";
# type = "grafana-pyroscope-datasource";
# uid = "pyroscope";
# access = "proxy";
# url = "http://127.0.0.1:4040";
# }
];
};
};
};
postgresql = {
ensureDatabases = [ "grafana" ];
ensureUsers = [
{
name = "grafana";
ensureDBOwnership = true;
}
];
};
};
infra.monitoring.grafana-agent.exporters.grafana.port = 2342;
};
}

ops/modules/o11y/loki.nix (new file, 90 lines)

{ config
, lib
, ...
}:
let
cfg = config.services.depot.loki;
inherit (lib) mkEnableOption mkIf;
in
{
options.services.depot.loki.enable = mkEnableOption "Loki storage";
config = mkIf cfg.enable {
services.loki = {
enable = true;
extraFlags = [ "--config.expand-env" ];
configuration = {
server = {
http_listen_port = 9090;
grpc_listen_port = 9096;
# 16M
grpc_server_max_recv_msg_size = 16777216;
grpc_server_max_send_msg_size = 16777216;
};
auth_enabled = false;
common = {
storage.s3 = {
endpoint = "fsn1.your-objectstorage.com";
region = "fsn1";
bucketnames = "snix-loki";
secret_access_key = "\${S3_KEY}"; # This is a secret injected via an environment variable
access_key_id = "\${S3_KEY_ID}";
s3forcepathstyle = true;
};
ring = {
kvstore.store = "memberlist";
          # TODO: Such an ugly hack.
instance_interface_names = [ "enp1s0" "lo" ];
};
replication_factor = 1;
};
memberlist = {
advertise_addr = "127.0.0.1";
cluster_label = "snix";
bind_port = 7947;
advertise_port = 7947;
};
storage_config.tsdb_shipper = {
active_index_directory = "/var/lib/loki/index";
cache_location = "/var/lib/loki/cache";
};
compactor = {
working_directory = "/var/lib/loki/compactor";
compaction_interval = "10m";
retention_enabled = true;
retention_delete_delay = "1s";
retention_delete_worker_count = 150;
delete_request_store = "filesystem";
};
limits_config.retention_period = "1w";
schema_config = {
configs = [
{
from = "2024-07-01";
store = "tsdb";
object_store = "s3";
schema = "v13";
index = {
prefix = "index_";
period = "24h";
};
}
];
};
};
};
systemd.services.loki.serviceConfig.EnvironmentFile = [ config.age.secrets.loki-environment.path ];
infra.monitoring.grafana-agent.exporters.loki.port = 9090;
};
}

ops/modules/o11y/mimir.nix (new file, 123 lines)

{ config
, lib
, pkgs
, ...
}:
let
cfg = config.services.depot.prometheus;
inherit (lib) mkEnableOption mkIf;
mimirPort = config.services.mimir.configuration.server.http_listen_port;
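  # Validate the alert rules with promtool at build time; the rules are laid out
  # under an 'anonymous' tenant directory, the tenant ID Mimir uses when
  # multitenancy is disabled.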
alerts = pkgs.runCommand "mimir-alerts-checked"
{
src = ./alerts;
nativeBuildInputs = with pkgs; [ prometheus.cli ];
} ''
promtool check rules $src/*
mkdir $out
cp -R $src $out/anonymous/
'';
in
{
options.services.depot.prometheus.enable = mkEnableOption "Prometheus scraper";
config = mkIf cfg.enable {
services.mimir = {
enable = true;
extraFlags = [ "--config.expand-env=true" ];
configuration = {
target = "all,alertmanager";
multitenancy_enabled = false;
common.storage = {
backend = "s3";
s3 = {
endpoint = "fsn1.your-objectstorage.com";
bucket_name = "snix-mimir";
secret_access_key = "\${S3_KEY}"; # This is a secret injected via an environment variable
access_key_id = "\${S3_KEY_ID}";
};
};
        # TODO: Such an ugly hack.
distributor.ring.instance_interface_names = [ "enp1s0" "lo" ];
ingester.ring.instance_interface_names = [ "enp1s0" "lo" ];
frontend.instance_interface_names = [ "enp1s0" "lo" ];
query_scheduler.ring.instance_interface_names = [ "enp1s0" "lo" ];
ruler.ring.instance_interface_names = [ "enp1s0" "lo" ];
compactor.sharding_ring.instance_interface_names = [ "enp1s0" "lo" ];
store_gateway.sharding_ring.instance_interface_names = [ "enp1s0" "lo" ];
memberlist = {
advertise_addr = "127.0.0.1";
cluster_label = "snix";
};
server = {
http_listen_port = 9009;
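          # 100 MiB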
grpc_server_max_recv_msg_size = 104857600;
grpc_server_max_send_msg_size = 104857600;
grpc_server_max_concurrent_streams = 1000;
};
ingester.ring.replication_factor = 1;
distributor.instance_limits.max_ingestion_rate = 0; # unlimited
limits = {
ingestion_rate = 1000000; # can't set to unlimited :(
out_of_order_time_window = "12h";
max_global_series_per_user = 0; # unlimited
};
blocks_storage.backend = "s3";
ruler_storage = {
backend = "local";
local.directory = alerts;
};
alertmanager = {
sharding_ring = {
replication_factor = 1;
# TODO: hack
instance_interface_names = [ "enp1s0" ];
};
fallback_config_file = pkgs.writers.writeYAML "alertmanager.yaml" {
route = {
group_by = [ "alertname" ];
receiver = "irc";
};
receivers = [
{
name = "irc";
webhook_configs = [{
# Mimir can't expand environment variables in external config files,
# so work around it.
url_file = "/run/credentials/mimir.service/webhook-url";
}];
}
];
};
};
alertmanager_storage.backend = "filesystem";
ruler.alertmanager_url = "http://localhost:${toString mimirPort}/alertmanager";
};
};
systemd.services.mimir = {
# Mimir tries to determine its own IP address for gossip purposes,
# even when it's the only instance, and fails if it can't find one.
# Avoid that by ensuring it starts after the network is set up.
wants = [ "network-online.target" ];
after = [ "network-online.target" ];
serviceConfig = {
EnvironmentFile = [ config.age.secrets.mimir-environment.path ];
LoadCredential = [ "webhook-url:${config.age.secrets.mimir-webhook-url.path}" ];
};
};
infra.monitoring.grafana-agent.exporters.mimir.port = 9009;
};
}

New file (71 lines)

{ config
, lib
, ...
}:
let
cfg = config.services.depot.tempo;
inherit (lib) mkEnableOption mkIf;
in
{
options.services.depot.tempo.enable = mkEnableOption "Tempo trace store";
config = mkIf cfg.enable {
services.tempo = {
enable = true;
extraFlags = [ "--config.expand-env=true" ];
settings = {
multitenancy_enabled = false;
stream_over_http_enabled = true;
server = {
http_listen_port = 9190;
grpc_listen_port = 9195;
};
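        # OTLP/HTTP receiver; exposed externally through the authenticated
        # nginx virtual host below.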
distributor.receivers.otlp.protocols.http.endpoint = "127.0.0.1:4138";
# TODO: S3
storage.trace = {
backend = "s3";
s3 = {
endpoint = "fsn1.your-objectstorage.com";
bucket = "snix-tempo";
secret_key = "\${S3_KEY}"; # This is a secret injected via an environment variable
access_key = "\${S3_KEY_ID}";
};
wal.path = "/var/lib/tempo/traces-wal";
};
metrics_generator.storage = {
path = "/var/lib/tempo/metrics-wal";
remote_write = [
{
url = "http://127.0.0.1:9009/api/v1/push";
}
];
};
overrides.defaults.metrics_generator.processors = [ "span-metrics" ];
};
};
systemd.services.tempo.serviceConfig.EnvironmentFile = [ config.age.secrets.tempo-environment.path ];
services.nginx = {
upstreams.tempo = {
servers."${config.services.tempo.settings.distributor.receivers.otlp.protocols.http.endpoint}" = { };
extraConfig = "keepalive 16;";
};
virtualHosts."tempo.snix.dev" = {
enableACME = true;
forceSSL = true;
locations."/" = {
proxyPass = "http://tempo";
basicAuthFile = config.age.secrets.metrics-push-htpasswd.path;
};
};
};
infra.monitoring.grafana-agent.exporters.tempo.port = 9190;
};
}