feat(*): initialize new Snix infrastructure
Co-Authored-By: edef <edef@edef.eu> Co-Authored-by: Ryan Lahfa <raito@lix.systems> Change-Id: Ica1cda177a236814de900f50a8a61d288f58f519
This commit is contained in:
parent
067eff3427
commit
a52ea3675c
124 changed files with 27723 additions and 1631 deletions
132
ops/modules/o11y/agent.nix
Normal file
132
ops/modules/o11y/agent.nix
Normal file
|
|
@ -0,0 +1,132 @@
|
|||
# Grafana Agent module.
#
# Enabled by default. Pushes metrics to the remote-write endpoint configured
# below and scrapes every exporter registered under
# `infra.monitoring.grafana-agent.exporters.<job_name>`.
{ depot
, config
, lib
, ...
}:
let
  cfg = config.infra.monitoring.grafana-agent;
  inherit (lib) mkEnableOption mkOption mkIf types;

  # Builds a path below `${CREDENTIALS_DIRECTORY}`. The `$` is escaped so that
  # Nix emits the variable reference literally instead of interpolating it.
  # NOTE(review): presumably expanded by the grafana-agent service at runtime
  # (systemd credentials) — confirm against the upstream NixOS module.
  credentialPath = fileName: "\${CREDENTIALS_DIRECTORY}/${fileName}";

  passwordAsCredential = credentialPath "password";

  # Submodule describing one exporter. `name` is the attribute name under
  # `exporters` and doubles as the Prometheus `job_name`.
  exporterModule = { config, name, ... }:
    let
      hasBearerToken = config.bearerTokenFile != null;
      bearerTokenCredential = "${name}-bearer-token";
    in
    {
      options = {
        port = mkOption {
          description = "Exporter port";
          # `types.port` rejects values outside 0-65535 at evaluation time,
          # which a plain `types.int` would silently accept.
          type = types.port;
        };

        bearerTokenFile = mkOption {
          description = "File containing a bearer token";
          type = types.nullOr types.path;
          default = null;
        };

        scrapeConfig = mkOption {
          description = "Prometheus scrape config";
          type = types.attrs;
        };

        # Credentials this exporter needs; collected into
        # `services.grafana-agent.credentials` by the outer module.
        secrets = mkOption {
          description = "Secrets required for scrape config";
          type = types.attrs;
          internal = true;
          default = { };
        };
      };

      config = {
        scrapeConfig = lib.mkMerge [
          {
            job_name = name;
            static_configs = [
              { targets = [ "localhost:${toString config.port}" ]; }
            ];
          }
          # Only reference a credentials file when a token was configured.
          (mkIf hasBearerToken {
            authorization.credentials_file = credentialPath bearerTokenCredential;
          })
        ];

        secrets = mkIf hasBearerToken {
          "${bearerTokenCredential}" = config.bearerTokenFile;
        };
      };
    };
in
{
  options.infra.monitoring.grafana-agent = {
    enable = (mkEnableOption "Grafana Agent") // { default = true; };

    exporters = mkOption {
      description = ''
        Set of additional exporters to scrape.

        The attribute name will be used as `job_name`
        internally, which ends up exported as `job` label
        on all metrics of that exporter.
      '';
      type = types.attrsOf (types.submodule exporterModule);
      default = { };
    };
  };

  config = mkIf cfg.enable {
    age.secrets.grafana-agent-password.file = depot.ops.secrets."grafana-agent-password.age";

    services.grafana-agent = {
      enable = true;

      # The remote-write password plus whatever each exporter declared.
      credentials = lib.mkMerge (
        [{ password = config.age.secrets.grafana-agent-password.path; }]
        ++ lib.mapAttrsToList (_: exporter: exporter.secrets) cfg.exporters
      );

      settings = {
        metrics = {
          global = {
            remote_write = [
              {
                url = "https://mimir.snix.dev/api/v1/push";
                basic_auth = {
                  username = "promtail";
                  password_file = passwordAsCredential;
                };
              }
            ];
            external_labels.hostname = config.networking.hostName;
          };

          configs = [
            {
              name = config.networking.hostName;
              scrape_configs = lib.mapAttrsToList (_: exporter: exporter.scrapeConfig) cfg.exporters;
            }
          ];
        };

        # Log shipping is currently disabled; kept for reference.
        # logs = {
        #   global.clients = [
        #     {
        #       url = "https://loki.forkos.org/loki/api/v1/push";
        #       basic_auth = {
        #         username = "promtail";
        #         password_file = passwordAsCredential;
        #       };
        #     }
        #   ];
        #   configs = [
        #     {
        #       name = "journald";
        #       scrape_configs = [
        #         {
        #           job_name = "system";
        #           journal = {
        #             max_age = "12h";
        #             labels = {
        #               job = "systemd-journal";
        #               host = config.networking.hostName;
        #             };
        #           };
        #           relabel_configs = [
        #             {
        #               source_labels = [ "__journal__systemd_unit" ];
        #               target_label = "unit";
        #             }
        #           ];
        #         }
        #       ];
        #     }
        #   ];
        #   positions_directory = "\${STATE_DIRECTORY}/positions";
        # };

        integrations.node_exporter.enable_collectors = [
          "processes"
          "systemd"
        ];
      };
    };
  };
}
|
||||
20
ops/modules/o11y/alertmanager-irc-relay.nix
Normal file
20
ops/modules/o11y/alertmanager-irc-relay.nix
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
# Relays Alertmanager notifications to IRC using the module shipped in
# `depot.third_party.alertmanager-irc-relay`.
{ config, depot, ... }:
let
  # The channel password is written as a literal `$CHANNEL_PASSWORD`
  # placeholder.
  # NOTE(review): presumably substituted from the environment file below by
  # the relay module — confirm against that module.
  channels = [
    { name = "#snix"; password = "$CHANNEL_PASSWORD"; }
  ];
in
{
  imports = [ depot.third_party.alertmanager-irc-relay.module ];

  services.alertmanager-irc-relay.enable = true;

  services.alertmanager-irc-relay.settings = {
    irc_host = "irc.hackint.org";
    irc_port = 6697;
    irc_nickname = "silentfox";
    irc_channels = channels;
  };

  services.alertmanager-irc-relay.environmentFiles = [
    config.age.secrets.alertmanager-irc-relay-environment.path
  ];
}
|
||||
0
ops/modules/o11y/alerts/.gitkeep
Normal file
0
ops/modules/o11y/alerts/.gitkeep
Normal file
148
ops/modules/o11y/grafana.nix
Normal file
148
ops/modules/o11y/grafana.nix
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
# Grafana frontend: serves dashboards on a loopback port, stores its state in
# a local PostgreSQL database and authenticates users through generic OAuth.
{ depot
, config
, lib
, ...
}:
let
  inherit (lib) mkEnableOption mkIf;
  cfg = config.services.depot.grafana;

  # Port Grafana listens on; also the port registered with the metrics agent.
  grafanaPort = 2342;

  publicDomain = "status.snix.dev";

  # Common prefix of the OpenID Connect endpoints used below.
  oidcBase = "https://auth.snix.dev/realms/snix-project/protocol/openid-connect";

  # Every provisioned datasource is reached through the Grafana backend.
  proxied = attrs: { access = "proxy"; } // attrs;

  datasources = [
    (proxied {
      name = "Mimir";
      type = "prometheus";
      uid = "mimir";
      url = "http://mimir.snix.dev:9009/prometheus";
      isDefault = true;
    })
    (proxied {
      name = "Loki";
      type = "loki";
      uid = "loki";
      url = "http://loki.snix.dev:9090/";
    })
    (proxied {
      name = "Tempo";
      type = "tempo";
      uid = "tempo";
      url = "http://tempo.snix.dev:9190";
      jsonData.streamingEnabled.search = true;
    })
    (proxied {
      name = "Mimir Alertmanager";
      type = "alertmanager";
      uid = "mimir-alertmanager";
      url = "http://mimir.snix.dev:9009/";
      jsonData = {
        handleGrafanaManagedAlerts = true;
        implementation = "mimir";
      };
    })

    # {
    #   name = "Pyroscope";
    #   type = "grafana-pyroscope-datasource";
    #   uid = "pyroscope";
    #   access = "proxy";
    #   url = "http://127.0.0.1:4040";
    # }
  ];
in
{
  options.services.depot.grafana.enable = mkEnableOption "Grafana frontend";

  config = mkIf cfg.enable {
    services.grafana = {
      enable = true;

      settings = {
        server = {
          domain = publicDomain;
          http_addr = "127.0.0.1";
          http_port = grafanaPort;
          root_url = "https://${publicDomain}/";
        };

        # Connect to PostgreSQL over its local socket directory.
        database = {
          type = "postgres";
          user = "grafana";
          host = "/run/postgresql";
        };

        # Unauthenticated visitors get read-only access.
        "auth.anonymous" = {
          enabled = true;
          org_name = "Main Org.";
          org_role = "Viewer";
        };

        "auth.generic_oauth" = {
          enabled = true;

          name = "snix SSO";
          client_id = "grafana";
          # `$__file{...}` makes Grafana read the secret from the given path.
          client_secret = "$__file{${config.age.secrets.grafana-oauth-secret.path}}";

          auth_url = "${oidcBase}/auth";
          token_url = "${oidcBase}/token";
          api_url = "${oidcBase}/userinfo";

          login_attribute_path = "username";
          email_attribute_path = "email";
          name_attribute_path = "full_name";

          scopes = [
            "openid"
            "profile"
            "email"
            "offline_access"
            "roles"
          ];

          allow_sign_up = true;
          auto_login = true;
          allow_assign_grafana_admin = true;

          # Map the `grafana_roles` claim onto a Grafana role, defaulting to
          # Viewer when neither Admin nor Editor is present.
          role_attribute_path = "contains(grafana_roles[*], 'Admin') && 'GrafanaAdmin' || contains(grafana_roles[*], 'Editor') && 'Editor' || 'Viewer'";
        };

        dashboards.default_home_dashboard_path = "${depot.ops.dashboards.node_exporter}";

        feature_toggles.enable = "autoMigrateOldPanels newVizTooltips";
        security.angular_support_enabled = false;
      };

      provision = {
        dashboards.settings = {
          apiVersion = 1;
          providers = [
            {
              name = "default";
              options.path = depot.ops.dashboards.all;
            }
          ];
        };

        datasources.settings = {
          apiVersion = 1;
          inherit datasources;
        };
      };
    };

    # Database and role for the `database` settings above.
    services.postgresql = {
      ensureDatabases = [ "grafana" ];
      ensureUsers = [
        {
          name = "grafana";
          ensureDBOwnership = true;
        }
      ];
    };

    infra.monitoring.grafana-agent.exporters.grafana.port = grafanaPort;
  };
}
|
||||
90
ops/modules/o11y/loki.nix
Normal file
90
ops/modules/o11y/loki.nix
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
# Loki log store: single instance, TSDB index, chunks in S3-compatible object
# storage, one week of retention.
{ config
, lib
, ...
}:
let
  inherit (lib) mkEnableOption mkIf;
  cfg = config.services.depot.loki;

  # HTTP port Loki listens on; also registered with the metrics agent.
  lokiHttpPort = 9090;

  # 16M
  grpcMaxMessageSize = 16777216;

  stateDir = "/var/lib/loki";

  # The `${...}` references are escaped so Nix emits them literally; Loki is
  # started with `--config.expand-env` and the values come from the
  # environment file attached to the service below.
  objectStorage = {
    endpoint = "fsn1.your-objectstorage.com";
    region = "fsn1";
    bucketnames = "snix-loki";
    secret_access_key = "\${S3_KEY}"; # This is a secret injected via an environment variable
    access_key_id = "\${S3_KEY_ID}";
    s3forcepathstyle = true;
  };
in
{
  options.services.depot.loki.enable = mkEnableOption "Loki storage";

  config = mkIf cfg.enable {
    services.loki.enable = true;
    services.loki.extraFlags = [ "--config.expand-env" ];

    services.loki.configuration = {
      auth_enabled = false;

      server = {
        http_listen_port = lokiHttpPort;
        grpc_listen_port = 9096;

        grpc_server_max_recv_msg_size = grpcMaxMessageSize;
        grpc_server_max_send_msg_size = grpcMaxMessageSize;
      };

      common = {
        storage.s3 = objectStorage;
        replication_factor = 1;
        ring = {
          kvstore.store = "memberlist";
          # TODO: Such an ugly hack.
          instance_interface_names = [ "enp1s0" "lo" ];
        };
      };

      memberlist = {
        advertise_addr = "127.0.0.1";
        cluster_label = "snix";
        bind_port = 7947;
        advertise_port = 7947;
      };

      storage_config.tsdb_shipper = {
        active_index_directory = "${stateDir}/index";
        cache_location = "${stateDir}/cache";
      };

      compactor = {
        working_directory = "${stateDir}/compactor";
        compaction_interval = "10m";
        retention_enabled = true;
        retention_delete_delay = "1s";
        retention_delete_worker_count = 150;
        delete_request_store = "filesystem";
      };

      limits_config.retention_period = "1w";

      schema_config.configs = [
        {
          from = "2024-07-01";
          store = "tsdb";
          object_store = "s3";
          schema = "v13";
          index = {
            prefix = "index_";
            period = "24h";
          };
        }
      ];
    };

    # Supplies the S3 credentials referenced in `objectStorage`.
    systemd.services.loki.serviceConfig.EnvironmentFile = [ config.age.secrets.loki-environment.path ];

    infra.monitoring.grafana-agent.exporters.loki.port = lokiHttpPort;
  };
}
|
||||
123
ops/modules/o11y/mimir.nix
Normal file
123
ops/modules/o11y/mimir.nix
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
# Mimir metrics store: single instance running all components plus the
# built-in Alertmanager, with blocks in S3-compatible object storage and
# alerting rules loaded from ./alerts.
{ config
, lib
, pkgs
, ...
}:
let
  cfg = config.services.depot.prometheus;
  inherit (lib) mkEnableOption mkIf;

  mimirPort = config.services.mimir.configuration.server.http_listen_port;

  # TODO: Such an ugly hack.
  ringInterfaces = [ "enp1s0" "lo" ];

  # Validates the rule files at build time and lays them out the way the
  # local ruler storage is configured below: one directory per tenant, with
  # `anonymous` as the tenant name.
  alerts = pkgs.runCommand "mimir-alerts-checked"
    {
      src = ./alerts;
      nativeBuildInputs = with pkgs; [ prometheus.cli ];
    } ''
    # The alerts directory may hold no rule files yet (e.g. only a .gitkeep,
    # which `*` does not match). Without nullglob the unmatched pattern would
    # be passed to promtool literally, so collect the matches first and skip
    # the check when there is nothing to validate.
    shopt -s nullglob
    ruleFiles=("$src"/*)
    if (( ''${#ruleFiles[@]} > 0 )); then
      promtool check rules "''${ruleFiles[@]}"
    fi
    mkdir $out
    cp -R $src $out/anonymous/
  '';
in
{
  options.services.depot.prometheus.enable = mkEnableOption "Prometheus scraper";

  config = mkIf cfg.enable {
    services.mimir = {
      enable = true;
      extraFlags = [ "--config.expand-env=true" ];
      configuration = {
        target = "all,alertmanager";

        multitenancy_enabled = false;

        common.storage = {
          backend = "s3";
          s3 = {
            endpoint = "fsn1.your-objectstorage.com";
            bucket_name = "snix-mimir";
            # The `${...}` references are escaped so Nix emits them literally;
            # they are filled in from the service's environment file.
            secret_access_key = "\${S3_KEY}"; # This is a secret injected via an environment variable
            access_key_id = "\${S3_KEY_ID}";
          };
        };

        distributor.ring.instance_interface_names = ringInterfaces;
        ingester.ring.instance_interface_names = ringInterfaces;
        frontend.instance_interface_names = ringInterfaces;
        query_scheduler.ring.instance_interface_names = ringInterfaces;
        ruler.ring.instance_interface_names = ringInterfaces;
        compactor.sharding_ring.instance_interface_names = ringInterfaces;
        store_gateway.sharding_ring.instance_interface_names = ringInterfaces;

        memberlist = {
          advertise_addr = "127.0.0.1";
          cluster_label = "snix";
        };

        server = {
          http_listen_port = 9009;
          grpc_server_max_recv_msg_size = 104857600;
          grpc_server_max_send_msg_size = 104857600;
          grpc_server_max_concurrent_streams = 1000;
        };

        ingester.ring.replication_factor = 1;

        distributor.instance_limits.max_ingestion_rate = 0; # unlimited
        limits = {
          ingestion_rate = 1000000; # can't set to unlimited :(
          out_of_order_time_window = "12h";
          max_global_series_per_user = 0; # unlimited
        };

        blocks_storage.backend = "s3";
        ruler_storage = {
          backend = "local";
          local.directory = alerts;
        };

        alertmanager = {
          sharding_ring = {
            replication_factor = 1;
            # TODO: hack
            instance_interface_names = [ "enp1s0" ];
          };
          fallback_config_file = pkgs.writers.writeYAML "alertmanager.yaml" {
            route = {
              group_by = [ "alertname" ];
              receiver = "irc";
            };
            receivers = [
              {
                name = "irc";
                webhook_configs = [{
                  # Mimir can't expand environment variables in external config files,
                  # so work around it.
                  url_file = "/run/credentials/mimir.service/webhook-url";
                }];
              }
            ];
          };
        };
        alertmanager_storage.backend = "filesystem";

        ruler.alertmanager_url = "http://localhost:${toString mimirPort}/alertmanager";
      };
    };

    systemd.services.mimir = {
      # Mimir tries to determine its own IP address for gossip purposes,
      # even when it's the only instance, and fails if it can't find one.
      # Avoid that by ensuring it starts after the network is set up.
      wants = [ "network-online.target" ];
      after = [ "network-online.target" ];
      serviceConfig = {
        EnvironmentFile = [ config.age.secrets.mimir-environment.path ];
        # Exposes the webhook URL at the `url_file` path used above.
        LoadCredential = [ "webhook-url:${config.age.secrets.mimir-webhook-url.path}" ];
      };
    };

    # Reuse the configured listen port instead of repeating the literal.
    infra.monitoring.grafana-agent.exporters.mimir.port = mimirPort;
  };
}
|
||||
71
ops/modules/o11y/tempo.nix
Normal file
71
ops/modules/o11y/tempo.nix
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
# Tempo trace store: traces in S3-compatible object storage, span metrics
# pushed to a remote-write endpoint, and the OTLP/HTTP receiver published
# through nginx behind basic auth.
{ config
, lib
, ...
}:
let
  inherit (lib) mkEnableOption mkIf;
  cfg = config.services.depot.tempo;

  # HTTP port Tempo listens on; also registered with the metrics agent.
  tempoHttpPort = 9190;

  stateDir = "/var/lib/tempo";

  # Read back from the final Tempo settings so the nginx upstream always
  # points at whatever the OTLP/HTTP receiver is bound to.
  otlpHttpEndpoint = config.services.tempo.settings.distributor.receivers.otlp.protocols.http.endpoint;
in
{
  options.services.depot.tempo.enable = mkEnableOption "Tempo trace store";

  config = mkIf cfg.enable {
    services.tempo.enable = true;
    services.tempo.extraFlags = [ "--config.expand-env=true" ];

    services.tempo.settings = {
      multitenancy_enabled = false;
      stream_over_http_enabled = true;

      server = {
        http_listen_port = tempoHttpPort;
        grpc_listen_port = 9195;
      };

      distributor.receivers.otlp.protocols.http.endpoint = "127.0.0.1:4138";

      # Traces live in object storage; only the write-ahead log is local.
      storage.trace = {
        backend = "s3";
        wal.path = "${stateDir}/traces-wal";
        s3 = {
          endpoint = "fsn1.your-objectstorage.com";
          bucket = "snix-tempo";
          # The `${...}` references are escaped so Nix emits them literally;
          # they are filled in from the service's environment file.
          secret_key = "\${S3_KEY}"; # This is a secret injected via an environment variable
          access_key = "\${S3_KEY_ID}";
        };
      };

      metrics_generator.storage = {
        path = "${stateDir}/metrics-wal";
        # NOTE(review): assumes a metrics remote-write endpoint is listening
        # on this host at port 9009 — confirm for hosts enabling Tempo.
        remote_write = [
          { url = "http://127.0.0.1:9009/api/v1/push"; }
        ];
      };

      overrides.defaults.metrics_generator.processors = [ "span-metrics" ];
    };

    # Supplies the S3 credentials referenced in the storage settings.
    systemd.services.tempo.serviceConfig.EnvironmentFile = [ config.age.secrets.tempo-environment.path ];

    services.nginx.upstreams.tempo = {
      servers."${otlpHttpEndpoint}" = { };
      extraConfig = "keepalive 16;";
    };

    services.nginx.virtualHosts."tempo.snix.dev" = {
      enableACME = true;
      forceSSL = true;
      locations."/" = {
        proxyPass = "http://tempo";
        basicAuthFile = config.age.secrets.metrics-push-htpasswd.path;
      };
    };

    infra.monitoring.grafana-agent.exporters.tempo.port = tempoHttpPort;
  };
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue