chore(ops): move nixos-tvix-cache to ops/machines

Change-Id: Id112f4e9ef195f9366a11f7b0dce326e7951fb49
Reviewed-on: https://cl.snix.dev/c/snix/+/30142
Tested-by: besadii
Reviewed-by: Ryan Lahfa <masterancpp@gmail.com>
Autosubmit: Florian Klink <flokli@flokli.de>
Florian Klink 2025-03-19 10:53:07 +00:00 committed by clbot
parent 9caae9114e
commit c3de9e21eb
10 changed files with 9 additions and 17 deletions

@@ -9,4 +9,6 @@
build01
# Observability stack and internal software
meta01
# fetch-through cache for cache.nixos.org
snix-cache
])

@@ -0,0 +1 @@
zimbatm

@@ -0,0 +1,40 @@
# nixos-tvix-cache

This is a fetch-through mirror of cache.nixos.org, hosted by NumTide.

The current machine is an SX65 Hetzner dedicated server with 4x22TB SATA disks
and 2x1TB NVMe disks.

The goals of this machine:

- Exercise snix-store and nar-bridge code
- Collect usage metrics (see [Grafana](https://nixos.tvix.store/grafana))
- Identify bottlenecks in the current implementations and fix them
- Replace cache.nixos.org?

You can configure this as a Nix substituter on your systems like this:
```nix
nix.settings.substituters = [
"https://nixos.tvix.store"
];
```
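
If you only need it for a single project, the same thing can be expressed in a
flake's `nixConfig` instead (a minimal sketch; on first use Nix asks you to
accept substituters coming from flake configuration):

```nix
{
  # flake.nix (hypothetical example)
  nixConfig.extra-substituters = [ "https://nixos.tvix.store" ];

  outputs = { self, ... }: { };
}
```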

For store paths it hasn't seen yet, it internally ingests their contents into
snix-castore (deduplicating in doing so).
Requests for NARs are answered by reassembling the NAR representation on demand.

Metadata and signatures are preserved (which is why you don't need to add
additional trusted keys). We need to produce the same data bit by bit, else the
signature check in your Nix/Lix client would fail.

Be aware, however, that there are zero availability guarantees.
We will frequently redeploy this box, and it might become unavailable without
prior notice.

Snix currently doesn't have garbage collection. If we run out of disk space, we
might either move things to a bigger box or delete everything on it so far.
As it's only a cache, though, it should simply re-ingest things again.
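
Given that caveat, if you override `nix.settings.substituters` on your system
(which replaces the default list), you may want to keep cache.nixos.org in the
list as a fallback so builds keep working while this box is down:

```nix
nix.settings.substituters = [
  "https://nixos.tvix.store"
  "https://cache.nixos.org"
];
```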

@@ -0,0 +1,78 @@
{ pkgs, lib, ... }: # readTree options
{ config, ... }: # passed by module system
let
  srvos =
    import (builtins.fetchTarball {
      url = "https://github.com/nix-community/srvos/archive/15b152766b329dd2957549a49f0fd96a7a861db1.tar.gz";
      sha256 = "sha256-11TCdlxJEf84Lm2KIJGL8J2nJ2G9CNTW8PrCebJLg/M=";
    });

  disko =
    (builtins.fetchTarball {
      url = "https://github.com/nix-community/disko/archive/84dd8eea9a06006d42b8af7cfd4fda4cf334db81.tar.gz";
      sha256 = "13mfnjnjp21wms4mw35ar019775qgy3fnjc59zrpnqbkfmzyvv02";
    });
in
{
  imports = [
    "${disko}/module.nix"
    ./disko.nix
    ./monitoring.nix
    ./nar-bridge.nix
    srvos.nixosModules.hardware-hetzner-online-amd
    srvos.nixosModules.mixins-nginx
  ];

  options = {
    machine.domain = lib.mkOption {
      type = lib.types.str;
      default = "nixos.tvix.store";
    };
  };

  config = {
    services.nginx.virtualHosts."${config.machine.domain}" = {
      enableACME = true;
      forceSSL = true;
    };
    security.acme.acceptTerms = true;
    security.acme.defaults.email = "admin+acme@numtide.com";

    nixpkgs.hostPlatform = lib.mkForce "x86_64-linux";
    networking.hostName = "tvix-cache";

    systemd.network.networks."10-uplink".networkConfig.Address = "2a01:4f9:3071:1091::2/64";

    # Enable SSH and add some keys
    services.openssh.enable = true;
    users.users.root.openssh.authorizedKeys.keys = [
      # edef
      "cert-authority ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCvb/7ojfcbKvHIyjnrNUOOgzy44tCkgXY9HLuyFta1jQOE9pFIK19B4dR9bOglPKf145CCL0mSFJNNqmNwwavU2uRn+TQrW+U1dQAk8Gt+gh3O49YE854hwwyMU+xD6bIuUdfxPr+r5al/Ov5Km28ZMlHOs3FoAP0hInK+eAibioxL5rVJOtgicrOVCkGoXEgnuG+LRbOYTwzdClhRUxiPjK8alCbcJQ53AeZHO4G6w9wTr+W5ILCfvW4OmUXCX01sKzaBiQuuFCF6M/H4LlnsPWLMra2twXxkOIhZblwC+lncps9lQaUgiD4koZeOCORvHW00G0L39ilFbbnVcL6Itp/m8RRWm/xRxS4RMnsdV/AhvpRLrhL3lfQ7E2oCeSM36v1S9rdg6a47zcnpL+ahG76Gz39Y7KmVRQciNx7ezbwxj3Q5lZtFykgdfGIAN+bT8ijXMO6m68g60i9Bz4IoMZGkiJGqMYLTxMQ+oRgR3Ro5lbj7E11YBHyeimoBYXYGHMkiuxopQZ7lIj3plxIzhmUlXJBA4jMw9KGHdYaLhaicIYhvQmCTAjrkt2HvxEe6lU8iws2Qv+pB6tAGundN36RVVWAckeQPZ4ZsgDP8V2FfibZ1nsrQ+zBKqaslYMAHs01Cf0Hm0PnCqagf230xaobu0iooNuXx44QKoDnB+w== edef"
      # flokli
      "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPTVTXOutUZZjXLB0lUSgeKcSY/8mxKkC0ingGK1whD2 flokli"
      # mic92
      "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKbBp2dH2X3dcU1zh+xW3ZsdYROKpJd3n13ssOP092qE"
      "ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBCsjXKHCkpQT4LhWIdT0vDM/E/3tw/4KHTQcdJhyqPSH0FnwC8mfP2N9oHYFa2isw538kArd5ZMo5DD1ujL5dLk= ssh@secretive.Joergs-Laptop.local"
      # padraic
      "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEFlro/QUDlDpaA1AQxdWIqBg9HSFJf9Cb7CPdsh0JN7"
      # zimbatm
      "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOuiDoBOxgyer8vGcfAIbE6TC4n4jo8lhG9l01iJ0bZz zimbatm@no1"
      "sk-ssh-ed25519@openssh.com AAAAGnNrLXNzaC1lZDI1NTE5QG9wZW5zc2guY29tAAAAINwWC6CJ/E6o3WGeZxbZMajC4roXnzVi8fOo1JYJSE6YAAAABHNzaDo= zimbatm@nixos"
    ];

    environment.systemPackages = [
      pkgs.helix
      pkgs.htop
      pkgs.kitty.terminfo
      pkgs.tmux
    ];

    system.stateVersion = "24.11";
  };
}

@@ -0,0 +1,62 @@
# Disk /dev/nvme0n1: 1024 GB (=> 953 GiB)
# Disk /dev/nvme1n1: 1024 GB (=> 953 GiB)
# Disk /dev/sda: 22 TB (=> 20 TiB)
# Disk /dev/sdb: 22 TB (=> 20 TiB)
# Disk /dev/sdc: 22 TB (=> 20 TiB)
# Disk /dev/sdd: 22 TB (=> 20 TiB)
#
# # Installation
#
# 1. Comment out the fileSystems section below
# 2. Bootstrap the machine with `clan machines tvix-cache-install`
# 3. Do the btrfs partitioning by hand (because it's not supported by Disko)
#    a. `mkfs.btrfs -m raid1 -d single /dev/sd{a,b,c,d} --label tank -f`
#    b. `mkdir /tank && mount /dev/disk/by-label/tank /tank`
# 4. Uncomment the fileSystems section below
# 5. Re-deploy
#
# TODO: make use of /dev/nvme1n1
{
  boot.loader.efi.canTouchEfiVariables = true;
  boot.loader.systemd-boot.configurationLimit = 10;
  boot.loader.systemd-boot.enable = true;
  boot.loader.timeout = 3;
  boot.supportedFilesystems = [ "btrfs" ];

  # TODO: comment me during install
  fileSystems."/tank" = {
    fsType = "btrfs";
    device = "/dev/disk/by-label/tank";
  };

  disko.devices = {
    disk = {
      main = {
        type = "disk";
        device = "/dev/nvme0n1";
        content = {
          type = "gpt";
          partitions = {
            ESP = {
              size = "1G";
              type = "EF00";
              content = {
                type = "filesystem";
                format = "vfat";
                mountpoint = "/boot";
                mountOptions = [ "umask=0077" ];
              };
            };
            root = {
              size = "100%";
              content = {
                type = "filesystem";
                format = "btrfs";
                mountpoint = "/";
              };
            };
          };
        };
      };
    };
  };
}

@@ -0,0 +1,184 @@
{ config, pkgs, ... }:
let
  domain = config.machine.domain;
in
{
  # Observability stack for this machine: Alloy accepts OTLP from nar-bridge,
  # forwarding traces to Tempo and metrics to Mimir; Grafana serves both under /grafana.
  services.tempo = {
    enable = true;
    settings = {
      auth_enabled = false;
      server = {
        http_listen_address = "127.0.0.1";
        http_listen_port = 9080;
        grpc_listen_address = "127.0.0.1";
        grpc_listen_port = 9095;
        grpc_server_max_recv_msg_size = 67108864;
        grpc_server_max_send_msg_size = 67108864;
        log_level = "warn";
      };

      # Move the OTLP listener to a port other than 4317, and disable the 4318 one.
      # The OpenTelemetry collector (Alloy) binds on both 4317 and 4318.
      distributor.receivers.otlp.protocols = {
        grpc.endpoint = "127.0.0.1:4319";
      };

      storage.trace = {
        backend = "local";
        wal.path = "/var/lib/tempo/wal";
        local.path = "/var/lib/tempo/blocks";
      };

      usage_report.reporting_enabled = false;

      # bump defaults
      overrides.defaults.ingestion.max_traces_per_user = 10000 * 10;
      overrides.defaults.global.max_bytes_per_trace = 500 * 1000 * 1000;
    };
  };

  services.alloy.enable = true;
  environment.etc."alloy/config.alloy".text = ''
    // Accept OTLP. Forward metrics to Mimir, and traces to Tempo.
    otelcol.receiver.otlp "main" {
      grpc {
        endpoint = "[::1]:4317"
      }
      http {
        endpoint = "[::1]:4318"
      }
      output {
        metrics = [otelcol.exporter.otlphttp.mimir.input]
        traces = [otelcol.exporter.otlp.tempo.input]
      }
    }

    // We push to Tempo over otlp-grpc.
    otelcol.exporter.otlp "tempo" {
      client {
        endpoint = "127.0.0.1:4319"
        tls {
          insecure = true
        }
      }
    }

    // We push to Mimir over otlp-http.
    otelcol.exporter.otlphttp "mimir" {
      client {
        endpoint = "http://localhost:9009/otlp"
      }
    }

    // Run a bundled node-exporter.
    prometheus.exporter.unix "main" { }

    // Scrape it.
    prometheus.scrape "main" {
      targets = prometheus.exporter.unix.main.targets
      forward_to = [otelcol.receiver.prometheus.default.receiver]
      scrape_interval = "15s"
    }

    // Convert Prometheus metrics to OTLP and export them.
    otelcol.receiver.prometheus "default" {
      output {
        metrics = [otelcol.exporter.otlphttp.mimir.input]
      }
    }
  '';

  services.mimir.enable = true;
  services.mimir.configuration = {
    server.grpc_listen_address = "127.0.0.1";
    server.grpc_listen_port = 9096; # default 9095 conflicts with tempo
    server.http_listen_address = "127.0.0.1";
    server.http_listen_port = 9009;

    multitenancy_enabled = false;

    # https://github.com/grafana/mimir/discussions/8773
    compactor.sharding_ring.instance_addr = "127.0.0.1";
    distributor.ring.instance_addr = "127.0.0.1";
    store_gateway.sharding_ring.instance_addr = "127.0.0.1";
    ingester.ring.instance_addr = "127.0.0.1";
    ingester.ring.replication_factor = 1;
    memberlist.advertise_addr = "127.0.0.1";
  };

  services.grafana = {
    enable = true;
    settings = {
      server = {
        domain = domain;
        http_addr = "127.0.0.1";
        http_port = 3000;
        root_url = "https://%(domain)s/grafana";
        serve_from_sub_path = true;
      };
      analytics.reporting_enabled = false;
      "auth.anonymous" = {
        enabled = true;
      };
      auth.disable_login_form = true;
      "auth.basic".enabled = false;
      "auth.github" = {
        enabled = true;
        client_id = "Ov23liAnuBwzWtJJ7gv4";
        client_secret = "$__file{/run/credentials/grafana.service/github_auth_client_secret}";
        scopes = "user:email,read:org";
        auth_url = "https://github.com/login/oauth/authorize";
        token_url = "https://github.com/login/oauth/access_token";
        api_url = "https://api.github.com/user";
        allow_sign_up = true;
        auto_login = false;
        allowed_organizations = [ "numtide" ];
        role_attribute_path = "contains(groups[*], '@numtide/network') && 'GrafanaAdmin' || 'Viewer'";
      };
    };
    provision = {
      enable = true;
      datasources.settings.datasources = [
        {
          name = "Tempo";
          type = "tempo";
          uid = "traces";
          url = "http://127.0.0.1:9080";
          access = "proxy";
          timeout = "300";
          jsonData = {
            nodeGraph.enabled = true;
            # tracesToLogs.datasourceUid = "logs";
            tracesToMetrics.datasourceUid = "metrics";
            # serviceMap.datasourceUid = "metrics";
            # lokiSearch.datasourceUid = "logs";
          };
        }
        {
          name = "mimir";
          type = "prometheus";
          uid = "mimir";
          url = "http://localhost:9009/prometheus";
          jsonData = {
            timeInterval = "15s";
          };
        }
      ];
    };
  };

  systemd.services.grafana.serviceConfig.LoadCredential = "github_auth_client_secret:/etc/secrets/grafana_github_auth_client_secret";

  services.nginx.virtualHosts."${domain}".locations."/grafana" = {
    proxyPass = "http://localhost:3000";
    proxyWebsockets = true;
  };
}
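
This file only sets up the receiving side of the pipeline; how nar-bridge ships
its telemetry to the local Alloy listener is not part of this change. Assuming
it honours the standard OpenTelemetry SDK environment variables, pointing it at
Alloy could look roughly like this (hypothetical sketch, untested):

```nix
{
  # Assumption: nar-bridge reads OTEL_EXPORTER_OTLP_ENDPOINT; adjust if it
  # exposes its own flag or defaults to localhost:4317 already.
  systemd.services.nar-bridge.environment = {
    OTEL_EXPORTER_OTLP_ENDPOINT = "http://[::1]:4317";
  };
}
```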

@@ -0,0 +1,76 @@
{ config
, lib
, utils
, pkgs
, depot
, ...
}:
let
  cfg = config.services.nar-bridge;

  package = depot.snix.nar-bridge.override (old: {
    features = old.features or [ "default" ] ++ [ "xp-store-composition-cli" ];
    runTests = true;
  });

  storeCompositionFormat = pkgs.formats.toml { };
  storeCompositionFile = storeCompositionFormat.generate "store-composition.toml" cfg.settings;

  args = [
    "--listen-address"
    "sd-listen"
    "--experimental-store-composition"
    storeCompositionFile
  ];
in
{
  options = {
    services.nar-bridge = {
      enable = lib.mkEnableOption "nar-bridge service";

      settings = lib.mkOption {
        type = storeCompositionFormat.type;
        default = { };
      };
    };
  };

  config = lib.mkIf cfg.enable {
    users.users.nar-bridge = {
      isSystemUser = true;
      group = "nar-bridge";
    };
    users.groups.nar-bridge = { };

    systemd.sockets.nar-bridge = {
      description = "nar-bridge socket";
      wantedBy = [ "sockets.target" ];
      socketConfig = {
        LimitNOFILE = 65535;
        ListenStream = "/run/nar-bridge.sock";
        SocketMode = "0666";
        SocketUser = "root";
      };
    };

    systemd.services.nar-bridge = {
      description = "NAR Bridge";
      requires = [ "nar-bridge.socket" ];
      after = [ "nar-bridge.socket" ];
      wantedBy = [ "multi-user.target" ];

      serviceConfig = {
        ExecStart = "${package}/bin/nar-bridge ${utils.escapeSystemdExecArgs args}";
        Restart = "always";
        RestartSec = "10";
        User = "nar-bridge";
        Group = "nar-bridge";
        StateDirectory = "nar-bridge";
      };
    };
  };
}

@@ -0,0 +1,125 @@
{ config, pkgs, ... }:
{
  imports = [ ./nar-bridge-module.nix ];

  # Microbenchmark:
  # hyperfine --warmup 1 'rm -rf /tmp/cache; nix copy --from https://nixos.tvix.store/ --to "file:///tmp/cache?compression=none" /nix/store/jlkypcf54nrh4n6r0l62ryx93z752hb2-firefox-132.0'
  services.nginx = {
    package = pkgs.nginxStable;

    virtualHosts.${config.machine.domain} = {
      # Serve the rendered README at the root.
      locations."=/" = {
        tryFiles = "$uri $uri/index.html =404";
        root = pkgs.runCommand "index"
          {
            nativeBuildInputs = [ pkgs.markdown2html-converter ];
          } ''
          mkdir -p $out
          markdown2html-converter ${./README.md} -o $out/index.html
        '';
      };
      locations."/" = {
        proxyPass = "http://unix:/run/nar-bridge.sock:/";
        extraConfig = ''
          # Sometimes it takes a while to download and unpack from upstream.
          proxy_read_timeout 180s;

          # Restrict allowed HTTP methods
          limit_except GET HEAD {
            # nar-bridge allows uploading NARs via PUT
            deny all;
          }

          # Propagate content-encoding to the backend
          proxy_set_header Accept-Encoding $http_accept_encoding;

          # Enable proxy cache
          proxy_cache nar-bridge;
          proxy_cache_key "$scheme$proxy_host$request_uri";
          proxy_cache_valid 200 301 302 10m; # Cache responses for 10 minutes
          proxy_cache_valid 404 1m; # Cache 404 responses for 1 minute
          proxy_cache_min_uses 2; # Cache only if the object is requested at least twice
          proxy_cache_use_stale error timeout updating;
        '';
      };
    };

    # use more cores for compression
    appendConfig = ''
      worker_processes auto;
    '';

    proxyCachePath."nar-bridge" = {
      enable = true;
      levels = "1:2";
      keysZoneName = "nar-bridge";
      # Put our 1TB NVMe to good use
      maxSize = "200G";
      inactive = "10d";
      useTempPath = false;
    };
  };

  services.nar-bridge = {
    enable = true;

    # Store composition: blobs live in an object store on /tank, directories and
    # path infos in redb on the SSD; path infos are looked up locally first and
    # otherwise fetched through from cache.nixos.org.
    settings = {
      blobservices = {
        root = {
          type = "objectstore";
          object_store_url = "file:///tank/nar-bridge/blobs.object_store";
          object_store_options = { };
        };
      };

      directoryservices = {
        root = {
          type = "redb";
          is_temporary = false;
          path = "/var/lib/nar-bridge/directories.redb";
        };
      };

      pathinfoservices = {
        root = {
          type = "cache";
          near = "redb";
          far = "cache-nixos-org";
        };

        redb = {
          type = "redb";
          is_temporary = false;
          path = "/var/lib/nar-bridge/pathinfo.redb";
        };

        "cache-nixos-org" = {
          type = "nix";
          base_url = "https://cache.nixos.org";
          blob_service = "root";
          directory_service = "root";
          public_keys = [
            "cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY="
          ];
        };
      };
    };
  };

  systemd.tmpfiles.rules = [
    # Put the blobs on the big disk
    "d /tank/nar-bridge 0755 nar-bridge nar-bridge -"
    "d /tank/nar-bridge/blobs.object_store 0755 nar-bridge nar-bridge -"
    # Cache responses on NVMe
    "d /var/cache/nginx 0755 ${config.services.nginx.user} ${config.services.nginx.group} -"
  ];

  systemd.services.nar-bridge = {
    unitConfig = {
      # Most data lives on the SSD under /var/lib/nar-bridge; the blobs live on
      # /tank, so require that mount before starting.
      RequiresMountsFor = "/tank";
    };

    # twice the normal allowed limit, same as nix-daemon
    serviceConfig.LimitNOFILE = "1048576";
  };
}