diff --git a/ops/builderball/README.md b/ops/builderball/README.md new file mode 100644 index 000000000..cefb03c88 --- /dev/null +++ b/ops/builderball/README.md @@ -0,0 +1,79 @@ +builderball +=========== + +*A friendly game between Nix caches.* + +Builderball acts as a Nix cache to Nix clients, but behind the scenes it +connects to a set of caches and redirects the client to the first available +cache for each `narinfo`. + +There are two primary use-cases for this: + +1. Fronting multiple different Nix caches (e.g. for round-robin load balancing, + or to serve multiple separate caches at one address). + +2. Distributing artifacts between multiple active Nix builders that connect to + each other to find already built artifacts. + +Builderball is tested with caches backed by +[Harmonia](https://github.com/nix-community/harmonia), but other caches (the +upstream binary cache, Cachix, etc.) should also work fine. + +TVL uses Builderball to have builders dynamically join the CI pool and +distribute intermediate outputs between each other. It does not, however, +concern itself with preventing concurrent builds of the same output. + +Builderball supports tag-based discovery of Nix caches on Tailscale networks. +TVL runs a [Headscale](https://headscale.net/) network for this purpose. + +## Requirements + +Builderball should run anywhere that Go can produce working binaries. It does, +however, impose several restrictions in order for the configuration to be valid: + +* All clients that can reach Builderball **must** be able to reach all + caches that it connects to under the **same** addresses. + + Builderball works by rewriting the first discovered `narinfo` for a given + store path, replacing its NAR URL with an absolute URL pointing towards the + address of that cache. If a client can connect to Builderball, but not to the + cache, it might end up receiving a `narinfo` with an unreachable URL. 
+ +* *Either* all caches must respond correctly to the default `Host` header set + when using the addresses configured in/discovered by Builderball, *or* all + caches must respond to the **same** `Host` header configured with the + `-cache-host` flag. + +* All discovered caches **must** listen on the same port, configured by the + `-cache-port` flag. This restriction does not apply to statically configured + caches. + + +## Usage + +``` +Usage of ./builderball: + + -cache value + Upstream cache URL (can be specified multiple times) + -cache-host string + Host header to send to each binary cache + -cache-port int + port at which to connect to binary cache on each node (default 80) + -debug + whether debug logging should be enabled + -json + whether logging should be in JSON format + -host string + host on which to listen for incoming requests (default "localhost") + -port int + port on which to listen for incoming requests (default 2243) + -priority int + Nix cache priority with which to serve clients (default 50) + -tailscale + whether caches should be discovered on Tailscale + -tailscale-tag string + Tailscale tag to use for discovery (default "tag:nix-cache") + -ticker int + interval in seconds between statistics tickers (default 5) +``` diff --git a/ops/builderball/config/config.go b/ops/builderball/config/config.go new file mode 100644 index 000000000..07c82aa26 --- /dev/null +++ b/ops/builderball/config/config.go @@ -0,0 +1,42 @@ +package config + +import ( + "flag" + "strings" +) + +var ( + Host = flag.String("host", "localhost", "host on which to listen for incoming requests") + Port = flag.Int("port", 2243, "port on which to listen for incoming requests") + Debug = flag.Bool("debug", false, "whether debug logging should be enabled") + JSON = flag.Bool("json", false, "whether logging should be in JSON format") + Ticker = flag.Int("ticker", 5, "interval in seconds between statistics tickers") + Priority = flag.Int("priority", 50, "Nix cache priority with 
which to serve clients") + + Tailscale = flag.Bool("tailscale", false, "whether caches should be discovered on Tailscale") + TSTag = flag.String("tailscale-tag", "tag:nix-cache", "Tailscale tag to use for discovery") + + CachePort = flag.Int("cache-port", 80, "port at which to connect to binary cache on each node") + CacheHost = flag.String("cache-host", "", "Host header to send to each binary cache") + + Caches []string +) + +type stringSliceFlag []string + +func (s *stringSliceFlag) String() string { + if len(*s) == 0 { + return "[ ]" + } + + return "[ " + strings.Join(*s, " ") + " ]" +} + +func (s *stringSliceFlag) Set(value string) error { + *s = append(*s, value) + return nil +} + +func init() { + flag.Var((*stringSliceFlag)(&Caches), "cache", "Upstream cache URL (can be specified multiple times)") +} diff --git a/ops/builderball/default.nix b/ops/builderball/default.nix new file mode 100644 index 000000000..69dbc7027 --- /dev/null +++ b/ops/builderball/default.nix @@ -0,0 +1,8 @@ +{ depot, pkgs, ... }: + +pkgs.buildGoModule { + name = "builderball"; + src = depot.third_party.gitignoreSource ./.; + vendorHash = "sha256:1prdkm05bdbyinwwgrbwl8pazayg5cp61dlkmygxwbp880zxpqfm"; + meta.description = "Nix cache proxy which forwards clients to the first available cache"; +} diff --git a/ops/builderball/discovery/caches.go b/ops/builderball/discovery/caches.go new file mode 100644 index 000000000..285a143bb --- /dev/null +++ b/ops/builderball/discovery/caches.go @@ -0,0 +1,97 @@ +// Package discovery provides logic for discovering the current set of available +// caches through Tailscale tags. +package discovery + +import ( + "context" + "fmt" + "log/slog" + "math/rand" + "sync" + "time" + + "tailscale.com/client/tailscale" + + "tvl.fyi/ops/builderball/config" +) + +// GetCaches returns the currently known set of caches, updating it if required. +// +// If cached data is stale but an update fails, the stale data is returned +// anyways. 
There is a fairly high chance that one or more of the known caches +// are still alive in case of transient Tailscale issues. +func GetCaches() []string { + return caches.get() +} + +// cache holds the most recently discovered cache URLs and the time they were +// last refreshed. lock guards both fields. +type cache struct { + lock sync.RWMutex + caches []string + updated time.Time +} + +// caches is the package-wide discovery state served by GetCaches. +var caches *cache = new(cache) + +// update rebuilds the cache list from the statically configured caches plus, if +// Tailscale discovery is enabled, every other online peer that carries the +// configured tag and has a Tailscale IP. It stores a shuffled copy of that list +// together with the current time and returns the list. +// +// If the Tailscale status query fails, the error is returned and the previously +// stored list and timestamp are left untouched. +func (c *cache) update() ([]string, error) { + c.lock.Lock() + defer c.lock.Unlock() + + found := make([]string, len(config.Caches)) + copy(found, config.Caches) + + if *config.Tailscale { + client := tailscale.LocalClient{} + status, err := client.Status(context.Background()) + if err != nil { + slog.Error("failed to get tailscale status", "error", err.Error()) + return nil, err + } + + for _, peer := range status.Peer { + if peer.Online && peer.Tags != nil && status.Self != peer && len(peer.TailscaleIPs) > 0 { + for _, tag := range peer.Tags.All() { + if tag == *config.TSTag { + ip := peer.TailscaleIPs[0].String() + slog.Debug("discovered cache on tailscale", "host", peer.HostName, "ip", ip) + found = append(found, fmt.Sprintf("http://%s:%d", ip, *config.CachePort)) + } + } + } + } + } + + // shuffle order of elements to avoid sending everything to the first + // configured one for popular packages + rand.Shuffle(len(found), func(i, j int) { + found[i], found[j] = found[j], found[i] + }) + + c.updated = time.Now() + c.caches = make([]string, len(found)) + copy(c.caches, found) + slog.Debug("updated discovered caches", "caches", found) + + return found, nil +} + +// get returns a copy of the stored cache list. If the list is older than 30 +// seconds it is refreshed first; if that refresh fails, the stale copy is +// returned instead. +// +// NOTE(review): every caller that observes a stale timestamp calls update, so +// concurrent requests may each re-run discovery one after another - confirm +// whether that is acceptable. +func (c *cache) get() []string { + c.lock.RLock() + cached := make([]string, len(c.caches)) + copy(cached, c.caches) + updated := c.updated + c.lock.RUnlock() + + if time.Since(updated) > 30*time.Second { + slog.Debug("discovery cache is stale; updating") + result, err := c.update() + if err != nil { + // return stale results; worth trying anyways + slog.Debug("returning stale discovery cache results") + return cached + } + + return result + } + + return cached +} diff --git a/ops/builderball/go.mod 
b/ops/builderball/go.mod new file mode 100644 index 000000000..f795e5df4 --- /dev/null +++ b/ops/builderball/go.mod @@ -0,0 +1,31 @@ +module tvl.fyi/ops/builderball + +go 1.23.4 + +require ( + filippo.io/edwards25519 v1.1.0 // indirect + github.com/akutz/memconn v0.1.0 // indirect + github.com/alexbrainman/sspi v0.0.0-20231016080023-1a75b4708caa // indirect + github.com/dblohm7/wingoes v0.0.0-20240119213807-a09d6be7affa // indirect + github.com/fxamacker/cbor/v2 v2.6.0 // indirect + github.com/go-json-experiment/json v0.0.0-20231102232822-2e55bd4e08b0 // indirect + github.com/google/go-cmp v0.6.0 // indirect + github.com/hdevalence/ed25519consensus v0.2.0 // indirect + github.com/josharian/native v1.1.1-0.20230202152459-5c7d0dd6ab86 // indirect + github.com/jsimonetti/rtnetlink v1.4.0 // indirect + github.com/mdlayher/netlink v1.7.2 // indirect + github.com/mdlayher/socket v0.5.0 // indirect + github.com/mitchellh/go-ps v1.0.0 // indirect + github.com/tailscale/go-winio v0.0.0-20231025203758-c4f33415bf55 // indirect + github.com/x448/float16 v0.8.4 // indirect + go4.org/mem v0.0.0-20220726221520-4f986261bf13 // indirect + go4.org/netipx v0.0.0-20231129151722-fdeea329fbba // indirect + golang.org/x/crypto v0.25.0 // indirect + golang.org/x/exp v0.0.0-20240119083558-1b970713d09a // indirect + golang.org/x/net v0.27.0 // indirect + golang.org/x/sync v0.9.0 // indirect + golang.org/x/sys v0.27.0 // indirect + golang.org/x/text v0.16.0 // indirect + golang.zx2c4.com/wireguard/windows v0.5.3 // indirect + tailscale.com v1.78.3 // indirect +) diff --git a/ops/builderball/go.sum b/ops/builderball/go.sum new file mode 100644 index 000000000..8d204a7ad --- /dev/null +++ b/ops/builderball/go.sum @@ -0,0 +1,51 @@ +filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= +filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= +github.com/akutz/memconn v0.1.0 h1:NawI0TORU4hcOMsMr11g7vwlCdkYeLKXBcxWu2W/P8A= 
+github.com/akutz/memconn v0.1.0/go.mod h1:Jo8rI7m0NieZyLI5e2CDlRdRqRRB4S7Xp77ukDjH+Fw= +github.com/alexbrainman/sspi v0.0.0-20231016080023-1a75b4708caa h1:LHTHcTQiSGT7VVbI0o4wBRNQIgn917usHWOd6VAffYI= +github.com/alexbrainman/sspi v0.0.0-20231016080023-1a75b4708caa/go.mod h1:cEWa1LVoE5KvSD9ONXsZrj0z6KqySlCCNKHlLzbqAt4= +github.com/dblohm7/wingoes v0.0.0-20240119213807-a09d6be7affa h1:h8TfIT1xc8FWbwwpmHn1J5i43Y0uZP97GqasGCzSRJk= +github.com/dblohm7/wingoes v0.0.0-20240119213807-a09d6be7affa/go.mod h1:Nx87SkVqTKd8UtT+xu7sM/l+LgXs6c0aHrlKusR+2EQ= +github.com/fxamacker/cbor/v2 v2.6.0 h1:sU6J2usfADwWlYDAFhZBQ6TnLFBHxgesMrQfQgk1tWA= +github.com/fxamacker/cbor/v2 v2.6.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= +github.com/go-json-experiment/json v0.0.0-20231102232822-2e55bd4e08b0 h1:ymLjT4f35nQbASLnvxEde4XOBL+Sn7rFuV+FOJqkljg= +github.com/go-json-experiment/json v0.0.0-20231102232822-2e55bd4e08b0/go.mod h1:6daplAwHHGbUGib4990V3Il26O0OC4aRyvewaaAihaA= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/hdevalence/ed25519consensus v0.2.0 h1:37ICyZqdyj0lAZ8P4D1d1id3HqbbG1N3iBb1Tb4rdcU= +github.com/hdevalence/ed25519consensus v0.2.0/go.mod h1:w3BHWjwJbFU29IRHL1Iqkw3sus+7FctEyM4RqDxYNzo= +github.com/josharian/native v1.1.1-0.20230202152459-5c7d0dd6ab86 h1:elKwZS1OcdQ0WwEDBeqxKwb7WB62QX8bvZ/FJnVXIfk= +github.com/josharian/native v1.1.1-0.20230202152459-5c7d0dd6ab86/go.mod h1:aFAMtuldEgx/4q7iSGazk22+IcgvtiC+HIimFO9XlS8= +github.com/jsimonetti/rtnetlink v1.4.0 h1:Z1BF0fRgcETPEa0Kt0MRk3yV5+kF1FWTni6KUFKrq2I= +github.com/jsimonetti/rtnetlink v1.4.0/go.mod h1:5W1jDvWdnthFJ7fxYX1GMK07BUpI4oskfOqvPteYS6E= +github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= +github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= +github.com/mdlayher/socket v0.5.0 
h1:ilICZmJcQz70vrWVes1MFera4jGiWNocSkykwwoy3XI= +github.com/mdlayher/socket v0.5.0/go.mod h1:WkcBFfvyG8QENs5+hfQPl1X6Jpd2yeLIYgrGFmJiJxI= +github.com/mitchellh/go-ps v1.0.0 h1:i6ampVEEF4wQFF+bkYfwYgY+F/uYJDktmvLPf7qIgjc= +github.com/mitchellh/go-ps v1.0.0/go.mod h1:J4lOc8z8yJs6vUwklHw2XEIiT4z4C40KtWVN3nvg8Pg= +github.com/tailscale/go-winio v0.0.0-20231025203758-c4f33415bf55 h1:Gzfnfk2TWrk8Jj4P4c1a3CtQyMaTVCznlkLZI++hok4= +github.com/tailscale/go-winio v0.0.0-20231025203758-c4f33415bf55/go.mod h1:4k4QO+dQ3R5FofL+SanAUZe+/QfeK0+OIuwDIRu2vSg= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +go4.org/mem v0.0.0-20220726221520-4f986261bf13 h1:CbZeCBZ0aZj8EfVgnqQcYZgf0lpZ3H9rmp5nkDTAst8= +go4.org/mem v0.0.0-20220726221520-4f986261bf13/go.mod h1:reUoABIJ9ikfM5sgtSF3Wushcza7+WeD01VB9Lirh3g= +go4.org/netipx v0.0.0-20231129151722-fdeea329fbba h1:0b9z3AuHCjxk0x/opv64kcgZLBseWJUpBw5I82+2U4M= +go4.org/netipx v0.0.0-20231129151722-fdeea329fbba/go.mod h1:PLyyIXexvUFg3Owu6p/WfdlivPbZJsZdgWZlrGope/Y= +golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30= +golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M= +golang.org/x/exp v0.0.0-20240119083558-1b970713d09a h1:Q8/wZp0KX97QFTc2ywcOE0YRjZPVIx+MXInMzdvQqcA= +golang.org/x/exp v0.0.0-20240119083558-1b970713d09a/go.mod h1:idGWGoKP1toJGkd5/ig9ZLuPcZBC3ewk7SzmH0uou08= +golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys= +golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= +golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ= +golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.4.1-0.20230131160137-e7d7f63158de/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= +golang.org/x/sys 
v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= +golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.zx2c4.com/wireguard/windows v0.5.3 h1:On6j2Rpn3OEMXqBq00QEDC7bWSZrPIHKIus8eIuExIE= +golang.zx2c4.com/wireguard/windows v0.5.3/go.mod h1:9TEe8TJmtwyQebdFwAkEWOPr3prrtqm+REGFifP60hI= +tailscale.com v1.78.3 h1:2BJepIEYA0ba0ZXn2rOuZjYzIV4Az+X9RS5XJF007Ug= +tailscale.com v1.78.3/go.mod h1:gT7ALbLFCr2YIu0kgc9Q3tBVaTlod65D2N6jMLH11Bk= diff --git a/ops/builderball/main.go b/ops/builderball/main.go new file mode 100644 index 000000000..23938324f --- /dev/null +++ b/ops/builderball/main.go @@ -0,0 +1,75 @@ +package main + +import ( + "flag" + "fmt" + "log/slog" + "net/http" + "os" + "time" + + "tvl.fyi/ops/builderball/config" + "tvl.fyi/ops/builderball/proxy" +) + +func printStats() { + hits, misses := proxy.GetStats() + if hits > 0 || misses > 0 { + slog.Info("served cache requests", "hits", hits, "misses", misses) + } +} + +func main() { + flag.Parse() + slog.Info("launching builderball proxy", "host", *config.Host, "port", *config.Port) + + logConfig := slog.HandlerOptions{ + Level: slog.LevelInfo, + } + + if *config.Debug { + logConfig.Level = slog.LevelDebug + } + + if *config.JSON { + logConfig.AddSource = true + slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stderr, &logConfig))) + } else { + slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &logConfig))) + } + + slog.Debug("debug logging enabled") // prints only then, of course. 
+ + if len(config.Caches) > 0 { + slog.Info("static binary caches configured", "caches", config.Caches) + } + + if *config.Tailscale { + slog.Info("tailscale discovery is enabled", "tag", *config.TSTag) + } else if len(config.Caches) == 0 { + slog.Error("no static binary caches configured, and tailscale discovery is disabled") + os.Exit(1) + } + + http.HandleFunc("GET /nix-cache-info", func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintf(w, `StoreDir: /nix/store +WantMassQuery: 1 +Priority: %d +`, *config.Priority) + }) + + http.HandleFunc("GET /", proxy.Handler) + + go func() { + for { + printStats() + time.Sleep(time.Duration(*config.Ticker) * time.Second) + } + }() + + err := http.ListenAndServe(fmt.Sprintf("%s:%d", *config.Host, *config.Port), nil) + if err != nil { + slog.Error("HTTP server failed", "error", err.Error()) + os.Exit(1) + } +} diff --git a/ops/builderball/proxy/proxy.go b/ops/builderball/proxy/proxy.go new file mode 100644 index 000000000..1e12fd5d3 --- /dev/null +++ b/ops/builderball/proxy/proxy.go @@ -0,0 +1,151 @@ +// Package proxy implements logic for proxying narinfo requests to upstream +// caches, and modifying their responses to let hosts fetch the required data +// directly from upstream. +package proxy + +import ( + "context" + "errors" + "fmt" + "io" + "log/slog" + "net/http" + "strings" + "sync/atomic" + "time" + + "tvl.fyi/ops/builderball/config" + "tvl.fyi/ops/builderball/discovery" +) + +// hits and misses count the narinfo lookups answered by Handler since the last +// GetStats call: hits for lookups some cache could serve, misses for lookups +// no cache could serve. +var hits atomic.Uint64 +var misses atomic.Uint64 + +// GetStats returns the hit and miss counts, in that order, accumulated since +// the previous call, and resets both counters to zero. +func GetStats() (uint64, uint64) { + return hits.Swap(0), misses.Swap(0) +} + +// narinfo is a narinfo document fetched from an upstream cache. +type narinfo struct { + body string // narinfo text with its URL line rewritten + url string // absolute NAR URL written into the URL line +} + +// fetchNarinfoWithAbsoluteURL contacts the cache at baseURL to see if it has +// the given NAR, and if so returns the narinfo with the URL pointing to the +// *absolute* address of the cache. Nix will follow the absolute URL for +// downloads. 
+func fetchNarinfoWithAbsoluteURL(ctx context.Context, r *http.Request, baseURL string) *narinfo { + url := baseURL + r.URL.Path + slog.Debug("querying upstream cache", "url", url) + + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + + if *config.CacheHost != "" { + req.Header.Add("Host", *config.CacheHost) + } + + if err != nil { + slog.Warn("could not create cache lookup request", "cache", baseURL, "error", err.Error()) + return nil + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + if errors.Is(err, context.Canceled) { + slog.Debug("cancelled lookup to cache", "url", baseURL) + } else if errors.Is(err, context.DeadlineExceeded) { + slog.Info("cache timed out", "cache", baseURL) + } else { + slog.Warn("could not query cache", "cache", baseURL, "error", err.Error()) + } + + return nil + } + + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + slog.Debug("upstream cache responded with non-OK status", "status", resp.Status) + return nil + } + + content, err := io.ReadAll(resp.Body) + if err != nil { + slog.Warn("could not read upstream response", "error", err.Error()) + return nil + } + + result := new(narinfo) + lines := strings.Split(string(content), "\n") + for i, line := range lines { + if strings.HasPrefix(line, "URL: ") { + result.url = baseURL + "/" + strings.TrimPrefix(line, "URL: ") + lines[i] = "URL: " + result.url + } + } + + result.body = strings.Join(lines, "\n") + + return result +} + +func findInCaches(r *http.Request, caches []string) *narinfo { + slog.Debug("querying caches", "caches", caches) + ctx, cancel := context.WithTimeout(r.Context(), 1*time.Second) + defer cancel() + + result := make(chan *narinfo, len(caches)) + + for _, cacheURL := range caches { + go func(baseURL string) { + result <- fetchNarinfoWithAbsoluteURL(ctx, r, baseURL) + }(cacheURL) + } + + remaining := len(caches) + for remaining > 0 { + select { + case <-ctx.Done(): + return nil + case r := <-result: + if r != nil { + return r + 
} + + remaining-- + } + } + + return nil +} + +func Handler(w http.ResponseWriter, r *http.Request) { + // Only handle narinfo requests + if !strings.HasSuffix(r.URL.Path, ".narinfo") { + slog.Warn("received non-narinfo request", "path", r.URL.Path) + http.NotFound(w, r) + return + } + + b := discovery.GetCaches() + if len(b) == 0 { + slog.Warn("no upstream caches available") + http.NotFound(w, r) + return + } + + narinfo := findInCaches(r, b) + if narinfo == nil { + misses.Add(1) + slog.Debug("no cache had store path", "path", r.URL.Path, "caches", b) + http.NotFound(w, r) + return + } + + slog.Debug("cache hit", "url", narinfo.url) + hits.Add(1) + + w.Header().Set("Content-Type", "text/x-nix-narinfo") + w.Header().Set("nix-link", narinfo.url) + fmt.Fprint(w, narinfo.body) +}