chore(users/edef): move to contrib
Change-Id: I1a6972fab8ada26917f29607fc401e376d634070
This commit is contained in:
parent
a7916624dc
commit
403d8fc897
55 changed files with 15 additions and 17 deletions
1
contrib/.gitignore
vendored
Normal file
1
contrib/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
target
|
||||
3
contrib/OWNERS
Normal file
3
contrib/OWNERS
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
set noparent
|
||||
|
||||
edef
|
||||
1
contrib/crunch-v2/.gitignore
vendored
Normal file
1
contrib/crunch-v2/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
*.parquet
|
||||
3193
contrib/crunch-v2/Cargo.lock
generated
Normal file
3193
contrib/crunch-v2/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
11745
contrib/crunch-v2/Cargo.nix
Normal file
11745
contrib/crunch-v2/Cargo.nix
Normal file
File diff suppressed because it is too large
Load diff
39
contrib/crunch-v2/Cargo.toml
Normal file
39
contrib/crunch-v2/Cargo.toml
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
[package]
|
||||
name = "crunch-v2"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[workspace]
|
||||
members = ["."]
|
||||
|
||||
[dependencies]
|
||||
anyhow = { version = "1.0.75", features = ["backtrace"] }
|
||||
lazy_static = "1.4.0"
|
||||
|
||||
bstr = "1.8.0"
|
||||
bytes = "1.6.1"
|
||||
|
||||
futures = "0.3.29"
|
||||
tokio = { version = "1.37.0", features = ["full"] }
|
||||
|
||||
rusoto_core = { version = "0.48.0", default-features = false, features = ["hyper-rustls"] }
|
||||
rusoto_s3 = { version = "0.48.0", default-features = false, features = ["rustls"] }
|
||||
|
||||
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
|
||||
sled = "0.34.7"
|
||||
|
||||
fastcdc = "3.1.0"
|
||||
blake3 = "1.5.0"
|
||||
sha2 = { version = "0.10.8", features = ["asm"] }
|
||||
digest = "0.10.7"
|
||||
|
||||
bzip2 = "0.4.4"
|
||||
xz2 = "0.1.7"
|
||||
zstd = "0.13.0"
|
||||
prost = "0.12.2"
|
||||
polars = { version = "0.35.4", default-features = false, features = ["parquet", "lazy", "sql", "dtype-struct"] }
|
||||
indicatif = "0.17.7"
|
||||
clap = { version = "4.4.18", features = ["derive"] }
|
||||
|
||||
[build-dependencies]
|
||||
prost-build = "0.12.2"
|
||||
1
contrib/crunch-v2/OWNERS
Normal file
1
contrib/crunch-v2/OWNERS
Normal file
|
|
@ -0,0 +1 @@
|
|||
edef
|
||||
6
contrib/crunch-v2/build.rs
Normal file
6
contrib/crunch-v2/build.rs
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
use std::io::Result;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
prost_build::compile_protos(&["protos/flatstore.proto"], &["protos/"])?;
|
||||
Ok(())
|
||||
}
|
||||
15
contrib/crunch-v2/default.nix
Normal file
15
contrib/crunch-v2/default.nix
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
{ pkgs, depot, lib, ... }:
|
||||
|
||||
(pkgs.callPackage ./Cargo.nix {
|
||||
defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
|
||||
crunch-v2 = prev: {
|
||||
src = depot.snix.utils.filterRustCrateSrc rec {
|
||||
root = prev.src.origSrc;
|
||||
extraFileset = lib.fileset.fileFilter (f: f.hasExt "proto") root;
|
||||
};
|
||||
nativeBuildInputs = [ pkgs.protobuf ];
|
||||
};
|
||||
};
|
||||
}).rootCrate.build.overrideAttrs {
|
||||
meta.ci.extraSteps.crate2nix-check = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
|
||||
}
|
||||
38
contrib/crunch-v2/protos/flatstore.proto
Normal file
38
contrib/crunch-v2/protos/flatstore.proto
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
syntax = "proto3";
|
||||
|
||||
package snix.flatstore.v1;
|
||||
|
||||
message Path {
|
||||
bytes nar_hash = 1;
|
||||
|
||||
oneof node {
|
||||
DirectoryNode directory = 2;
|
||||
FileNode file = 3;
|
||||
SymlinkNode symlink = 4;
|
||||
}
|
||||
}
|
||||
|
||||
message DirectoryNode {
|
||||
bytes name = 1;
|
||||
repeated DirectoryNode directories = 2;
|
||||
repeated FileNode files = 3;
|
||||
repeated SymlinkNode symlinks = 4;
|
||||
}
|
||||
|
||||
message FileNode {
|
||||
bytes name = 1;
|
||||
bytes hash = 2;
|
||||
repeated Chunk chunks = 3;
|
||||
bool executable = 4;
|
||||
}
|
||||
|
||||
message Chunk {
|
||||
bytes hash = 1;
|
||||
uint32 size = 2;
|
||||
uint32 size_compressed = 3;
|
||||
}
|
||||
|
||||
message SymlinkNode {
|
||||
bytes name = 1;
|
||||
bytes target = 2;
|
||||
}
|
||||
155
contrib/crunch-v2/src/bin/extract.rs
Normal file
155
contrib/crunch-v2/src/bin/extract.rs
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
//! This tool lossily converts a Sled database produced by crunch-v2 into a Parquet file for analysis.
|
||||
//! The resulting `crunch.parquet` has columns file_hash`, `nar_hash`, and `chunk`.
|
||||
//! The first two are SHA-256 hashes of the compressed file and the NAR it decompresses to.
|
||||
//! `chunk` is a struct array corresponding to [crunch_v2::proto::Chunk] messages.
|
||||
//! They are concatenated without any additional structure, so nothing but the chunk list is preserved.
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use std::fs::File;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crunch_v2::proto::{self, path::Node};
|
||||
use prost::Message;
|
||||
|
||||
use polars::{
|
||||
chunked_array::builder::AnonymousOwnedListBuilder,
|
||||
prelude::{
|
||||
df, BinaryChunkedBuilder, ChunkedBuilder, DataFrame, DataType, Field, ListBuilderTrait,
|
||||
NamedFrom, ParquetWriter, PrimitiveChunkedBuilder, Series, UInt32Type,
|
||||
},
|
||||
series::IntoSeries,
|
||||
};
|
||||
|
||||
#[derive(Parser)]
|
||||
struct Args {
|
||||
/// Path to the sled database that's read from.
|
||||
#[clap(default_value = "crunch.db")]
|
||||
infile: PathBuf,
|
||||
|
||||
/// Path to the resulting parquet file that's written.
|
||||
#[clap(default_value = "crunch.parquet")]
|
||||
outfile: PathBuf,
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
let w = ParquetWriter::new(File::create(args.outfile)?);
|
||||
|
||||
let db: sled::Db = sled::open(&args.infile).unwrap();
|
||||
let files_tree: sled::Tree = db.open_tree("files").unwrap();
|
||||
|
||||
let progress =
|
||||
ProgressBar::new(files_tree.len() as u64).with_style(ProgressStyle::with_template(
|
||||
"{elapsed_precise}/{duration_precise} {wide_bar} {pos}/{len}",
|
||||
)?);
|
||||
|
||||
let mut frame = FrameBuilder::new();
|
||||
for entry in &files_tree {
|
||||
let (file_hash, pb) = entry?;
|
||||
frame.push(
|
||||
file_hash[..].try_into().unwrap(),
|
||||
proto::Path::decode(&pb[..])?,
|
||||
);
|
||||
progress.inc(1);
|
||||
}
|
||||
|
||||
w.finish(&mut frame.finish())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct FrameBuilder {
|
||||
file_hash: BinaryChunkedBuilder,
|
||||
nar_hash: BinaryChunkedBuilder,
|
||||
chunk: AnonymousOwnedListBuilder,
|
||||
}
|
||||
|
||||
impl FrameBuilder {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
file_hash: BinaryChunkedBuilder::new("file_hash", 0, 0),
|
||||
nar_hash: BinaryChunkedBuilder::new("nar_hash", 0, 0),
|
||||
chunk: AnonymousOwnedListBuilder::new(
|
||||
"chunk",
|
||||
0,
|
||||
Some(DataType::Struct(vec![
|
||||
Field::new("hash", DataType::Binary),
|
||||
Field::new("size", DataType::UInt32),
|
||||
Field::new("size_compressed", DataType::UInt32),
|
||||
])),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
fn push(&mut self, file_hash: [u8; 32], pb: proto::Path) {
|
||||
self.file_hash.append_value(&file_hash[..]);
|
||||
self.nar_hash.append_value(pb.nar_hash);
|
||||
self.chunk
|
||||
.append_series(&ChunkFrameBuilder::new(pb.node.unwrap()))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
fn finish(mut self) -> DataFrame {
|
||||
df! {
|
||||
"file_hash" => self.file_hash.finish().into_series(),
|
||||
"nar_hash" => self.nar_hash.finish().into_series(),
|
||||
"chunk" => self.chunk.finish().into_series()
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
struct ChunkFrameBuilder {
|
||||
hash: BinaryChunkedBuilder,
|
||||
size: PrimitiveChunkedBuilder<UInt32Type>,
|
||||
size_compressed: PrimitiveChunkedBuilder<UInt32Type>,
|
||||
}
|
||||
|
||||
impl ChunkFrameBuilder {
|
||||
fn new(node: proto::path::Node) -> Series {
|
||||
let mut this = Self {
|
||||
hash: BinaryChunkedBuilder::new("hash", 0, 0),
|
||||
size: PrimitiveChunkedBuilder::new("size", 0),
|
||||
size_compressed: PrimitiveChunkedBuilder::new("size_compressed", 0),
|
||||
};
|
||||
|
||||
this.push(node);
|
||||
this.finish()
|
||||
}
|
||||
|
||||
fn push(&mut self, node: Node) {
|
||||
match node {
|
||||
Node::Directory(node) => {
|
||||
for node in node.files {
|
||||
self.push(Node::File(node));
|
||||
}
|
||||
|
||||
for node in node.directories {
|
||||
self.push(Node::Directory(node));
|
||||
}
|
||||
}
|
||||
Node::File(node) => {
|
||||
for chunk in node.chunks {
|
||||
self.hash.append_value(&chunk.hash);
|
||||
self.size.append_value(chunk.size);
|
||||
self.size_compressed.append_value(chunk.size_compressed);
|
||||
}
|
||||
}
|
||||
Node::Symlink(_) => {}
|
||||
}
|
||||
}
|
||||
|
||||
fn finish(self) -> Series {
|
||||
df! {
|
||||
"hash" => self.hash.finish().into_series(),
|
||||
"size" => self.size.finish().into_series(),
|
||||
"size_compressed" => self.size_compressed.finish().into_series()
|
||||
}
|
||||
.unwrap()
|
||||
.into_struct("chunk")
|
||||
.into_series()
|
||||
}
|
||||
}
|
||||
3
contrib/crunch-v2/src/lib.rs
Normal file
3
contrib/crunch-v2/src/lib.rs
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
pub mod proto {
|
||||
include!(concat!(env!("OUT_DIR"), "/snix.flatstore.v1.rs"));
|
||||
}
|
||||
309
contrib/crunch-v2/src/main.rs
Normal file
309
contrib/crunch-v2/src/main.rs
Normal file
|
|
@ -0,0 +1,309 @@
|
|||
//! This is a tool for ingesting subsets of cache.nixos.org into its own flattened castore format.
|
||||
//! Currently, produced chunks are not preserved, and this purely serves as a way of measuring
|
||||
//! compression/deduplication ratios for various chunking and compression parameters.
|
||||
//!
|
||||
//! NARs to be ingested are read from `ingest.parquet`, and filtered by an SQL expression provided as a program argument.
|
||||
//! The `file_hash` column should contain SHA-256 hashes of the compressed data, corresponding to the `FileHash` narinfo field.
|
||||
//! The `compression` column should contain either `"bzip2"` or `"xz"`, corresponding to the `Compression` narinfo field.
|
||||
//! Additional columns are ignored, but can be used by the SQL filter expression.
|
||||
//!
|
||||
//! flatstore protobufs are written to a sled database named `crunch.db`, addressed by file hash.
|
||||
|
||||
use crunch_v2::proto;
|
||||
|
||||
mod remote;
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use futures::{stream, StreamExt, TryStreamExt};
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use std::{
|
||||
io::{self, BufRead, Read, Write},
|
||||
path::PathBuf,
|
||||
ptr,
|
||||
};
|
||||
|
||||
use polars::{
|
||||
prelude::{col, LazyFrame, ScanArgsParquet},
|
||||
sql::sql_expr,
|
||||
};
|
||||
|
||||
use fastcdc::v2020::{ChunkData, StreamCDC};
|
||||
use nix_compat::nar::reader as nar;
|
||||
|
||||
use digest::Digest;
|
||||
use prost::Message;
|
||||
use sha2::Sha256;
|
||||
|
||||
#[derive(Parser)]
|
||||
struct Args {
|
||||
/// Path to an existing parquet file.
|
||||
/// The `file_hash` column should contain SHA-256 hashes of the compressed
|
||||
/// data, corresponding to the `FileHash` narinfo field.
|
||||
/// The `compression` column should contain either `"bzip2"` or `"xz"`,
|
||||
/// corresponding to the `Compression` narinfo field.
|
||||
/// Additional columns are ignored, but can be used by the SQL filter expression.
|
||||
#[clap(long, default_value = "ingest.parquet")]
|
||||
infile: PathBuf,
|
||||
|
||||
/// Filter expression to filter elements in the parquet file for.
|
||||
filter: String,
|
||||
|
||||
/// Average chunk size for FastCDC, in KiB.
|
||||
/// min value is half, max value double of that number.
|
||||
#[clap(long, default_value_t = 256)]
|
||||
avg_chunk_size: u32,
|
||||
|
||||
/// Path to the sled database where results are written to (flatstore
|
||||
/// protobufs, addressed by file hash).
|
||||
#[clap(long, default_value = "crunch.db")]
|
||||
outfile: PathBuf,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
let filter = sql_expr(args.filter)?;
|
||||
let avg_chunk_size = args.avg_chunk_size * 1024;
|
||||
|
||||
let df = LazyFrame::scan_parquet(&args.infile, ScanArgsParquet::default())?
|
||||
.filter(filter)
|
||||
.select([col("file_hash"), col("compression")])
|
||||
.drop_nulls(None)
|
||||
.collect()?;
|
||||
|
||||
let progress = ProgressBar::new(df.height() as u64).with_style(ProgressStyle::with_template(
|
||||
"{elapsed_precise}/{duration_precise} {wide_bar} {pos}/{len}",
|
||||
)?);
|
||||
|
||||
let file_hash = df
|
||||
.column("file_hash")?
|
||||
.binary()?
|
||||
.into_iter()
|
||||
.map(|h| -> [u8; 32] { h.unwrap().try_into().unwrap() });
|
||||
|
||||
let compression = df
|
||||
.column("compression")?
|
||||
.utf8()?
|
||||
.into_iter()
|
||||
.map(|c| c.unwrap());
|
||||
|
||||
let db: sled::Db = sled::open(args.outfile).unwrap();
|
||||
let files_tree = db.open_tree("files").unwrap();
|
||||
|
||||
let res = stream::iter(file_hash.zip(compression))
|
||||
.map(Ok)
|
||||
.try_for_each_concurrent(Some(16), |(file_hash, compression)| {
|
||||
let progress = progress.clone();
|
||||
let files_tree = files_tree.clone();
|
||||
async move {
|
||||
if files_tree.contains_key(&file_hash)? {
|
||||
progress.inc(1);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let reader = remote::nar(file_hash, compression).await?;
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let mut reader = Sha256Reader::from(reader);
|
||||
|
||||
let path =
|
||||
ingest(nar::open(&mut reader)?, vec![], avg_chunk_size).map(|node| {
|
||||
proto::Path {
|
||||
nar_hash: reader.finalize().as_slice().into(),
|
||||
node: Some(node),
|
||||
}
|
||||
})?;
|
||||
|
||||
files_tree.insert(file_hash, path.encode_to_vec())?;
|
||||
progress.inc(1);
|
||||
|
||||
Ok::<_, anyhow::Error>(())
|
||||
})
|
||||
.await?
|
||||
}
|
||||
})
|
||||
.await;
|
||||
|
||||
let flush = files_tree.flush_async().await;
|
||||
|
||||
res?;
|
||||
flush?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ingest(node: nar::Node, name: Vec<u8>, avg_chunk_size: u32) -> Result<proto::path::Node> {
|
||||
match node {
|
||||
nar::Node::Symlink { target } => Ok(proto::path::Node::Symlink(proto::SymlinkNode {
|
||||
name,
|
||||
target,
|
||||
})),
|
||||
|
||||
nar::Node::Directory(mut reader) => {
|
||||
let mut directories = vec![];
|
||||
let mut files = vec![];
|
||||
let mut symlinks = vec![];
|
||||
|
||||
while let Some(node) = reader.next()? {
|
||||
match ingest(node.node, node.name.to_owned(), avg_chunk_size)? {
|
||||
proto::path::Node::Directory(node) => {
|
||||
directories.push(node);
|
||||
}
|
||||
proto::path::Node::File(node) => {
|
||||
files.push(node);
|
||||
}
|
||||
proto::path::Node::Symlink(node) => {
|
||||
symlinks.push(node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(proto::path::Node::Directory(proto::DirectoryNode {
|
||||
name,
|
||||
directories,
|
||||
files,
|
||||
symlinks,
|
||||
}))
|
||||
}
|
||||
|
||||
nar::Node::File { executable, reader } => {
|
||||
let mut reader = B3Reader::from(reader);
|
||||
let mut chunks = vec![];
|
||||
|
||||
for chunk in StreamCDC::new(
|
||||
&mut reader,
|
||||
avg_chunk_size / 2,
|
||||
avg_chunk_size,
|
||||
avg_chunk_size * 2,
|
||||
) {
|
||||
let ChunkData {
|
||||
length: size, data, ..
|
||||
} = chunk?;
|
||||
|
||||
let hash = blake3::hash(&data);
|
||||
let size_compressed = zstd_size(&data, 9);
|
||||
|
||||
chunks.push(proto::Chunk {
|
||||
hash: hash.as_bytes().as_slice().into(),
|
||||
size: size.try_into().unwrap(),
|
||||
size_compressed: size_compressed.try_into().unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(proto::path::Node::File(proto::FileNode {
|
||||
name,
|
||||
hash: reader.finalize().as_bytes().as_slice().into(),
|
||||
chunks,
|
||||
executable,
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct Sha256Reader<R> {
|
||||
inner: R,
|
||||
hasher: Sha256,
|
||||
buf: *const [u8],
|
||||
}
|
||||
|
||||
const ZERO_BUF: *const [u8] = ptr::slice_from_raw_parts(1 as *const u8, 0);
|
||||
|
||||
unsafe impl<R: Send> Send for Sha256Reader<R> {}
|
||||
|
||||
impl<R> From<R> for Sha256Reader<R> {
|
||||
fn from(value: R) -> Self {
|
||||
Self {
|
||||
inner: value,
|
||||
hasher: Sha256::new(),
|
||||
buf: ZERO_BUF,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: Read> Read for Sha256Reader<R> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
self.buf = ZERO_BUF;
|
||||
let n = self.inner.read(buf)?;
|
||||
self.hasher.update(&buf[..n]);
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: BufRead> BufRead for Sha256Reader<R> {
|
||||
fn fill_buf(&mut self) -> io::Result<&[u8]> {
|
||||
self.buf = ZERO_BUF;
|
||||
let buf = self.inner.fill_buf()?;
|
||||
self.buf = buf as *const [u8];
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
fn consume(&mut self, amt: usize) {
|
||||
// UNSAFETY: This assumes that `R::consume` doesn't invalidate the buffer.
|
||||
// That's not a sound assumption in general, though it is likely to hold.
|
||||
// TODO(edef): refactor this codebase to write a fresh NAR for verification purposes
|
||||
// we already buffer full chunks, so there's no pressing need to reuse the input buffers
|
||||
unsafe {
|
||||
let (head, buf) = (*self.buf).split_at(amt);
|
||||
self.buf = buf as *const [u8];
|
||||
self.hasher.update(head);
|
||||
self.inner.consume(amt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<R> Sha256Reader<R> {
|
||||
fn finalize(self) -> [u8; 32] {
|
||||
self.hasher.finalize().into()
|
||||
}
|
||||
}
|
||||
|
||||
struct B3Reader<R> {
|
||||
inner: R,
|
||||
hasher: blake3::Hasher,
|
||||
}
|
||||
|
||||
impl<R> From<R> for B3Reader<R> {
|
||||
fn from(value: R) -> Self {
|
||||
Self {
|
||||
inner: value,
|
||||
hasher: blake3::Hasher::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: Read> Read for B3Reader<R> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
let n = self.inner.read(buf)?;
|
||||
self.hasher.update(&buf[..n]);
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
impl<R> B3Reader<R> {
|
||||
fn finalize(self) -> blake3::Hash {
|
||||
self.hasher.finalize()
|
||||
}
|
||||
}
|
||||
|
||||
fn zstd_size(data: &[u8], level: i32) -> u64 {
|
||||
let mut w = zstd::Encoder::new(CountingWriter::default(), level).unwrap();
|
||||
w.write_all(&data).unwrap();
|
||||
let CountingWriter(size) = w.finish().unwrap();
|
||||
size
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct CountingWriter(u64);
|
||||
|
||||
impl Write for CountingWriter {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
self.0 += buf.len() as u64;
|
||||
Ok(buf.len())
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
211
contrib/crunch-v2/src/remote.rs
Normal file
211
contrib/crunch-v2/src/remote.rs
Normal file
|
|
@ -0,0 +1,211 @@
|
|||
use std::{
|
||||
cmp,
|
||||
io::{self, BufRead, BufReader, Read},
|
||||
pin::Pin,
|
||||
task::{self, Poll},
|
||||
};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::{Buf, Bytes};
|
||||
use futures::{future::BoxFuture, Future, FutureExt, Stream, StreamExt};
|
||||
use lazy_static::lazy_static;
|
||||
use tokio::runtime::Handle;
|
||||
|
||||
use nix_compat::nixbase32;
|
||||
|
||||
use rusoto_core::{ByteStream, Region};
|
||||
use rusoto_s3::{GetObjectOutput, GetObjectRequest, S3Client, S3};
|
||||
|
||||
use bzip2::read::BzDecoder;
|
||||
use xz2::read::XzDecoder;
|
||||
|
||||
lazy_static! {
|
||||
static ref S3_CLIENT: S3Client = S3Client::new(Region::UsEast1);
|
||||
}
|
||||
|
||||
const BUCKET: &str = "nix-cache";
|
||||
|
||||
pub async fn nar(
|
||||
file_hash: [u8; 32],
|
||||
compression: &str,
|
||||
) -> Result<Box<BufReader<dyn Read + Send>>> {
|
||||
let (extension, decompress): (&'static str, fn(_) -> Box<_>) = match compression {
|
||||
"bzip2" => ("bz2", decompress_bz2),
|
||||
"xz" => ("xz", decompress_xz),
|
||||
_ => bail!("unknown compression: {compression}"),
|
||||
};
|
||||
|
||||
Ok(decompress(
|
||||
FileStream::new(FileKey {
|
||||
file_hash,
|
||||
extension,
|
||||
})
|
||||
.await?
|
||||
.into(),
|
||||
))
|
||||
}
|
||||
|
||||
fn decompress_xz(reader: FileStreamReader) -> Box<BufReader<dyn Read + Send>> {
|
||||
Box::new(BufReader::new(XzDecoder::new(reader)))
|
||||
}
|
||||
|
||||
fn decompress_bz2(reader: FileStreamReader) -> Box<BufReader<dyn Read + Send>> {
|
||||
Box::new(BufReader::new(BzDecoder::new(reader)))
|
||||
}
|
||||
|
||||
struct FileStreamReader {
|
||||
inner: FileStream,
|
||||
buffer: Bytes,
|
||||
}
|
||||
|
||||
impl From<FileStream> for FileStreamReader {
|
||||
fn from(value: FileStream) -> Self {
|
||||
FileStreamReader {
|
||||
inner: value,
|
||||
buffer: Bytes::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Read for FileStreamReader {
|
||||
fn read(&mut self, dst: &mut [u8]) -> io::Result<usize> {
|
||||
let src = self.fill_buf()?;
|
||||
let n = cmp::min(src.len(), dst.len());
|
||||
dst[..n].copy_from_slice(&src[..n]);
|
||||
self.consume(n);
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
impl BufRead for FileStreamReader {
|
||||
fn fill_buf(&mut self) -> io::Result<&[u8]> {
|
||||
if !self.buffer.is_empty() {
|
||||
return Ok(&self.buffer);
|
||||
}
|
||||
|
||||
self.buffer = Handle::current()
|
||||
.block_on(self.inner.next())
|
||||
.transpose()?
|
||||
.unwrap_or_default();
|
||||
|
||||
Ok(&self.buffer)
|
||||
}
|
||||
|
||||
fn consume(&mut self, cnt: usize) {
|
||||
self.buffer.advance(cnt);
|
||||
}
|
||||
}
|
||||
|
||||
struct FileKey {
|
||||
file_hash: [u8; 32],
|
||||
extension: &'static str,
|
||||
}
|
||||
|
||||
impl FileKey {
|
||||
fn get(
|
||||
&self,
|
||||
offset: u64,
|
||||
e_tag: Option<&str>,
|
||||
) -> impl Future<Output = io::Result<GetObjectOutput>> + Send + 'static {
|
||||
let input = GetObjectRequest {
|
||||
bucket: BUCKET.to_string(),
|
||||
key: format!(
|
||||
"nar/{}.nar.{}",
|
||||
nixbase32::encode(&self.file_hash),
|
||||
self.extension
|
||||
),
|
||||
if_match: e_tag.map(str::to_owned),
|
||||
range: Some(format!("bytes {}-", offset + 1)),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
async {
|
||||
S3_CLIENT
|
||||
.get_object(input)
|
||||
.await
|
||||
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct FileStream {
|
||||
key: FileKey,
|
||||
e_tag: String,
|
||||
offset: u64,
|
||||
length: u64,
|
||||
inner: FileStreamState,
|
||||
}
|
||||
|
||||
enum FileStreamState {
|
||||
Response(BoxFuture<'static, io::Result<GetObjectOutput>>),
|
||||
Body(ByteStream),
|
||||
Eof,
|
||||
}
|
||||
|
||||
impl FileStream {
|
||||
pub async fn new(key: FileKey) -> io::Result<Self> {
|
||||
let resp = key.get(0, None).await?;
|
||||
|
||||
Ok(FileStream {
|
||||
key,
|
||||
e_tag: resp.e_tag.unwrap(),
|
||||
offset: 0,
|
||||
length: resp.content_length.unwrap().try_into().unwrap(),
|
||||
inner: FileStreamState::Body(resp.body.unwrap()),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! poll {
|
||||
($expr:expr) => {
|
||||
match $expr {
|
||||
Poll::Pending => {
|
||||
return Poll::Pending;
|
||||
}
|
||||
Poll::Ready(value) => value,
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
impl Stream for FileStream {
|
||||
type Item = io::Result<Bytes>;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, cx: &mut task::Context) -> Poll<Option<Self::Item>> {
|
||||
let this = self.get_mut();
|
||||
|
||||
let chunk = loop {
|
||||
match &mut this.inner {
|
||||
FileStreamState::Response(resp) => match poll!(resp.poll_unpin(cx)) {
|
||||
Err(err) => {
|
||||
this.inner = FileStreamState::Eof;
|
||||
return Poll::Ready(Some(Err(err)));
|
||||
}
|
||||
Ok(resp) => {
|
||||
this.inner = FileStreamState::Body(resp.body.unwrap());
|
||||
}
|
||||
},
|
||||
FileStreamState::Body(body) => match poll!(body.poll_next_unpin(cx)) {
|
||||
None | Some(Err(_)) => {
|
||||
this.inner = FileStreamState::Response(
|
||||
this.key.get(this.offset, Some(&this.e_tag)).boxed(),
|
||||
);
|
||||
}
|
||||
Some(Ok(chunk)) => {
|
||||
break chunk;
|
||||
}
|
||||
},
|
||||
FileStreamState::Eof => {
|
||||
return Poll::Ready(None);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
this.offset += chunk.len() as u64;
|
||||
|
||||
if this.offset >= this.length {
|
||||
this.inner = FileStreamState::Eof;
|
||||
}
|
||||
|
||||
Poll::Ready(Some(Ok(chunk)))
|
||||
}
|
||||
}
|
||||
2
contrib/fetchroots/.gitignore
vendored
Normal file
2
contrib/fetchroots/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
/target
|
||||
/roots.parquet
|
||||
3417
contrib/fetchroots/Cargo.lock
generated
Normal file
3417
contrib/fetchroots/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
12830
contrib/fetchroots/Cargo.nix
Normal file
12830
contrib/fetchroots/Cargo.nix
Normal file
File diff suppressed because it is too large
Load diff
23
contrib/fetchroots/Cargo.toml
Normal file
23
contrib/fetchroots/Cargo.toml
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
[package]
|
||||
name = "fetchroots"
|
||||
version = "0.0.0"
|
||||
edition = "2021"
|
||||
|
||||
[workspace]
|
||||
members = ["."]
|
||||
|
||||
[dependencies]
|
||||
anyhow = { version = "1.0.80", features = ["backtrace"] }
|
||||
aws-config = "1.1.6"
|
||||
aws-sdk-s3 = "1.16.0"
|
||||
bytes = "1.5.0"
|
||||
bytes-utils = "0.1.4"
|
||||
bzip2 = "0.4.4"
|
||||
chrono = "0.4.34"
|
||||
futures = "0.3.30"
|
||||
indicatif = "0.17.8"
|
||||
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
|
||||
polars = { version = "0.36.2", features = ["parquet"] }
|
||||
rayon = "1.8.1"
|
||||
tokio = { version = "1.36.0", features = ["full"] }
|
||||
xz2 = "0.1.7"
|
||||
36
contrib/fetchroots/README.md
Normal file
36
contrib/fetchroots/README.md
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
# fetchroots
|
||||
|
||||
> This tool is part of a suite of tools built to manage cache.nixos.org.
|
||||
|
||||
This tool's purpose is to build an index of all the GC roots from the
|
||||
channels.nixos.org releases. The result is then combined with other tools.
|
||||
|
||||
It does this by:
|
||||
1. Listing all the release files in the bucket.
|
||||
2. Getting the data for each of the release.
|
||||
3. Putting them in a local parquet file.
|
||||
|
||||
## Getting started
|
||||
|
||||
In order to run this, you'll need AWS SSO credentials from the NixOS Infra team.
|
||||
|
||||
Get the creds from https://nixos.awsapps.com/start/ -> LBNixOS_Dev_PDX -> AWSReadOnlyAccess.
|
||||
|
||||
Run `mg run`, you should see a progress bar.
|
||||
|
||||
Congrats, you now have a `roots.parquet` file. You can now load it with python polars-rs or clickhouse.
|
||||
|
||||
## `roots.parquet` file format
|
||||
|
||||
* `key` (`String`): the release, eg `nixos/22.11-small/nixos-22.11.513.563dc6476b8`
|
||||
* `timestamp` (`DateTime`): the timestamp of the GC roots file for this release
|
||||
* `store_path_hash` (`List[Binary]`): hash part of the store paths rooted by this release
|
||||
|
||||
## Development
|
||||
|
||||
When the Cargo.lock changes, run `mg run //tools:crate2nix-generate`.
|
||||
|
||||
To build the project, run `mg build`.
|
||||
|
||||
To get a dev environment, run `nix-shell -p cargo`.
|
||||
|
||||
11
contrib/fetchroots/default.nix
Normal file
11
contrib/fetchroots/default.nix
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
{ pkgs, depot, ... }:
|
||||
|
||||
(pkgs.callPackage ./Cargo.nix {
|
||||
defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
|
||||
fetchroots = prev: {
|
||||
src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
|
||||
};
|
||||
};
|
||||
}).rootCrate.build.overrideAttrs {
|
||||
meta.ci.extraSteps.crate2nix = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
|
||||
}
|
||||
257
contrib/fetchroots/src/main.rs
Normal file
257
contrib/fetchroots/src/main.rs
Normal file
|
|
@ -0,0 +1,257 @@
|
|||
//! Fetch all[^1] GC roots from releases.nixos.org into a `roots.parquet` file.
|
||||
//!
|
||||
//! The resulting Parquet has three columns:
|
||||
//!
|
||||
//! * `key` (`String`): the release, eg `nixos/22.11-small/nixos-22.11.513.563dc6476b8`
|
||||
//! * `timestamp` (`DateTime`): the timestamp of the GC roots file for this release
|
||||
//! * `store_path_hash` (`List[Binary]`): hash part of the store paths rooted by this release
|
||||
//!
|
||||
//! [^1]: some roots are truly ancient, and aren't compatible with Nix 1.x
|
||||
|
||||
use anyhow::Result;
|
||||
use std::{
|
||||
collections::BTreeMap,
|
||||
fs::File,
|
||||
io::{BufRead, Read},
|
||||
sync::Arc,
|
||||
time::SystemTime,
|
||||
};
|
||||
|
||||
use aws_config::Region;
|
||||
use aws_sdk_s3::operation::get_object::builders::GetObjectFluentBuilder;
|
||||
use bytes::{Buf, Bytes};
|
||||
use bytes_utils::SegmentedBuf;
|
||||
use chrono::{DateTime, Utc};
|
||||
use nix_compat::nixbase32;
|
||||
use polars::prelude::*;
|
||||
use tokio::{
|
||||
sync::Semaphore,
|
||||
task::{block_in_place, JoinSet},
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Meta {
|
||||
format: Format,
|
||||
e_tag: String,
|
||||
last_modified: DateTime<Utc>,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let sdk_config = aws_config::load_defaults(aws_config::BehaviorVersion::v2023_11_09())
|
||||
.await
|
||||
.into_builder()
|
||||
.region(Region::from_static("eu-west-1"))
|
||||
.build();
|
||||
|
||||
let s3 = aws_sdk_s3::Client::new(&sdk_config);
|
||||
|
||||
let mut keys: BTreeMap<String, Meta> = {
|
||||
let pages = s3
|
||||
.list_objects_v2()
|
||||
.bucket("nix-releases")
|
||||
.into_paginator()
|
||||
.send()
|
||||
.try_collect()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let objects = pages.into_iter().flat_map(|page| {
|
||||
assert_eq!(page.prefix().unwrap_or_default(), "");
|
||||
assert!(page.common_prefixes.is_none());
|
||||
page.contents.unwrap_or_default()
|
||||
});
|
||||
|
||||
let mut prev_key = String::new();
|
||||
objects
|
||||
.filter_map(|obj| {
|
||||
let key = obj.key().unwrap();
|
||||
|
||||
assert!(&*prev_key < key);
|
||||
key.clone_into(&mut prev_key);
|
||||
|
||||
let (key, tail) = key.rsplit_once('/')?;
|
||||
// Our preference order happens to match lexicographical order,
|
||||
// and listings are returned in lexicographical order.
|
||||
let format = match tail {
|
||||
"MANIFEST" => Format::Manifest,
|
||||
"MANIFEST.bz2" => Format::ManifestBz,
|
||||
"store-paths.xz" => Format::StorePathsXz,
|
||||
_ => return None,
|
||||
};
|
||||
|
||||
Some((
|
||||
key.to_owned(),
|
||||
Meta {
|
||||
format,
|
||||
e_tag: obj.e_tag.unwrap(),
|
||||
last_modified: SystemTime::try_from(obj.last_modified.unwrap())
|
||||
.unwrap()
|
||||
.into(),
|
||||
},
|
||||
))
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
|
||||
// These releases are so old they don't even use nixbase32 store paths.
|
||||
for key in [
|
||||
"nix/nix-0.6",
|
||||
"nix/nix-0.6.1",
|
||||
"nix/nix-0.7",
|
||||
"nix/nix-0.8",
|
||||
"nixpkgs/nixpkgs-0.5",
|
||||
"nixpkgs/nixpkgs-0.5.1",
|
||||
"nixpkgs/nixpkgs-0.6",
|
||||
"nixpkgs/nixpkgs-0.7",
|
||||
"nixpkgs/nixpkgs-0.8",
|
||||
"nixpkgs/nixpkgs-0.9",
|
||||
"nixpkgs/nixpkgs-0.10",
|
||||
"nixpkgs/nixpkgs-0.11",
|
||||
] {
|
||||
assert!(keys.remove(key).is_some());
|
||||
}
|
||||
|
||||
let mut js = JoinSet::new();
|
||||
let sem = Arc::new(Semaphore::new(16));
|
||||
|
||||
let bar = indicatif::ProgressBar::new(keys.len() as u64);
|
||||
for (root, meta) in keys {
|
||||
let sem = sem.clone();
|
||||
let s3 = s3.clone();
|
||||
|
||||
js.spawn(async move {
|
||||
let _permit = sem.acquire().await.unwrap();
|
||||
|
||||
let body = get_object(
|
||||
s3.get_object()
|
||||
.bucket("nix-releases")
|
||||
.key(format!("{root}/{}", meta.format.as_str()))
|
||||
.if_match(meta.e_tag),
|
||||
)
|
||||
.await
|
||||
.unwrap()
|
||||
.reader();
|
||||
|
||||
let ph_array = block_in_place(|| meta.format.to_ph_array(body).rechunk());
|
||||
df! {
|
||||
"key" => [root],
|
||||
"timestamp" => [meta.last_modified.naive_utc()],
|
||||
"store_path_hash" => ph_array.into_series().implode().unwrap()
|
||||
}
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
let mut writer = ParquetWriter::new(File::create("roots.parquet").unwrap())
|
||||
.batched(&Schema::from_iter([
|
||||
Field::new("key", DataType::String),
|
||||
Field::new(
|
||||
"timestamp",
|
||||
DataType::Datetime(TimeUnit::Milliseconds, None),
|
||||
),
|
||||
Field::new(
|
||||
"store_path_hash",
|
||||
DataType::List(Box::new(DataType::Binary)),
|
||||
),
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
while let Some(df) = js.join_next().await.transpose().unwrap() {
|
||||
block_in_place(|| writer.write_batch(&df)).unwrap();
|
||||
bar.inc(1);
|
||||
}
|
||||
|
||||
writer.finish().unwrap();
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum Format {
|
||||
Manifest,
|
||||
ManifestBz,
|
||||
StorePathsXz,
|
||||
}
|
||||
|
||||
impl Format {
|
||||
fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Format::Manifest => "MANIFEST",
|
||||
Format::ManifestBz => "MANIFEST.bz2",
|
||||
Format::StorePathsXz => "store-paths.xz",
|
||||
}
|
||||
}
|
||||
|
||||
fn to_ph_array(&self, mut body: impl BufRead) -> BinaryChunked {
|
||||
match self {
|
||||
Format::Manifest | Format::ManifestBz => {
|
||||
let mut buf = String::new();
|
||||
match self {
|
||||
Format::Manifest => {
|
||||
body.read_to_string(&mut buf).unwrap();
|
||||
}
|
||||
Format::ManifestBz => {
|
||||
bzip2::bufread::BzDecoder::new(body)
|
||||
.read_to_string(&mut buf)
|
||||
.unwrap();
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
let buf = buf
|
||||
.strip_prefix("version {\n ManifestVersion: 3\n}\n")
|
||||
.unwrap();
|
||||
|
||||
BinaryChunked::from_iter_values(
|
||||
"store_path_hash",
|
||||
buf.split_terminator("}\n").map(|chunk| -> [u8; 20] {
|
||||
let chunk = chunk.strip_prefix("patch ").unwrap_or(chunk);
|
||||
let line = chunk.strip_prefix("{\n StorePath: /nix/store/").unwrap();
|
||||
nixbase32::decode_fixed(&line[..32]).unwrap()
|
||||
}),
|
||||
)
|
||||
}
|
||||
Format::StorePathsXz => {
|
||||
let mut buf = String::new();
|
||||
xz2::bufread::XzDecoder::new(body)
|
||||
.read_to_string(&mut buf)
|
||||
.unwrap();
|
||||
|
||||
BinaryChunked::from_iter_values(
|
||||
"store_path_hash",
|
||||
buf.split_terminator('\n').map(|line| -> [u8; 20] {
|
||||
let line = line.strip_prefix("/nix/store/").unwrap();
|
||||
nixbase32::decode_fixed(&line[..32]).unwrap()
|
||||
}),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_object(request: GetObjectFluentBuilder) -> Result<SegmentedBuf<Bytes>> {
|
||||
// if we don't constrain the ETag, we might experience read skew
|
||||
assert!(request.get_if_match().is_some(), "if_match must be set");
|
||||
|
||||
let mut buf: SegmentedBuf<Bytes> = SegmentedBuf::new();
|
||||
let mut resp = request.clone().send().await?;
|
||||
let content_length: usize = resp.content_length.unwrap().try_into().unwrap();
|
||||
|
||||
loop {
|
||||
while let Ok(Some(chunk)) = resp.body.try_next().await {
|
||||
buf.push(chunk);
|
||||
}
|
||||
|
||||
if buf.remaining() >= content_length {
|
||||
assert_eq!(buf.remaining(), content_length, "got excess bytes");
|
||||
break Ok(buf);
|
||||
}
|
||||
|
||||
resp = request
|
||||
.clone()
|
||||
.range(format!("bytes={}-", buf.remaining()))
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
assert_ne!(resp.content_range, None);
|
||||
}
|
||||
}
|
||||
2304
contrib/narinfo2parquet/Cargo.lock
generated
Normal file
2304
contrib/narinfo2parquet/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
9308
contrib/narinfo2parquet/Cargo.nix
Normal file
9308
contrib/narinfo2parquet/Cargo.nix
Normal file
File diff suppressed because it is too large
Load diff
28
contrib/narinfo2parquet/Cargo.toml
Normal file
28
contrib/narinfo2parquet/Cargo.toml
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
[package]
|
||||
name = "narinfo2parquet"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# We can't join the //tvix workspace, because that locks zstd
|
||||
# at an ancient version, which is incompatible with polars
|
||||
[workspace]
|
||||
members = ["."]
|
||||
|
||||
[dependencies]
|
||||
anyhow = { version = "1.0.75", features = ["backtrace"] }
|
||||
jemallocator = "0.5.4"
|
||||
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
|
||||
tempfile-fast = "0.3.4"
|
||||
zstd = "0.13.0"
|
||||
|
||||
# See https://github.com/pola-rs/polars/issues/19157
|
||||
hashbrown = { version = "0.14.5", features = ["raw"] }
|
||||
|
||||
[dependencies.polars]
|
||||
version = "0.36.2"
|
||||
default-features = false
|
||||
features = [
|
||||
"parquet",
|
||||
"polars-io",
|
||||
"dtype-categorical"
|
||||
]
|
||||
1
contrib/narinfo2parquet/OWNERS
Normal file
1
contrib/narinfo2parquet/OWNERS
Normal file
|
|
@ -0,0 +1 @@
|
|||
edef
|
||||
11
contrib/narinfo2parquet/default.nix
Normal file
11
contrib/narinfo2parquet/default.nix
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
{ pkgs, depot, ... }:
|
||||
|
||||
(pkgs.callPackage ./Cargo.nix {
|
||||
defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
|
||||
narinfo2parquet = prev: {
|
||||
src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
|
||||
};
|
||||
};
|
||||
}).rootCrate.build.overrideAttrs {
|
||||
meta.ci.extraSteps.crate2nix = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
|
||||
}
|
||||
264
contrib/narinfo2parquet/src/main.rs
Normal file
264
contrib/narinfo2parquet/src/main.rs
Normal file
|
|
@ -0,0 +1,264 @@
|
|||
//! narinfo2parquet operates on a narinfo.zst directory produced by turbofetch.
|
||||
//! It takes the name of a segment file in `narinfo.zst` and writes a Parquet file
|
||||
//! with the same name into the `narinfo.pq` directory.
|
||||
//!
|
||||
//! Run it under GNU Parallel for parallelism:
|
||||
//! ```shell
|
||||
//! mkdir narinfo.pq && ls narinfo.zst | parallel --bar 'narinfo2parquet {}'
|
||||
//! ```
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use jemallocator::Jemalloc;
|
||||
use nix_compat::{
|
||||
narinfo::{self, NarInfo},
|
||||
nixbase32,
|
||||
};
|
||||
use polars::{io::parquet::ParquetWriter, prelude::*};
|
||||
use std::{
|
||||
fs::{self, File},
|
||||
io::{self, BufRead, BufReader, Read},
|
||||
path::Path,
|
||||
};
|
||||
use tempfile_fast::PersistableTempFile;
|
||||
|
||||
#[global_allocator]
|
||||
static GLOBAL: Jemalloc = Jemalloc;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let file_name = std::env::args().nth(1).expect("file name missing");
|
||||
let input_path = Path::new("narinfo.zst").join(&file_name);
|
||||
let output_path = Path::new("narinfo.pq").join(&file_name);
|
||||
|
||||
match fs::metadata(&output_path) {
|
||||
Err(e) if e.kind() == io::ErrorKind::NotFound => {}
|
||||
Err(e) => bail!(e),
|
||||
Ok(_) => bail!("output path already exists: {output_path:?}"),
|
||||
}
|
||||
|
||||
let reader = File::open(input_path).and_then(zstd::Decoder::new)?;
|
||||
let mut frame = FrameBuilder::default();
|
||||
|
||||
for_each(reader, |s| {
|
||||
let entry = NarInfo::parse(&s).context("couldn't parse entry:\n{s}")?;
|
||||
frame.push(&entry);
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
let mut frame = frame.finish();
|
||||
let mut writer = PersistableTempFile::new_in(output_path.parent().unwrap())?;
|
||||
|
||||
ParquetWriter::new(&mut writer)
|
||||
.with_compression(ParquetCompression::Gzip(None))
|
||||
.with_statistics(true)
|
||||
.finish(frame.align_chunks())?;
|
||||
|
||||
writer
|
||||
.persist_noclobber(output_path)
|
||||
.map_err(|e| e.error)
|
||||
.context("couldn't commit output file")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn for_each(reader: impl Read, mut f: impl FnMut(&str) -> Result<()>) -> Result<()> {
|
||||
let mut reader = BufReader::new(reader);
|
||||
let mut group = String::new();
|
||||
loop {
|
||||
let prev_len = group.len();
|
||||
|
||||
if prev_len > 1024 * 1024 {
|
||||
bail!("excessively large segment");
|
||||
}
|
||||
|
||||
reader.read_line(&mut group)?;
|
||||
let (prev, line) = group.split_at(prev_len);
|
||||
|
||||
// EOF
|
||||
if line.is_empty() {
|
||||
break;
|
||||
}
|
||||
|
||||
// skip empty line
|
||||
if line == "\n" {
|
||||
group.pop().unwrap();
|
||||
continue;
|
||||
}
|
||||
|
||||
if !prev.is_empty() && line.starts_with("StorePath:") {
|
||||
f(prev)?;
|
||||
group.drain(..prev_len);
|
||||
}
|
||||
}
|
||||
|
||||
if !group.is_empty() {
|
||||
f(&group)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// [FrameBuilder] builds a [DataFrame] out of [NarInfo]s.
|
||||
/// The exact format is still in flux.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
/// |narinfos: &[NarInfo]| -> DataFrame {
|
||||
/// let frame_builder = FrameBuilder::default();
|
||||
/// narinfos.for_each(|n| frame_builder.push(n));
|
||||
/// frame_builder.finish()
|
||||
/// }
|
||||
/// ```
|
||||
struct FrameBuilder {
|
||||
store_path_hash_str: StringChunkedBuilder,
|
||||
store_path_hash: BinaryChunkedBuilder,
|
||||
store_path_name: StringChunkedBuilder,
|
||||
deriver_hash_str: StringChunkedBuilder,
|
||||
deriver_hash: BinaryChunkedBuilder,
|
||||
deriver_name: StringChunkedBuilder,
|
||||
nar_hash: BinaryChunkedBuilder,
|
||||
nar_size: PrimitiveChunkedBuilder<UInt64Type>,
|
||||
references: ListBinaryChunkedBuilder,
|
||||
ca_algo: CategoricalChunkedBuilder<'static>,
|
||||
ca_hash: BinaryChunkedBuilder,
|
||||
signature: BinaryChunkedBuilder,
|
||||
file_hash: BinaryChunkedBuilder,
|
||||
file_size: PrimitiveChunkedBuilder<UInt64Type>,
|
||||
compression: CategoricalChunkedBuilder<'static>,
|
||||
quirk_references_out_of_order: BooleanChunkedBuilder,
|
||||
quirk_nar_hash_hex: BooleanChunkedBuilder,
|
||||
}
|
||||
|
||||
impl Default for FrameBuilder {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
store_path_hash_str: StringChunkedBuilder::new("store_path_hash_str", 0, 0),
|
||||
store_path_hash: BinaryChunkedBuilder::new("store_path_hash", 0, 0),
|
||||
store_path_name: StringChunkedBuilder::new("store_path_name", 0, 0),
|
||||
deriver_hash_str: StringChunkedBuilder::new("deriver_hash_str", 0, 0),
|
||||
deriver_hash: BinaryChunkedBuilder::new("deriver_hash", 0, 0),
|
||||
deriver_name: StringChunkedBuilder::new("deriver_name", 0, 0),
|
||||
nar_hash: BinaryChunkedBuilder::new("nar_hash", 0, 0),
|
||||
nar_size: PrimitiveChunkedBuilder::new("nar_size", 0),
|
||||
references: ListBinaryChunkedBuilder::new("references", 0, 0),
|
||||
signature: BinaryChunkedBuilder::new("signature", 0, 0),
|
||||
ca_algo: CategoricalChunkedBuilder::new("ca_algo", 0, CategoricalOrdering::Lexical),
|
||||
ca_hash: BinaryChunkedBuilder::new("ca_hash", 0, 0),
|
||||
file_hash: BinaryChunkedBuilder::new("file_hash", 0, 0),
|
||||
file_size: PrimitiveChunkedBuilder::new("file_size", 0),
|
||||
compression: CategoricalChunkedBuilder::new(
|
||||
"compression",
|
||||
0,
|
||||
CategoricalOrdering::Lexical,
|
||||
),
|
||||
quirk_references_out_of_order: BooleanChunkedBuilder::new(
|
||||
"quirk_references_out_of_order",
|
||||
0,
|
||||
),
|
||||
quirk_nar_hash_hex: BooleanChunkedBuilder::new("quirk_nar_hash_hex", 0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FrameBuilder {
|
||||
fn push(&mut self, entry: &NarInfo) {
|
||||
self.store_path_hash_str
|
||||
.append_value(nixbase32::encode(entry.store_path.digest()));
|
||||
self.store_path_hash.append_value(entry.store_path.digest());
|
||||
self.store_path_name.append_value(entry.store_path.name());
|
||||
|
||||
if let Some(deriver) = &entry.deriver {
|
||||
self.deriver_hash_str
|
||||
.append_value(nixbase32::encode(deriver.digest()));
|
||||
self.deriver_hash.append_value(deriver.digest());
|
||||
self.deriver_name.append_value(deriver.name());
|
||||
} else {
|
||||
self.deriver_hash_str.append_null();
|
||||
self.deriver_hash.append_null();
|
||||
self.deriver_name.append_null();
|
||||
}
|
||||
|
||||
self.nar_hash.append_value(&entry.nar_hash);
|
||||
self.nar_size.append_value(entry.nar_size);
|
||||
|
||||
self.references
|
||||
.append_values_iter(entry.references.iter().map(|r| r.digest().as_slice()));
|
||||
|
||||
assert!(entry.signatures.len() <= 1);
|
||||
self.signature
|
||||
.append_option(entry.signatures.get(0).map(|sig| {
|
||||
assert_eq!(sig.name(), &"cache.nixos.org-1");
|
||||
sig.bytes()
|
||||
}));
|
||||
|
||||
if let Some(ca) = &entry.ca {
|
||||
self.ca_algo.append_value(ca.algo_str());
|
||||
self.ca_hash.append_value(ca.hash().digest_as_bytes());
|
||||
} else {
|
||||
self.ca_algo.append_null();
|
||||
self.ca_hash.append_null();
|
||||
}
|
||||
|
||||
let file_hash = entry.file_hash.as_ref().unwrap();
|
||||
let file_size = entry.file_size.unwrap();
|
||||
|
||||
self.file_hash.append_value(file_hash);
|
||||
self.file_size.append_value(file_size);
|
||||
|
||||
let (compression, extension) = match entry.compression {
|
||||
Some("bzip2") => ("bzip2", "bz2"),
|
||||
Some("xz") => ("xz", "xz"),
|
||||
Some("zstd") => ("zstd", "zst"),
|
||||
x => panic!("unknown compression algorithm: {x:?}"),
|
||||
};
|
||||
|
||||
self.compression.append_value(compression);
|
||||
|
||||
let mut file_name = nixbase32::encode(file_hash);
|
||||
file_name.push_str(".nar.");
|
||||
file_name.push_str(extension);
|
||||
|
||||
assert_eq!(entry.url.strip_prefix("nar/").unwrap(), file_name);
|
||||
|
||||
{
|
||||
use narinfo::Flags;
|
||||
|
||||
self.quirk_references_out_of_order
|
||||
.append_value(entry.flags.contains(Flags::REFERENCES_OUT_OF_ORDER));
|
||||
|
||||
self.quirk_nar_hash_hex
|
||||
.append_value(entry.flags.contains(Flags::NAR_HASH_HEX));
|
||||
|
||||
let quirks = Flags::REFERENCES_OUT_OF_ORDER | Flags::NAR_HASH_HEX;
|
||||
let unknown_flags = entry.flags.difference(quirks);
|
||||
|
||||
assert!(
|
||||
unknown_flags.is_empty(),
|
||||
"rejecting flags: {unknown_flags:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn finish(mut self) -> DataFrame {
|
||||
df! {
|
||||
"store_path_hash_str" => self.store_path_hash_str.finish().into_series(),
|
||||
"store_path_hash" => self.store_path_hash.finish().into_series(),
|
||||
"store_path_name" => self.store_path_name.finish().into_series(),
|
||||
"deriver_hash_str" => self.deriver_hash_str.finish().into_series(),
|
||||
"deriver_hash" => self.deriver_hash.finish().into_series(),
|
||||
"deriver_name" => self.deriver_name.finish().into_series(),
|
||||
"nar_hash" => self.nar_hash.finish().into_series(),
|
||||
"nar_size" => self.nar_size.finish().into_series(),
|
||||
"references" => self.references.finish().into_series(),
|
||||
"signature" => self.signature.finish().into_series(),
|
||||
"ca_algo" => self.ca_algo.finish().into_series(),
|
||||
"ca_hash" => self.ca_hash.finish().into_series(),
|
||||
"file_hash" => self.file_hash.finish().into_series(),
|
||||
"file_size" => self.file_size.finish().into_series(),
|
||||
"compression" => self.compression.finish().into_series(),
|
||||
"quirk_references_out_of_order" => self.quirk_references_out_of_order.finish().into_series(),
|
||||
"quirk_nar_hash_hex" => self.quirk_nar_hash_hex.finish().into_series()
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
5
contrib/refscan/.gitignore
vendored
Normal file
5
contrib/refscan/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
# SPDX-FileCopyrightText: edef <edef@edef.eu>
|
||||
# SPDX-License-Identifier: CC0-1.0
|
||||
|
||||
/target
|
||||
**/*.rs.bk
|
||||
7
contrib/refscan/Cargo.lock
generated
Normal file
7
contrib/refscan/Cargo.lock
generated
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "refscan"
|
||||
version = "0.1.0"
|
||||
3
contrib/refscan/Cargo.lock.license
Normal file
3
contrib/refscan/Cargo.lock.license
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
SPDX-FileCopyrightText: edef <edef@edef.eu>
|
||||
SPDX-License-Identifier: CC0-1.0
|
||||
|
||||
10
contrib/refscan/Cargo.toml
Normal file
10
contrib/refscan/Cargo.toml
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
# SPDX-FileCopyrightText: edef <edef@edef.eu>
|
||||
# SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
[package]
|
||||
name = "refscan"
|
||||
version = "0.1.0"
|
||||
authors = ["edef <edef@edef.eu>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
121
contrib/refscan/LICENSES/CC0-1.0.txt
Normal file
121
contrib/refscan/LICENSES/CC0-1.0.txt
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
Creative Commons Legal Code
|
||||
|
||||
CC0 1.0 Universal
|
||||
|
||||
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
|
||||
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
|
||||
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
|
||||
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
|
||||
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
|
||||
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
|
||||
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
|
||||
HEREUNDER.
|
||||
|
||||
Statement of Purpose
|
||||
|
||||
The laws of most jurisdictions throughout the world automatically confer
|
||||
exclusive Copyright and Related Rights (defined below) upon the creator
|
||||
and subsequent owner(s) (each and all, an "owner") of an original work of
|
||||
authorship and/or a database (each, a "Work").
|
||||
|
||||
Certain owners wish to permanently relinquish those rights to a Work for
|
||||
the purpose of contributing to a commons of creative, cultural and
|
||||
scientific works ("Commons") that the public can reliably and without fear
|
||||
of later claims of infringement build upon, modify, incorporate in other
|
||||
works, reuse and redistribute as freely as possible in any form whatsoever
|
||||
and for any purposes, including without limitation commercial purposes.
|
||||
These owners may contribute to the Commons to promote the ideal of a free
|
||||
culture and the further production of creative, cultural and scientific
|
||||
works, or to gain reputation or greater distribution for their Work in
|
||||
part through the use and efforts of others.
|
||||
|
||||
For these and/or other purposes and motivations, and without any
|
||||
expectation of additional consideration or compensation, the person
|
||||
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
|
||||
is an owner of Copyright and Related Rights in the Work, voluntarily
|
||||
elects to apply CC0 to the Work and publicly distribute the Work under its
|
||||
terms, with knowledge of his or her Copyright and Related Rights in the
|
||||
Work and the meaning and intended legal effect of CC0 on those rights.
|
||||
|
||||
1. Copyright and Related Rights. A Work made available under CC0 may be
|
||||
protected by copyright and related or neighboring rights ("Copyright and
|
||||
Related Rights"). Copyright and Related Rights include, but are not
|
||||
limited to, the following:
|
||||
|
||||
i. the right to reproduce, adapt, distribute, perform, display,
|
||||
communicate, and translate a Work;
|
||||
ii. moral rights retained by the original author(s) and/or performer(s);
|
||||
iii. publicity and privacy rights pertaining to a person's image or
|
||||
likeness depicted in a Work;
|
||||
iv. rights protecting against unfair competition in regards to a Work,
|
||||
subject to the limitations in paragraph 4(a), below;
|
||||
v. rights protecting the extraction, dissemination, use and reuse of data
|
||||
in a Work;
|
||||
vi. database rights (such as those arising under Directive 96/9/EC of the
|
||||
European Parliament and of the Council of 11 March 1996 on the legal
|
||||
protection of databases, and under any national implementation
|
||||
thereof, including any amended or successor version of such
|
||||
directive); and
|
||||
vii. other similar, equivalent or corresponding rights throughout the
|
||||
world based on applicable law or treaty, and any national
|
||||
implementations thereof.
|
||||
|
||||
2. Waiver. To the greatest extent permitted by, but not in contravention
|
||||
of, applicable law, Affirmer hereby overtly, fully, permanently,
|
||||
irrevocably and unconditionally waives, abandons, and surrenders all of
|
||||
Affirmer's Copyright and Related Rights and associated claims and causes
|
||||
of action, whether now known or unknown (including existing as well as
|
||||
future claims and causes of action), in the Work (i) in all territories
|
||||
worldwide, (ii) for the maximum duration provided by applicable law or
|
||||
treaty (including future time extensions), (iii) in any current or future
|
||||
medium and for any number of copies, and (iv) for any purpose whatsoever,
|
||||
including without limitation commercial, advertising or promotional
|
||||
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
|
||||
member of the public at large and to the detriment of Affirmer's heirs and
|
||||
successors, fully intending that such Waiver shall not be subject to
|
||||
revocation, rescission, cancellation, termination, or any other legal or
|
||||
equitable action to disrupt the quiet enjoyment of the Work by the public
|
||||
as contemplated by Affirmer's express Statement of Purpose.
|
||||
|
||||
3. Public License Fallback. Should any part of the Waiver for any reason
|
||||
be judged legally invalid or ineffective under applicable law, then the
|
||||
Waiver shall be preserved to the maximum extent permitted taking into
|
||||
account Affirmer's express Statement of Purpose. In addition, to the
|
||||
extent the Waiver is so judged Affirmer hereby grants to each affected
|
||||
person a royalty-free, non transferable, non sublicensable, non exclusive,
|
||||
irrevocable and unconditional license to exercise Affirmer's Copyright and
|
||||
Related Rights in the Work (i) in all territories worldwide, (ii) for the
|
||||
maximum duration provided by applicable law or treaty (including future
|
||||
time extensions), (iii) in any current or future medium and for any number
|
||||
of copies, and (iv) for any purpose whatsoever, including without
|
||||
limitation commercial, advertising or promotional purposes (the
|
||||
"License"). The License shall be deemed effective as of the date CC0 was
|
||||
applied by Affirmer to the Work. Should any part of the License for any
|
||||
reason be judged legally invalid or ineffective under applicable law, such
|
||||
partial invalidity or ineffectiveness shall not invalidate the remainder
|
||||
of the License, and in such case Affirmer hereby affirms that he or she
|
||||
will not (i) exercise any of his or her remaining Copyright and Related
|
||||
Rights in the Work or (ii) assert any associated claims and causes of
|
||||
action with respect to the Work, in either case contrary to Affirmer's
|
||||
express Statement of Purpose.
|
||||
|
||||
4. Limitations and Disclaimers.
|
||||
|
||||
a. No trademark or patent rights held by Affirmer are waived, abandoned,
|
||||
surrendered, licensed or otherwise affected by this document.
|
||||
b. Affirmer offers the Work as-is and makes no representations or
|
||||
warranties of any kind concerning the Work, express, implied,
|
||||
statutory or otherwise, including without limitation warranties of
|
||||
title, merchantability, fitness for a particular purpose, non
|
||||
infringement, or the absence of latent or other defects, accuracy, or
|
||||
the present or absence of errors, whether or not discoverable, all to
|
||||
the greatest extent permissible under applicable law.
|
||||
c. Affirmer disclaims responsibility for clearing rights of other persons
|
||||
that may apply to the Work or any use thereof, including without
|
||||
limitation any person's Copyright and Related Rights in the Work.
|
||||
Further, Affirmer disclaims responsibility for obtaining any necessary
|
||||
consents, permissions or other rights required for any use of the
|
||||
Work.
|
||||
d. Affirmer understands and acknowledges that Creative Commons is not a
|
||||
party to this document and has no duty or obligation with respect to
|
||||
this CC0 or use of the Work.
|
||||
373
contrib/refscan/LICENSES/MPL-2.0.txt
Normal file
373
contrib/refscan/LICENSES/MPL-2.0.txt
Normal file
|
|
@ -0,0 +1,373 @@
|
|||
Mozilla Public License Version 2.0
|
||||
==================================
|
||||
|
||||
1. Definitions
|
||||
--------------
|
||||
|
||||
1.1. "Contributor"
|
||||
means each individual or legal entity that creates, contributes to
|
||||
the creation of, or owns Covered Software.
|
||||
|
||||
1.2. "Contributor Version"
|
||||
means the combination of the Contributions of others (if any) used
|
||||
by a Contributor and that particular Contributor's Contribution.
|
||||
|
||||
1.3. "Contribution"
|
||||
means Covered Software of a particular Contributor.
|
||||
|
||||
1.4. "Covered Software"
|
||||
means Source Code Form to which the initial Contributor has attached
|
||||
the notice in Exhibit A, the Executable Form of such Source Code
|
||||
Form, and Modifications of such Source Code Form, in each case
|
||||
including portions thereof.
|
||||
|
||||
1.5. "Incompatible With Secondary Licenses"
|
||||
means
|
||||
|
||||
(a) that the initial Contributor has attached the notice described
|
||||
in Exhibit B to the Covered Software; or
|
||||
|
||||
(b) that the Covered Software was made available under the terms of
|
||||
version 1.1 or earlier of the License, but not also under the
|
||||
terms of a Secondary License.
|
||||
|
||||
1.6. "Executable Form"
|
||||
means any form of the work other than Source Code Form.
|
||||
|
||||
1.7. "Larger Work"
|
||||
means a work that combines Covered Software with other material, in
|
||||
a separate file or files, that is not Covered Software.
|
||||
|
||||
1.8. "License"
|
||||
means this document.
|
||||
|
||||
1.9. "Licensable"
|
||||
means having the right to grant, to the maximum extent possible,
|
||||
whether at the time of the initial grant or subsequently, any and
|
||||
all of the rights conveyed by this License.
|
||||
|
||||
1.10. "Modifications"
|
||||
means any of the following:
|
||||
|
||||
(a) any file in Source Code Form that results from an addition to,
|
||||
deletion from, or modification of the contents of Covered
|
||||
Software; or
|
||||
|
||||
(b) any new file in Source Code Form that contains any Covered
|
||||
Software.
|
||||
|
||||
1.11. "Patent Claims" of a Contributor
|
||||
means any patent claim(s), including without limitation, method,
|
||||
process, and apparatus claims, in any patent Licensable by such
|
||||
Contributor that would be infringed, but for the grant of the
|
||||
License, by the making, using, selling, offering for sale, having
|
||||
made, import, or transfer of either its Contributions or its
|
||||
Contributor Version.
|
||||
|
||||
1.12. "Secondary License"
|
||||
means either the GNU General Public License, Version 2.0, the GNU
|
||||
Lesser General Public License, Version 2.1, the GNU Affero General
|
||||
Public License, Version 3.0, or any later versions of those
|
||||
licenses.
|
||||
|
||||
1.13. "Source Code Form"
|
||||
means the form of the work preferred for making modifications.
|
||||
|
||||
1.14. "You" (or "Your")
|
||||
means an individual or a legal entity exercising rights under this
|
||||
License. For legal entities, "You" includes any entity that
|
||||
controls, is controlled by, or is under common control with You. For
|
||||
purposes of this definition, "control" means (a) the power, direct
|
||||
or indirect, to cause the direction or management of such entity,
|
||||
whether by contract or otherwise, or (b) ownership of more than
|
||||
fifty percent (50%) of the outstanding shares or beneficial
|
||||
ownership of such entity.
|
||||
|
||||
2. License Grants and Conditions
|
||||
--------------------------------
|
||||
|
||||
2.1. Grants
|
||||
|
||||
Each Contributor hereby grants You a world-wide, royalty-free,
|
||||
non-exclusive license:
|
||||
|
||||
(a) under intellectual property rights (other than patent or trademark)
|
||||
Licensable by such Contributor to use, reproduce, make available,
|
||||
modify, display, perform, distribute, and otherwise exploit its
|
||||
Contributions, either on an unmodified basis, with Modifications, or
|
||||
as part of a Larger Work; and
|
||||
|
||||
(b) under Patent Claims of such Contributor to make, use, sell, offer
|
||||
for sale, have made, import, and otherwise transfer either its
|
||||
Contributions or its Contributor Version.
|
||||
|
||||
2.2. Effective Date
|
||||
|
||||
The licenses granted in Section 2.1 with respect to any Contribution
|
||||
become effective for each Contribution on the date the Contributor first
|
||||
distributes such Contribution.
|
||||
|
||||
2.3. Limitations on Grant Scope
|
||||
|
||||
The licenses granted in this Section 2 are the only rights granted under
|
||||
this License. No additional rights or licenses will be implied from the
|
||||
distribution or licensing of Covered Software under this License.
|
||||
Notwithstanding Section 2.1(b) above, no patent license is granted by a
|
||||
Contributor:
|
||||
|
||||
(a) for any code that a Contributor has removed from Covered Software;
|
||||
or
|
||||
|
||||
(b) for infringements caused by: (i) Your and any other third party's
|
||||
modifications of Covered Software, or (ii) the combination of its
|
||||
Contributions with other software (except as part of its Contributor
|
||||
Version); or
|
||||
|
||||
(c) under Patent Claims infringed by Covered Software in the absence of
|
||||
its Contributions.
|
||||
|
||||
This License does not grant any rights in the trademarks, service marks,
|
||||
or logos of any Contributor (except as may be necessary to comply with
|
||||
the notice requirements in Section 3.4).
|
||||
|
||||
2.4. Subsequent Licenses
|
||||
|
||||
No Contributor makes additional grants as a result of Your choice to
|
||||
distribute the Covered Software under a subsequent version of this
|
||||
License (see Section 10.2) or under the terms of a Secondary License (if
|
||||
permitted under the terms of Section 3.3).
|
||||
|
||||
2.5. Representation
|
||||
|
||||
Each Contributor represents that the Contributor believes its
|
||||
Contributions are its original creation(s) or it has sufficient rights
|
||||
to grant the rights to its Contributions conveyed by this License.
|
||||
|
||||
2.6. Fair Use
|
||||
|
||||
This License is not intended to limit any rights You have under
|
||||
applicable copyright doctrines of fair use, fair dealing, or other
|
||||
equivalents.
|
||||
|
||||
2.7. Conditions
|
||||
|
||||
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
|
||||
in Section 2.1.
|
||||
|
||||
3. Responsibilities
|
||||
-------------------
|
||||
|
||||
3.1. Distribution of Source Form
|
||||
|
||||
All distribution of Covered Software in Source Code Form, including any
|
||||
Modifications that You create or to which You contribute, must be under
|
||||
the terms of this License. You must inform recipients that the Source
|
||||
Code Form of the Covered Software is governed by the terms of this
|
||||
License, and how they can obtain a copy of this License. You may not
|
||||
attempt to alter or restrict the recipients' rights in the Source Code
|
||||
Form.
|
||||
|
||||
3.2. Distribution of Executable Form
|
||||
|
||||
If You distribute Covered Software in Executable Form then:
|
||||
|
||||
(a) such Covered Software must also be made available in Source Code
|
||||
Form, as described in Section 3.1, and You must inform recipients of
|
||||
the Executable Form how they can obtain a copy of such Source Code
|
||||
Form by reasonable means in a timely manner, at a charge no more
|
||||
than the cost of distribution to the recipient; and
|
||||
|
||||
(b) You may distribute such Executable Form under the terms of this
|
||||
License, or sublicense it under different terms, provided that the
|
||||
license for the Executable Form does not attempt to limit or alter
|
||||
the recipients' rights in the Source Code Form under this License.
|
||||
|
||||
3.3. Distribution of a Larger Work
|
||||
|
||||
You may create and distribute a Larger Work under terms of Your choice,
|
||||
provided that You also comply with the requirements of this License for
|
||||
the Covered Software. If the Larger Work is a combination of Covered
|
||||
Software with a work governed by one or more Secondary Licenses, and the
|
||||
Covered Software is not Incompatible With Secondary Licenses, this
|
||||
License permits You to additionally distribute such Covered Software
|
||||
under the terms of such Secondary License(s), so that the recipient of
|
||||
the Larger Work may, at their option, further distribute the Covered
|
||||
Software under the terms of either this License or such Secondary
|
||||
License(s).
|
||||
|
||||
3.4. Notices
|
||||
|
||||
You may not remove or alter the substance of any license notices
|
||||
(including copyright notices, patent notices, disclaimers of warranty,
|
||||
or limitations of liability) contained within the Source Code Form of
|
||||
the Covered Software, except that You may alter any license notices to
|
||||
the extent required to remedy known factual inaccuracies.
|
||||
|
||||
3.5. Application of Additional Terms
|
||||
|
||||
You may choose to offer, and to charge a fee for, warranty, support,
|
||||
indemnity or liability obligations to one or more recipients of Covered
|
||||
Software. However, You may do so only on Your own behalf, and not on
|
||||
behalf of any Contributor. You must make it absolutely clear that any
|
||||
such warranty, support, indemnity, or liability obligation is offered by
|
||||
You alone, and You hereby agree to indemnify every Contributor for any
|
||||
liability incurred by such Contributor as a result of warranty, support,
|
||||
indemnity or liability terms You offer. You may include additional
|
||||
disclaimers of warranty and limitations of liability specific to any
|
||||
jurisdiction.
|
||||
|
||||
4. Inability to Comply Due to Statute or Regulation
|
||||
---------------------------------------------------
|
||||
|
||||
If it is impossible for You to comply with any of the terms of this
|
||||
License with respect to some or all of the Covered Software due to
|
||||
statute, judicial order, or regulation then You must: (a) comply with
|
||||
the terms of this License to the maximum extent possible; and (b)
|
||||
describe the limitations and the code they affect. Such description must
|
||||
be placed in a text file included with all distributions of the Covered
|
||||
Software under this License. Except to the extent prohibited by statute
|
||||
or regulation, such description must be sufficiently detailed for a
|
||||
recipient of ordinary skill to be able to understand it.
|
||||
|
||||
5. Termination
|
||||
--------------
|
||||
|
||||
5.1. The rights granted under this License will terminate automatically
|
||||
if You fail to comply with any of its terms. However, if You become
|
||||
compliant, then the rights granted under this License from a particular
|
||||
Contributor are reinstated (a) provisionally, unless and until such
|
||||
Contributor explicitly and finally terminates Your grants, and (b) on an
|
||||
ongoing basis, if such Contributor fails to notify You of the
|
||||
non-compliance by some reasonable means prior to 60 days after You have
|
||||
come back into compliance. Moreover, Your grants from a particular
|
||||
Contributor are reinstated on an ongoing basis if such Contributor
|
||||
notifies You of the non-compliance by some reasonable means, this is the
|
||||
first time You have received notice of non-compliance with this License
|
||||
from such Contributor, and You become compliant prior to 30 days after
|
||||
Your receipt of the notice.
|
||||
|
||||
5.2. If You initiate litigation against any entity by asserting a patent
|
||||
infringement claim (excluding declaratory judgment actions,
|
||||
counter-claims, and cross-claims) alleging that a Contributor Version
|
||||
directly or indirectly infringes any patent, then the rights granted to
|
||||
You by any and all Contributors for the Covered Software under Section
|
||||
2.1 of this License shall terminate.
|
||||
|
||||
5.3. In the event of termination under Sections 5.1 or 5.2 above, all
|
||||
end user license agreements (excluding distributors and resellers) which
|
||||
have been validly granted by You or Your distributors under this License
|
||||
prior to termination shall survive termination.
|
||||
|
||||
************************************************************************
|
||||
* *
|
||||
* 6. Disclaimer of Warranty *
|
||||
* ------------------------- *
|
||||
* *
|
||||
* Covered Software is provided under this License on an "as is" *
|
||||
* basis, without warranty of any kind, either expressed, implied, or *
|
||||
* statutory, including, without limitation, warranties that the *
|
||||
* Covered Software is free of defects, merchantable, fit for a *
|
||||
* particular purpose or non-infringing. The entire risk as to the *
|
||||
* quality and performance of the Covered Software is with You. *
|
||||
* Should any Covered Software prove defective in any respect, You *
|
||||
* (not any Contributor) assume the cost of any necessary servicing, *
|
||||
* repair, or correction. This disclaimer of warranty constitutes an *
|
||||
* essential part of this License. No use of any Covered Software is *
|
||||
* authorized under this License except under this disclaimer. *
|
||||
* *
|
||||
************************************************************************
|
||||
|
||||
************************************************************************
|
||||
* *
|
||||
* 7. Limitation of Liability *
|
||||
* -------------------------- *
|
||||
* *
|
||||
* Under no circumstances and under no legal theory, whether tort *
|
||||
* (including negligence), contract, or otherwise, shall any *
|
||||
* Contributor, or anyone who distributes Covered Software as *
|
||||
* permitted above, be liable to You for any direct, indirect, *
|
||||
* special, incidental, or consequential damages of any character *
|
||||
* including, without limitation, damages for lost profits, loss of *
|
||||
* goodwill, work stoppage, computer failure or malfunction, or any *
|
||||
* and all other commercial damages or losses, even if such party *
|
||||
* shall have been informed of the possibility of such damages. This *
|
||||
* limitation of liability shall not apply to liability for death or *
|
||||
* personal injury resulting from such party's negligence to the *
|
||||
* extent applicable law prohibits such limitation. Some *
|
||||
* jurisdictions do not allow the exclusion or limitation of *
|
||||
* incidental or consequential damages, so this exclusion and *
|
||||
* limitation may not apply to You. *
|
||||
* *
|
||||
************************************************************************
|
||||
|
||||
8. Litigation
|
||||
-------------
|
||||
|
||||
Any litigation relating to this License may be brought only in the
|
||||
courts of a jurisdiction where the defendant maintains its principal
|
||||
place of business and such litigation shall be governed by laws of that
|
||||
jurisdiction, without reference to its conflict-of-law provisions.
|
||||
Nothing in this Section shall prevent a party's ability to bring
|
||||
cross-claims or counter-claims.
|
||||
|
||||
9. Miscellaneous
|
||||
----------------
|
||||
|
||||
This License represents the complete agreement concerning the subject
|
||||
matter hereof. If any provision of this License is held to be
|
||||
unenforceable, such provision shall be reformed only to the extent
|
||||
necessary to make it enforceable. Any law or regulation which provides
|
||||
that the language of a contract shall be construed against the drafter
|
||||
shall not be used to construe this License against a Contributor.
|
||||
|
||||
10. Versions of the License
|
||||
---------------------------
|
||||
|
||||
10.1. New Versions
|
||||
|
||||
Mozilla Foundation is the license steward. Except as provided in Section
|
||||
10.3, no one other than the license steward has the right to modify or
|
||||
publish new versions of this License. Each version will be given a
|
||||
distinguishing version number.
|
||||
|
||||
10.2. Effect of New Versions
|
||||
|
||||
You may distribute the Covered Software under the terms of the version
|
||||
of the License under which You originally received the Covered Software,
|
||||
or under the terms of any subsequent version published by the license
|
||||
steward.
|
||||
|
||||
10.3. Modified Versions
|
||||
|
||||
If you create software not governed by this License, and you want to
|
||||
create a new license for such software, you may create and use a
|
||||
modified version of this License if you rename the license and remove
|
||||
any references to the name of the license steward (except to note that
|
||||
such modified license differs from this License).
|
||||
|
||||
10.4. Distributing Source Code Form that is Incompatible With Secondary
|
||||
Licenses
|
||||
|
||||
If You choose to distribute Source Code Form that is Incompatible With
|
||||
Secondary Licenses under the terms of this version of the License, the
|
||||
notice described in Exhibit B of this License must be attached.
|
||||
|
||||
Exhibit A - Source Code Form License Notice
|
||||
-------------------------------------------
|
||||
|
||||
This Source Code Form is subject to the terms of the Mozilla Public
|
||||
License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
|
||||
If it is not possible or desirable to put the notice in a particular
|
||||
file, then You may include the notice in a location (such as a LICENSE
|
||||
file in a relevant directory) where a recipient would be likely to look
|
||||
for such a notice.
|
||||
|
||||
You may add additional accurate notices of copyright ownership.
|
||||
|
||||
Exhibit B - "Incompatible With Secondary Licenses" Notice
|
||||
---------------------------------------------------------
|
||||
|
||||
This Source Code Form is "Incompatible With Secondary Licenses", as
|
||||
defined by the Mozilla Public License, v. 2.0.
|
||||
154 contrib/refscan/src/lib.rs Normal file
@@ -0,0 +1,154 @@
// SPDX-FileCopyrightText: edef <edef@edef.eu>
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
use self::simd::u8x32;
|
||||
|
||||
fn prefilter(haystack: u8x32) -> u32 {
|
||||
let alp = haystack.gt(u8x32::splat(b'a' - 1)) & haystack.lt(u8x32::splat(b'z' + 1));
|
||||
let num = haystack.gt(u8x32::splat(b'0' - 1)) & haystack.lt(u8x32::splat(b'9' + 1));
|
||||
alp | num
|
||||
}
|
||||
|
||||
/// scan_clean returns `Err(&buffer[..n])` of known pointer-free data,
|
||||
/// or `Ok(buffer)` if the entire buffer is pointer-free.
|
||||
pub fn scan_clean(buffer: &[u8]) -> Result<&[u8], &[u8]> {
|
||||
let buffer = {
|
||||
let n = buffer.len() & !31;
|
||||
&buffer[..n]
|
||||
};
|
||||
|
||||
let mut masks = buffer
|
||||
.chunks_exact(32)
|
||||
.map(|chunk| prefilter(u8x32::from_slice_unaligned(chunk)))
|
||||
.enumerate()
|
||||
.map(|e| (e.0 * 32, e.1))
|
||||
.peekable();
|
||||
|
||||
while let Some((offset, mask)) = masks.next() {
|
||||
let peek = masks.peek().map(|x| x.1).unwrap_or(!0 >> 1);
|
||||
let n = (!mask).leading_zeros() + (!peek).trailing_zeros();
|
||||
if n >= 32 {
|
||||
let offset = offset + mask.trailing_zeros() as usize;
|
||||
return Err(&buffer[..offset]);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(buffer)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
#[test]
|
||||
fn scan_tail() {
|
||||
let buffer = b"_xfbmj7sl2ikicym9x3yq7cms5qx1w39k";
|
||||
assert_eq!(crate::scan_clean(buffer), Err(&buffer[..1]));
|
||||
}
|
||||
#[test]
|
||||
fn scan_straddle() {
|
||||
let buffer = b"________________xfbmj7sl2ikicym9x3yq7cms5qx1w39k________________";
|
||||
assert_eq!(crate::scan_clean(buffer), Err(&buffer[..16]));
|
||||
}
|
||||
#[test]
|
||||
fn scan_clean() {
|
||||
let buffer = b"x_______________xfbmj7sl2ikicym9x3yq-cms5qx1w3-k________________";
|
||||
assert_eq!(crate::scan_clean(buffer), Ok(&buffer[..]));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
mod simd {
|
||||
#[cfg(target_arch = "x86")]
|
||||
use std::arch::x86 as arch;
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
use std::arch::x86_64 as arch;
|
||||
use {
|
||||
arch::{__m256i, _mm256_cmpgt_epi8, _mm256_movemask_epi8, _mm256_set1_epi8},
|
||||
std::ptr,
|
||||
};
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct u8x32(__m256i);
|
||||
|
||||
impl u8x32 {
|
||||
#[inline(always)]
|
||||
pub fn from_slice_unaligned(slice: &[u8]) -> Self {
|
||||
assert_eq!(slice.len(), 32);
|
||||
u8x32(unsafe { ptr::read_unaligned(slice.as_ptr().cast()) })
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn splat(x: u8) -> Self {
|
||||
u8x32(unsafe { _mm256_set1_epi8(x as i8) })
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gt(self, b: Self) -> u32 {
|
||||
unsafe { _mm256_movemask_epi8(_mm256_cmpgt_epi8(self.0, b.0)) as u32 }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn lt(self, b: Self) -> u32 {
|
||||
b.gt(self)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
mod simd {
|
||||
use std::{
|
||||
arch::aarch64::{
|
||||
uint8x16_t as u8x16, vaddv_u8, vandq_u8, vcgtq_u8, vdupq_n_u8, vget_high_u8,
|
||||
vget_low_u8, vshlq_u8,
|
||||
},
|
||||
mem, ptr,
|
||||
};
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
#[derive(Copy, Clone)]
|
||||
#[repr(transparent)]
|
||||
pub struct u8x32([u8x16; 2]);
|
||||
|
||||
impl u8x32 {
|
||||
#[cfg(target_endian = "little")]
|
||||
#[inline(always)]
|
||||
pub fn from_slice_unaligned(slice: &[u8]) -> Self {
|
||||
assert_eq!(slice.len(), 32);
|
||||
u8x32(unsafe { ptr::read_unaligned(slice.as_ptr().cast()) })
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn splat(x: u8) -> Self {
|
||||
u8x32(unsafe {
|
||||
let x = vdupq_n_u8(x);
|
||||
[x, x]
|
||||
})
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gt(&self, b: Self) -> u32 {
|
||||
let u8x32([al, ah]) = *self;
|
||||
let u8x32([bl, bh]) = b;
|
||||
|
||||
fn f(a: u8x16, b: u8x16) -> u32 {
|
||||
unsafe {
|
||||
let c = vshlq_u8(
|
||||
vandq_u8(vdupq_n_u8(0x80), vcgtq_u8(a, b)),
|
||||
mem::transmute([
|
||||
-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0i8,
|
||||
]),
|
||||
);
|
||||
|
||||
(vaddv_u8(vget_low_u8(c)) as u32) << 0 | (vaddv_u8(vget_high_u8(c)) as u32) << 8
|
||||
}
|
||||
}
|
||||
|
||||
f(al, bl) << 0 | f(ah, bh) << 16
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn lt(self, b: Self) -> u32 {
|
||||
b.gt(self)
|
||||
}
|
||||
}
|
||||
}
|
||||
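For readers on other architectures, here is a minimal, hypothetical scalar sketch (not part of this commit) of the same `simd` interface consumed by `prefilter` above; it mirrors the unsigned-comparison semantics of the aarch64 path rather than the signed `_mm256_cmpgt_epi8` used on x86.

#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
mod simd {
    #[allow(non_camel_case_types)]
    #[derive(Copy, Clone)]
    pub struct u8x32([u8; 32]);

    impl u8x32 {
        pub fn from_slice_unaligned(slice: &[u8]) -> Self {
            assert_eq!(slice.len(), 32);
            let mut buf = [0u8; 32];
            buf.copy_from_slice(slice);
            u8x32(buf)
        }

        pub fn splat(x: u8) -> Self {
            u8x32([x; 32])
        }

        /// One mask bit per lane, set iff `self[i] > b[i]` (unsigned),
        /// matching the movemask convention of the vector paths above.
        pub fn gt(self, b: Self) -> u32 {
            self.0
                .iter()
                .zip(b.0.iter())
                .enumerate()
                .fold(0u32, |mask, (i, (a, b))| mask | (u32::from(a > b) << i))
        }

        pub fn lt(self, b: Self) -> u32 {
            b.gt(self)
        }
    }
}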
58 contrib/refscan/src/main.rs Normal file
@@ -0,0 +1,58 @@
// SPDX-FileCopyrightText: edef <edef@edef.eu>
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
use std::{
|
||||
collections::BTreeSet as Set,
|
||||
convert::TryInto,
|
||||
io::{self, Read},
|
||||
str,
|
||||
};
|
||||
|
||||
fn main() {
|
||||
let max_refs: Set<[u8; 32]> = include_str!("../testdata/maxrefs")
|
||||
.lines()
|
||||
.map(|l| l.as_bytes().try_into().unwrap())
|
||||
.collect();
|
||||
|
||||
let input = {
|
||||
let stdin = io::stdin();
|
||||
let mut buffer = Vec::new();
|
||||
stdin.lock().read_to_end(&mut buffer).unwrap();
|
||||
buffer
|
||||
};
|
||||
|
||||
let base = input.as_ptr() as usize;
|
||||
let mut input: &[u8] = &input;
|
||||
while input.len() >= 32 {
|
||||
match refscan::scan_clean(&input) {
|
||||
Ok(buffer) | Err(buffer) => {
|
||||
let n = buffer.len();
|
||||
input = &input[n..];
|
||||
}
|
||||
}
|
||||
|
||||
let buffer = {
|
||||
let idx = input.iter().position(|x| match x {
|
||||
b'a'..=b'z' | b'0'..=b'9' => false,
|
||||
_ => true,
|
||||
});
|
||||
idx.map(|idx| &input[..idx]).unwrap_or(input)
|
||||
};
|
||||
|
||||
for chunk in buffer.windows(32) {
|
||||
let offset = (chunk.as_ptr() as usize) - base;
|
||||
let chunk = {
|
||||
let mut fixed = [0u8; 32];
|
||||
fixed.copy_from_slice(chunk);
|
||||
fixed
|
||||
};
|
||||
if max_refs.contains(&chunk) {
|
||||
let seen = unsafe { str::from_utf8_unchecked(&chunk) };
|
||||
println!("{} {}", seen, offset);
|
||||
}
|
||||
}
|
||||
|
||||
let n = buffer.len();
|
||||
input = &input[n..];
|
||||
}
|
||||
}
|
||||
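A hypothetical illustration (not part of this commit) of the window matching done in the loop above: every 32-byte window of a candidate run is compared against the `maxrefs` set, so a hit can start at any offset inside the run.

#[test]
fn window_match() {
    let hit: [u8; 32] = *b"xfbmj7sl2ikicym9x3yq7cms5qx1w39k";
    let haystack = b"__xfbmj7sl2ikicym9x3yq7cms5qx1w39k__";
    // The hash starts two bytes into the candidate run.
    let found = haystack.windows(32).position(|w| w == hit.as_slice());
    assert_eq!(found, Some(2));
}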
6 contrib/refscan/testdata/.gitignore vendored Normal file
@@ -0,0 +1,6 @@
# SPDX-FileCopyrightText: edef <edef@edef.eu>
# SPDX-License-Identifier: CC0-1.0

/maxrefs
/nar
/result
8 contrib/refscan/testdata/generate.sh vendored Executable file
@@ -0,0 +1,8 @@
#! /usr/bin/env bash
# SPDX-FileCopyrightText: edef <edef@edef.eu>
# SPDX-License-Identifier: CC0-1.0
set -euo pipefail

drv=$(nix-instantiate '<nixpkgs>' -A ghc)
nix --extra-experimental-features nix-command show-derivation -r "$drv" | jq -r '.[] | .outputs[].path, .inputSrcs[]' | sort -u | cut -d/ -f4 | cut -d- -f1 > maxrefs
nix-store --dump "$(nix-build "$drv")" > nar
1779 contrib/turbofetch/Cargo.lock generated Normal file
File diff suppressed because it is too large
7145 contrib/turbofetch/Cargo.nix Normal file
File diff suppressed because it is too large
28 contrib/turbofetch/Cargo.toml Normal file
@@ -0,0 +1,28 @@
[package]
name = "turbofetch"
version = "0.1.0"
edition = "2021"

# We don't join the //snix workspace, as this is fairly cache.nixos.org-specific.
[workspace]
members = ["."]

[dependencies]
aws_lambda_events = { version = "0.11.1", default-features = false, features = ["lambda_function_urls"] }
bytes = "1.5.0"
data-encoding = "2.6.0"
futures = { version = "0.3.30", default-features = false, features = ["std"] }
httparse = "1.8.0"
hyper = { version = "0.14.27", default-features = false }
lambda_runtime = "0.8.2"
magic-buffer = "0.1.1"
rusoto_core = { version = "0.48.0", features = ["rustls"], default-features = false }
rusoto_s3 = { version = "0.48.0", features = ["rustls"], default-features = false }
serde_json = "1.0.108"
serde = { version = "1.0.190", features = ["derive"] }
tokio = { version = "1.33.0", features = ["full"] }
tower = "0.4.13"
# TODO(edef): zstd = "0.13.0"
zstd = "0.9.0"
tracing-subscriber = { version = "0.3.17", features = ["json"] }
tracing = "0.1.40"
1 contrib/turbofetch/OWNERS Normal file
@@ -0,0 +1 @@
edef
11 contrib/turbofetch/default.nix Normal file
@@ -0,0 +1,11 @@
{ pkgs, depot, ... }:

(pkgs.callPackage ./Cargo.nix {
  defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
    turbofetch = prev: {
      src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
    };
  };
}).rootCrate.build.overrideAttrs {
  meta.ci.extraSteps.crate2nix-check = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
}
5 contrib/turbofetch/deploy.sh Executable file
@@ -0,0 +1,5 @@
#! /usr/bin/env nix-shell
#! nix-shell -i "bash -e"
#! nix-shell -p cargo-lambda
cargo lambda build --release
cargo lambda deploy
83 contrib/turbofetch/src/buffer.rs Normal file
@@ -0,0 +1,83 @@
use magic_buffer::MagicBuffer;
|
||||
use std::cell::Cell;
|
||||
|
||||
/// Buffer is a FIFO queue for bytes, built on a ring buffer.
|
||||
/// It always provides contiguous slices for both the readable and writable parts,
|
||||
/// using an underlying buffer that is "mirrored" in virtual memory.
|
||||
pub struct Buffer {
|
||||
buffer: MagicBuffer,
|
||||
/// first readable byte
|
||||
head: Cell<usize>,
|
||||
/// first writable byte
|
||||
tail: usize,
|
||||
}
|
||||
|
||||
impl Buffer {
|
||||
/// Allocate a fresh buffer, with the specified capacity.
|
||||
/// The buffer can contain at most `capacity - 1` bytes.
|
||||
/// The capacity must be a power of two, and at least [Buffer::min_len].
|
||||
pub fn new(capacity: usize) -> Buffer {
|
||||
Buffer {
|
||||
// MagicBuffer::new verifies that `capacity` is a power of two,
|
||||
// and at least MagicBuffer::min_len().
|
||||
buffer: MagicBuffer::new(capacity).unwrap(),
|
||||
// `head == tail` means the buffer is empty.
|
||||
// In order to ensure that this remains unambiguous,
|
||||
// the buffer can only be filled with capacity-1 bytes.
|
||||
head: Cell::new(0),
|
||||
tail: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the minimum buffer capacity.
|
||||
/// This depends on the operating system and architecture.
|
||||
pub fn min_capacity() -> usize {
|
||||
MagicBuffer::min_len()
|
||||
}
|
||||
|
||||
/// Return the capacity of the buffer.
|
||||
/// This is equal to `self.data().len() + self.space().len() + 1`.
|
||||
pub fn capacity(&self) -> usize {
|
||||
self.buffer.len()
|
||||
}
|
||||
|
||||
/// Return the valid, readable data in the buffer.
|
||||
pub fn data(&self) -> &[u8] {
|
||||
let len = self.buffer.len();
|
||||
let head = self.head.get();
|
||||
|
||||
if head <= self.tail {
|
||||
&self.buffer[head..self.tail]
|
||||
} else {
|
||||
&self.buffer[head..self.tail + len]
|
||||
}
|
||||
}
|
||||
|
||||
/// Mark `read_len` bytes of the readable data as consumed, freeing the space.
|
||||
pub fn consume(&self, read_len: usize) {
|
||||
debug_assert!(read_len <= self.data().len());
|
||||
let mut head = self.head.get();
|
||||
head += read_len;
|
||||
head &= self.buffer.len() - 1;
|
||||
self.head.set(head);
|
||||
}
|
||||
|
||||
/// Return the empty, writable space in the buffer.
|
||||
pub fn space(&mut self) -> &mut [u8] {
|
||||
let len = self.buffer.len();
|
||||
let head = self.head.get();
|
||||
|
||||
if head <= self.tail {
|
||||
&mut self.buffer[self.tail..head + len - 1]
|
||||
} else {
|
||||
&mut self.buffer[self.tail..head - 1]
|
||||
}
|
||||
}
|
||||
|
||||
/// Mark `written_len` bytes of the writable space as valid, readable data.
|
||||
pub fn commit(&mut self, written_len: usize) {
|
||||
debug_assert!(written_len <= self.space().len());
|
||||
self.tail += written_len;
|
||||
self.tail &= self.buffer.len() - 1;
|
||||
}
|
||||
}
|
||||
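A minimal usage sketch of the Buffer API described above (hypothetical, not part of this commit): bytes are written into `space()`, published with `commit()`, observed via `data()`, and released with `consume()`.

use turbofetch::Buffer;

fn buffer_round_trip() {
    // Assumes min_capacity() is itself an acceptable (power-of-two) capacity.
    let mut buffer = Buffer::new(Buffer::min_capacity());

    let payload = b"hello";
    buffer.space()[..payload.len()].copy_from_slice(payload);
    buffer.commit(payload.len());

    assert_eq!(buffer.data(), &payload[..]);
    buffer.consume(payload.len());
    assert!(buffer.data().is_empty());
}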
103 contrib/turbofetch/src/lib.rs Normal file
@@ -0,0 +1,103 @@
use std::{mem::MaybeUninit, str};
|
||||
use tokio::io::{self, AsyncRead, AsyncReadExt};
|
||||
|
||||
pub use buffer::Buffer;
|
||||
mod buffer;
|
||||
|
||||
/// Read as much data into `buffer` as possible.
|
||||
/// Returns [io::ErrorKind::OutOfMemory] if the buffer is already full.
|
||||
async fn slurp(buffer: &mut Buffer, sock: &mut (impl AsyncRead + Unpin)) -> io::Result<()> {
|
||||
match buffer.space() {
|
||||
[] => Err(io::Error::new(io::ErrorKind::OutOfMemory, "buffer filled")),
|
||||
buf => {
|
||||
let n = sock.read(buf).await?;
|
||||
if n == 0 {
|
||||
return Err(io::ErrorKind::UnexpectedEof.into());
|
||||
}
|
||||
buffer.commit(n);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_content_length(headers: &[httparse::Header]) -> io::Result<u64> {
|
||||
for header in headers {
|
||||
if header.name == "Transfer-Encoding" {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Transfer-Encoding is unsupported",
|
||||
));
|
||||
}
|
||||
|
||||
if header.name == "Content-Length" {
|
||||
return str::from_utf8(header.value)
|
||||
.ok()
|
||||
.and_then(|v| v.parse().ok())
|
||||
.ok_or_else(|| {
|
||||
io::Error::new(io::ErrorKind::InvalidData, "invalid Content-Length")
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Content-Length missing",
|
||||
))
|
||||
}
|
||||
|
||||
/// Read an HTTP response from `sock` using `buffer`, returning the response body.
|
||||
/// Returns an error if anything but 200 OK is received.
|
||||
///
|
||||
/// The buffer must have enough space to contain the entire response body.
|
||||
/// If there is not enough space, [io::ErrorKind::OutOfMemory] is returned.
|
||||
///
|
||||
/// The HTTP response must use `Content-Length`, without `Transfer-Encoding`.
|
||||
pub async fn parse_response<'a>(
|
||||
sock: &mut (impl AsyncRead + Unpin),
|
||||
buffer: &'a mut Buffer,
|
||||
) -> io::Result<&'a [u8]> {
|
||||
let body_len = loop {
|
||||
let mut headers = [MaybeUninit::uninit(); 16];
|
||||
let mut response = httparse::Response::new(&mut []);
|
||||
let status = httparse::ParserConfig::default()
|
||||
.parse_response_with_uninit_headers(&mut response, buffer.data(), &mut headers)
|
||||
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
|
||||
|
||||
if let httparse::Status::Complete(n) = status {
|
||||
buffer.consume(n);
|
||||
|
||||
let code = response.code.unwrap();
|
||||
if code != 200 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("HTTP response {code}"),
|
||||
));
|
||||
}
|
||||
|
||||
break get_content_length(response.headers)?;
|
||||
}
|
||||
|
||||
slurp(buffer, sock).await?;
|
||||
};
|
||||
|
||||
let buf_len = buffer.space().len() + buffer.data().len();
|
||||
|
||||
if body_len > buf_len as u64 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::OutOfMemory,
|
||||
"HTTP response body does not fit in buffer",
|
||||
));
|
||||
}
|
||||
|
||||
let body_len = body_len as usize;
|
||||
|
||||
while buffer.data().len() < body_len {
|
||||
slurp(buffer, sock).await?;
|
||||
}
|
||||
|
||||
let data = buffer.data();
|
||||
buffer.consume(body_len);
|
||||
|
||||
Ok(&data[..body_len])
|
||||
}
|
||||
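As a hypothetical illustration of `parse_response` (not part of this commit): tokio implements `AsyncRead` for `&[u8]`, so the parser can be exercised against a canned response entirely in memory.

use turbofetch::{parse_response, Buffer};

#[tokio::main(flavor = "current_thread")]
async fn main() -> std::io::Result<()> {
    let raw: &[u8] = b"HTTP/1.1 200 OK\r\nContent-Length: 5\r\n\r\nhello";
    let mut sock = raw; // &[u8] implements AsyncRead + Unpin

    // Assumes min_capacity() is itself an acceptable capacity.
    let mut buffer = Buffer::new(Buffer::min_capacity());
    let body = parse_response(&mut sock, &mut buffer).await?;
    assert_eq!(body, &b"hello"[..]);
    Ok(())
}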
220 contrib/turbofetch/src/main.rs Normal file
@@ -0,0 +1,220 @@
//! turbofetch is a high-performance bulk S3 object aggregator.
|
||||
//!
|
||||
//! It operates on two S3 buckets: a source bucket (nix-cache), and a
|
||||
//! work bucket defined at runtime. The work bucket contains a job file
|
||||
//! consisting of concatenated 32-character keys, representing narinfo
|
||||
//! files in the source bucket, without the `.narinfo` suffix or any
|
||||
//! other separators.
|
||||
//!
|
||||
//! Each run of turbofetch processes a half-open range of indices from the
|
||||
//! job file, and outputs a zstd stream of concatenated objects, without
|
||||
//! additional separators and in no particular order. These segment files
|
||||
//! are written into the work bucket, named for the range of indices they
|
||||
//! cover. `/narinfo.zst/000000000c380d40-000000000c385b60` covers the 20k
|
||||
//! objects `[0xc380d40, 0xc385b60) = [205000000, 205020000)`. Empirically,
|
||||
//! segment files of 20k objects achieve a compression ratio of 4.7x.
|
||||
//!
|
||||
//! Reassembly is left to narinfo2parquet, which interprets StorePath lines.
|
||||
//!
|
||||
//! TODO(edef): any retries/error handling whatsoever
|
||||
//! Currently, it fails an entire range if anything goes wrong, and doesn't
|
||||
//! write any output.
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures::{stream::FuturesUnordered, Stream, TryStreamExt};
|
||||
use rusoto_core::ByteStream;
|
||||
use rusoto_s3::{GetObjectRequest, PutObjectRequest, S3Client, S3};
|
||||
use serde::Deserialize;
|
||||
use std::{io::Write, mem, ops::Range, ptr};
|
||||
use tokio::{
|
||||
io::{self, AsyncReadExt, AsyncWriteExt},
|
||||
net::TcpStream,
|
||||
};
|
||||
|
||||
/// Fetch a group of keys, streaming concatenated chunks as they arrive from S3.
|
||||
/// `keys` must be a slice from the job file. Any network error at all fails the
|
||||
/// entire batch, and there is no rate limiting.
|
||||
fn fetch(keys: &[[u8; 32]]) -> impl Stream<Item = io::Result<Bytes>> {
|
||||
// S3 supports only HTTP/1.1, but we can ease the pain somewhat by using
|
||||
// HTTP pipelining. It terminates the TCP connection after receiving 100
|
||||
// requests, so we chunk the keys up accordingly, and make one connection
|
||||
// for each chunk.
|
||||
keys.chunks(100)
|
||||
.map(|chunk| {
|
||||
const PREFIX: &[u8] = b"GET /nix-cache/";
|
||||
const SUFFIX: &[u8] = b".narinfo HTTP/1.1\nHost: s3.amazonaws.com\n\n";
|
||||
const LENGTH: usize = PREFIX.len() + 32 + SUFFIX.len();
|
||||
|
||||
let mut request = Vec::with_capacity(LENGTH * 100);
|
||||
for key in chunk {
|
||||
request.extend_from_slice(PREFIX);
|
||||
request.extend_from_slice(key);
|
||||
request.extend_from_slice(SUFFIX);
|
||||
}
|
||||
|
||||
(request, chunk.len())
|
||||
})
|
||||
.map(|(request, n)| async move {
|
||||
let (mut read, mut write) = TcpStream::connect("s3.amazonaws.com:80")
|
||||
.await?
|
||||
.into_split();
|
||||
|
||||
let _handle = tokio::spawn(async move {
|
||||
let request = request;
|
||||
write.write_all(&request).await
|
||||
});
|
||||
|
||||
let mut buffer = turbofetch::Buffer::new(512 * 1024);
|
||||
let mut bodies = vec![];
|
||||
|
||||
for _ in 0..n {
|
||||
let body = turbofetch::parse_response(&mut read, &mut buffer).await?;
|
||||
bodies.extend_from_slice(body);
|
||||
}
|
||||
|
||||
Ok::<_, io::Error>(Bytes::from(bodies))
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>()
|
||||
}
|
||||
|
||||
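// Hypothetical shape check (not part of this commit) of the pipelined request
// built in fetch() above: each key contributes PREFIX + 32 bytes + SUFFIX.
#[cfg(test)]
mod request_shape {
    #[test]
    fn one_key() {
        let key = [b'x'; 32];
        let mut request = Vec::new();
        request.extend_from_slice(b"GET /nix-cache/");
        request.extend_from_slice(&key);
        request.extend_from_slice(b".narinfo HTTP/1.1\nHost: s3.amazonaws.com\n\n");
        // 15 + 32 + 42 bytes, matching LENGTH in fetch().
        assert_eq!(request.len(), 89);
    }
}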
/// Retrieve a range of keys from the job file.
|
||||
async fn get_range(
|
||||
s3: &'static S3Client,
|
||||
bucket: String,
|
||||
key: String,
|
||||
range: Range<u64>,
|
||||
) -> io::Result<Box<[[u8; 32]]>> {
|
||||
let resp = s3
|
||||
.get_object(GetObjectRequest {
|
||||
bucket,
|
||||
key,
|
||||
range: Some(format!("bytes={}-{}", range.start * 32, range.end * 32 - 1)),
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
|
||||
|
||||
let mut body = vec![];
|
||||
resp.body
|
||||
.ok_or(io::ErrorKind::InvalidData)?
|
||||
.into_async_read()
|
||||
.read_to_end(&mut body)
|
||||
.await?;
|
||||
|
||||
let body = exact_chunks(body.into_boxed_slice()).ok_or(io::ErrorKind::InvalidData)?;
|
||||
|
||||
Ok(body)
|
||||
}
|
||||
|
||||
fn exact_chunks(mut buf: Box<[u8]>) -> Option<Box<[[u8; 32]]>> {
|
||||
// SAFETY: We ensure that `buf.len()` is a multiple of 32, and there are no alignment requirements.
|
||||
unsafe {
|
||||
let ptr = buf.as_mut_ptr();
|
||||
let len = buf.len();
|
||||
|
||||
if len % 32 != 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let ptr = ptr as *mut [u8; 32];
|
||||
let len = len / 32;
|
||||
mem::forget(buf);
|
||||
|
||||
Some(Box::from_raw(ptr::slice_from_raw_parts_mut(ptr, len)))
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(edef): factor this out into a separate entry point
|
||||
#[tokio::main(flavor = "current_thread")]
|
||||
async fn main() -> Result<(), lambda_runtime::Error> {
|
||||
let s3 = S3Client::new(rusoto_core::Region::UsEast1);
|
||||
let s3 = &*Box::leak(Box::new(s3));
|
||||
|
||||
tracing_subscriber::fmt()
|
||||
.json()
|
||||
.with_max_level(tracing::Level::INFO)
|
||||
// this needs to be set to remove duplicated information in the log.
|
||||
.with_current_span(false)
|
||||
// this needs to be set to false, otherwise ANSI color codes will
|
||||
// show up in a confusing manner in CloudWatch logs.
|
||||
.with_ansi(false)
|
||||
// disabling time is handy because CloudWatch will add the ingestion time.
|
||||
.without_time()
|
||||
// remove the name of the function from every log entry
|
||||
.with_target(false)
|
||||
.init();
|
||||
|
||||
lambda_runtime::run(lambda_runtime::service_fn(|event| func(s3, event))).await
|
||||
}
|
||||
|
||||
/// Lambda request body
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct Params {
|
||||
work_bucket: String,
|
||||
job_file: String,
|
||||
start: u64,
|
||||
end: u64,
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip(s3, event), fields(req_id = %event.context.request_id))]
|
||||
async fn func(
|
||||
s3: &'static S3Client,
|
||||
event: lambda_runtime::LambdaEvent<
|
||||
aws_lambda_events::lambda_function_urls::LambdaFunctionUrlRequest,
|
||||
>,
|
||||
) -> Result<&'static str, lambda_runtime::Error> {
|
||||
let mut params = event.payload.body.ok_or("no body")?;
|
||||
|
||||
if event.payload.is_base64_encoded {
|
||||
params = String::from_utf8(data_encoding::BASE64.decode(params.as_bytes())?)?;
|
||||
}
|
||||
|
||||
let params: Params = serde_json::from_str(¶ms)?;
|
||||
|
||||
if params.start >= params.end {
|
||||
return Err("nope".into());
|
||||
}
|
||||
|
||||
let keys = get_range(
|
||||
s3,
|
||||
params.work_bucket.clone(),
|
||||
params.job_file.to_owned(),
|
||||
params.start..params.end,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let zchunks = fetch(&keys)
|
||||
.try_fold(
|
||||
Box::new(zstd::Encoder::new(vec![], zstd::DEFAULT_COMPRESSION_LEVEL).unwrap()),
|
||||
|mut w, buf| {
|
||||
w.write_all(&buf).unwrap();
|
||||
async { Ok(w) }
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let zchunks = to_byte_stream(zchunks.finish().unwrap());
|
||||
|
||||
tracing::info!("we got to put_object");
|
||||
|
||||
s3.put_object(PutObjectRequest {
|
||||
bucket: params.work_bucket,
|
||||
key: format!("narinfo.zst/{:016x}-{:016x}", params.start, params.end),
|
||||
body: Some(zchunks),
|
||||
..Default::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
|
||||
|
||||
tracing::info!("… and it worked!");
|
||||
|
||||
Ok("OK")
|
||||
}
|
||||
|
||||
fn to_byte_stream(buffer: Vec<u8>) -> ByteStream {
|
||||
let size_hint = buffer.len();
|
||||
ByteStream::new_with_size(
|
||||
futures::stream::once(async { Ok(buffer.into()) }),
|
||||
size_hint,
|
||||
)
|
||||
}
|
||||
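The segment naming described in the module doc can be checked with a small, hypothetical helper (not part of this commit) that mirrors the `format!` call in `func` above:

fn segment_key(start: u64, end: u64) -> String {
    format!("narinfo.zst/{:016x}-{:016x}", start, end)
}

#[test]
fn example_segment_key() {
    // [0xc380d40, 0xc385b60) = [205_000_000, 205_020_000): 20k objects.
    assert_eq!(
        segment_key(205_000_000, 205_020_000),
        "narinfo.zst/000000000c380d40-000000000c385b60"
    );
}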
2511 contrib/weave/Cargo.lock generated Normal file
File diff suppressed because it is too large
9641 contrib/weave/Cargo.nix Normal file
File diff suppressed because it is too large
23 contrib/weave/Cargo.toml Normal file
@@ -0,0 +1,23 @@
[package]
name = "weave"
version = "0.1.0"
edition = "2021"

[workspace]
members = ["."]

# TODO(edef): cut down on required features, this is kind of a grab bag right now
[dependencies]
anyhow = { version = "1.0.79", features = ["backtrace"] }
hashbrown = "0.14.3"
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
safer_owning_ref = "0.5.0"
rayon = "1.8.1"
rustc-hash = "2.0.0"
snix-tracing = { version = "0.1.0", path = "../../snix/tracing" }
tracing = "0.1.40"
tracing-indicatif = "0.3.6"

[dependencies.polars]
version = "0.36.2"
features = ["parquet", "lazy", "streaming"]
1 contrib/weave/OWNERS Normal file
@@ -0,0 +1 @@
edef
11 contrib/weave/default.nix Normal file
@@ -0,0 +1,11 @@
{ pkgs, depot, ... }:

(pkgs.callPackage ./Cargo.nix {
  defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
    weave = prev: {
      src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
    };
  };
}).rootCrate.build.overrideAttrs {
  meta.ci.extraSteps.crate2nix-check = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
}
118 contrib/weave/src/bin/swizzle.rs Normal file
@@ -0,0 +1,118 @@
//! Swizzle reads a `narinfo.parquet` file, usually produced by `narinfo2parquet`.
|
||||
//!
|
||||
//! It swizzles the reference list, ie it converts the references from absolute,
|
||||
//! global identifiers (store path hashes) to indices into the `store_path_hash`
|
||||
//! column (ie, row numbers), so that we can later walk the reference graph
|
||||
//! efficiently.
|
||||
//!
|
||||
//! Path hashes are represented as non-null, 20-byte `Binary` values.
|
||||
//! The indices are represented as 32-bit unsigned integers, with in-band nulls
|
||||
//! represented by [INDEX_NULL] (the all-1 bit pattern), to permit swizzling
|
||||
//! partial datasets.
|
||||
//!
|
||||
//! In essence, it converts from names to pointers, so that `weave` can simply
|
||||
//! chase pointers to trace the live set. This replaces an `O(log(n))` lookup
|
||||
//! with `O(1)` indexing, and produces a much denser representation that actually
|
||||
//! fits in memory.
|
||||
//!
|
||||
//! The in-memory representation is at least 80% smaller, and the indices compress
|
||||
//! well in Parquet due to both temporal locality of reference and the power law
|
||||
//! distribution of reference "popularity".
|
||||
//!
|
||||
//! Only two columns are read from `narinfo.parquet`:
|
||||
//!
|
||||
//! * `store_path_hash :: PathHash`
|
||||
//! * `references :: List[PathHash]`
|
||||
//!
|
||||
//! Output is written to `narinfo-references.parquet` in the form of a single
|
||||
//! `List[u32]` column, `reference_idxs`.
|
||||
//!
|
||||
//! This file is inherently bound to the corresponding `narinfo.parquet`,
|
||||
//! since it essentially contains pointers into this file.
|
||||
|
||||
use anyhow::Result;
|
||||
use hashbrown::HashTable;
|
||||
use polars::{
|
||||
lazy::dsl::{col, SpecialEq},
|
||||
prelude::*,
|
||||
};
|
||||
use tracing::info_span;
|
||||
use tracing_indicatif::span_ext::IndicatifSpanExt as _;
|
||||
|
||||
use weave::{as_fixed_binary, hash64, leak, load_ph_array, INDEX_NULL};
|
||||
|
||||
#[tracing::instrument]
|
||||
fn main() -> Result<()> {
|
||||
let _tracing = snix_tracing::TracingBuilder::default()
|
||||
.enable_progressbar()
|
||||
.build()?;
|
||||
|
||||
let ph_array: &'static [[u8; 20]] = leak(load_ph_array()?);
|
||||
|
||||
// TODO(edef): re-parallelise this
|
||||
// We originally parallelised on chunks, but ph_array is only a single chunk, due to how Parquet loading works.
|
||||
// TODO(edef): outline the 64-bit hash prefix? it's an indirection, but it saves ~2G of memory
|
||||
let ph_map: &'static HashTable<(u64, u32)> = {
|
||||
let span = info_span!("ph_map", indicatif.pb_show = tracing::field::Empty).entered();
|
||||
span.pb_set_message("build index");
|
||||
span.pb_start();
|
||||
|
||||
let mut ph_map = HashTable::with_capacity(ph_array.len());
|
||||
|
||||
for (offset, item) in ph_array.iter().enumerate() {
|
||||
let offset = offset as u32;
|
||||
let hash = hash64(item);
|
||||
ph_map.insert_unique(hash, (hash, offset), |&(hash, _)| hash);
|
||||
}
|
||||
|
||||
&*Box::leak(Box::new(ph_map))
|
||||
};
|
||||
|
||||
let ph_to_idx = |key: &[u8; 20]| -> u32 {
|
||||
let hash = hash64(key);
|
||||
ph_map
|
||||
.find(hash, |&(candidate_hash, candidate_index)| {
|
||||
candidate_hash == hash && &ph_array[candidate_index as usize] == key
|
||||
})
|
||||
.map(|&(_, index)| index)
|
||||
.unwrap_or(INDEX_NULL)
|
||||
};
|
||||
|
||||
{
|
||||
let span = info_span!("swizzle_refs", indicatif.pb_show = tracing::field::Empty).entered();
|
||||
span.pb_set_message("swizzle references");
|
||||
span.pb_start();
|
||||
|
||||
LazyFrame::scan_parquet("narinfo.parquet", ScanArgsParquet::default())?
|
||||
.with_column(
|
||||
col("references")
|
||||
.map(
|
||||
move |series: Series| -> PolarsResult<Option<Series>> {
|
||||
Ok(Some(
|
||||
series
|
||||
.list()?
|
||||
.apply_to_inner(&|series: Series| -> PolarsResult<Series> {
|
||||
let series = series.binary()?;
|
||||
let mut out: Vec<u32> = Vec::with_capacity(series.len());
|
||||
out.extend(
|
||||
as_fixed_binary(series).flatten().map(ph_to_idx),
|
||||
);
|
||||
Ok(Series::from_vec("reference_idxs", out))
|
||||
})?
|
||||
.into_series(),
|
||||
))
|
||||
},
|
||||
SpecialEq::from_type(DataType::List(DataType::UInt32.into())),
|
||||
)
|
||||
.alias("reference_idxs"),
|
||||
)
|
||||
.select([col("reference_idxs")])
|
||||
.with_streaming(true)
|
||||
.sink_parquet(
|
||||
"narinfo-references.parquet".into(),
|
||||
ParquetWriteOptions::default(),
|
||||
)?;
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
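Conceptually, the swizzle above does the following, shown here as a hypothetical sketch over plain Vecs (no Parquet, no hashbrown, no parallelism; not part of this commit). INDEX_NULL is the same all-ones in-band null described in the module doc.

use std::collections::HashMap;

const INDEX_NULL: u32 = !0;

/// Map each reference (a 20-byte store path hash) to its row number in
/// `store_path_hashes`, or INDEX_NULL if it is not part of the dataset.
fn swizzle(store_path_hashes: &[[u8; 20]], references: &[Vec<[u8; 20]>]) -> Vec<Vec<u32>> {
    let index: HashMap<&[u8; 20], u32> = store_path_hashes
        .iter()
        .enumerate()
        .map(|(row, hash)| (hash, row as u32))
        .collect();

    references
        .iter()
        .map(|refs| {
            refs.iter()
                .map(|r| index.get(r).copied().unwrap_or(INDEX_NULL))
                .collect()
        })
        .collect()
}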
133 contrib/weave/src/lib.rs Normal file
@@ -0,0 +1,133 @@
use anyhow::Result;
|
||||
use owning_ref::{ArcRef, OwningRef};
|
||||
use rayon::prelude::*;
|
||||
use std::{
|
||||
fs::File,
|
||||
mem,
|
||||
ops::{Deref, Range},
|
||||
slice,
|
||||
sync::Arc,
|
||||
};
|
||||
use tracing_indicatif::span_ext::IndicatifSpanExt as _;
|
||||
|
||||
use polars::{
|
||||
datatypes::BinaryChunked,
|
||||
export::arrow::array::BinaryArray,
|
||||
prelude::{ParquetReader, SerReader},
|
||||
};
|
||||
|
||||
/// A shared `[[u8; N]]` backed by a Polars [Buffer].
|
||||
pub type FixedBytes<const N: usize> =
|
||||
ArcRef<'static, polars::export::arrow::buffer::Bytes<u8>, [[u8; N]]>;
|
||||
|
||||
pub const INDEX_NULL: u32 = !0;
|
||||
|
||||
/// A terrific hash function, turning 20 bytes of cryptographic hash
|
||||
/// into 8 bytes of cryptographic hash.
|
||||
pub fn hash64(h: &[u8; 20]) -> u64 {
|
||||
let mut buf = [0; 8];
|
||||
buf.copy_from_slice(&h[..8]);
|
||||
u64::from_ne_bytes(buf)
|
||||
}
|
||||
|
||||
pub fn leak<O, T: ?Sized>(r: OwningRef<Arc<O>, T>) -> &T {
|
||||
// SAFETY: Either `ptr` points into the `Arc`, which lives until `r` is dropped,
|
||||
// or it points at something else entirely which lives at least as long.
|
||||
unsafe {
|
||||
let ptr: *const T = r.deref();
|
||||
mem::forget(r);
|
||||
&*ptr
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a dense `store_path_hash` array from `narinfo.parquet`,
|
||||
/// returning it as an owned [FixedBytes].
|
||||
#[tracing::instrument(fields(indicatif.pb_show = tracing::field::Empty))]
|
||||
pub fn load_ph_array() -> Result<FixedBytes<20>> {
|
||||
let span = tracing::Span::current();
|
||||
|
||||
span.pb_set_message("load store_path_hash");
|
||||
span.pb_start();
|
||||
|
||||
// TODO(edef): this could use a further pushdown, since polars is more hindrance than help here
|
||||
// We know this has to fit in memory (we can't mmap it without further encoding constraints),
|
||||
// and we want a single `Vec<[u8; 20]>` of the data.
|
||||
let ph_array = into_fixed_binary_rechunk::<20>(
|
||||
ParquetReader::new(File::open("narinfo.parquet").unwrap())
|
||||
.with_columns(Some(vec!["store_path_hash".into()]))
|
||||
.set_rechunk(true)
|
||||
.finish()?
|
||||
.column("store_path_hash")?
|
||||
.binary()?,
|
||||
);
|
||||
|
||||
u32::try_from(ph_array.len()).expect("dataset exceeds 2^32");
|
||||
|
||||
Ok(ph_array)
|
||||
}
|
||||
|
||||
/// Iterator over `&[[u8; N]]` from a dense [BinaryChunked].
|
||||
pub fn as_fixed_binary<const N: usize>(
|
||||
chunked: &BinaryChunked,
|
||||
) -> impl DoubleEndedIterator<Item = &[[u8; N]]> {
|
||||
chunked.downcast_iter().map(|array| {
|
||||
let range = assert_fixed_dense::<N>(array);
|
||||
exact_chunks(&array.values()[range]).unwrap()
|
||||
})
|
||||
}
|
||||
|
||||
/// Convert a dense [BinaryChunked] into a single chunk as [FixedBytes],
|
||||
/// without taking a reference to the offsets array and validity bitmap.
|
||||
fn into_fixed_binary_rechunk<const N: usize>(chunked: &BinaryChunked) -> FixedBytes<N> {
|
||||
let chunked = chunked.rechunk();
|
||||
let mut iter = chunked.downcast_iter();
|
||||
let array = iter.next().unwrap();
|
||||
assert!(iter.next().is_none());
|
||||
|
||||
let (buf, off, len) = {
|
||||
let range = assert_fixed_dense::<N>(array);
|
||||
array.values().clone().sliced(range.start, range.len())
|
||||
}
|
||||
.into_inner();
|
||||
|
||||
ArcRef::new(buf).map(|bytes| exact_chunks(&bytes[off..off + len]).unwrap())
|
||||
}
|
||||
|
||||
/// Ensures that the supplied Arrow array consists of densely packed bytestrings of length `N`.
|
||||
/// In other words, ensure that it is free of nulls, and that the offsets have a fixed stride of `N`.
|
||||
#[must_use = "only the range returned is guaranteed to be conformant"]
|
||||
fn assert_fixed_dense<const N: usize>(array: &BinaryArray<i64>) -> Range<usize> {
|
||||
let null_count = array.validity().map_or(0, |bits| bits.unset_bits());
|
||||
if null_count > 0 {
|
||||
panic!("null values present");
|
||||
}
|
||||
|
||||
let offsets = array.offsets();
|
||||
let length_check = offsets
|
||||
.as_slice()
|
||||
.par_windows(2)
|
||||
.all(|w| (w[1] - w[0]) == N as i64);
|
||||
|
||||
if !length_check {
|
||||
panic!("lengths are inconsistent");
|
||||
}
|
||||
|
||||
(*offsets.first() as usize)..(*offsets.last() as usize)
|
||||
}
|
||||
|
||||
fn exact_chunks<const K: usize>(buf: &[u8]) -> Option<&[[u8; K]]> {
|
||||
// SAFETY: We ensure that `buf.len()` is a multiple of K, and there are no alignment requirements.
|
||||
unsafe {
|
||||
let ptr = buf.as_ptr();
|
||||
let len = buf.len();
|
||||
|
||||
if len % K != 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let ptr = ptr as *mut [u8; K];
|
||||
let len = len / K;
|
||||
|
||||
Some(slice::from_raw_parts(ptr, len))
|
||||
}
|
||||
}
|
||||
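A hypothetical test (not part of this commit) illustrating the two helpers above: `exact_chunks` reinterprets a flat byte buffer as fixed-width records, and `hash64` keys a record by its first 8 bytes.

#[cfg(test)]
mod sketch {
    use super::{exact_chunks, hash64};

    #[test]
    fn chunks_and_hash() {
        let buf = [1u8; 40];
        let records: &[[u8; 20]] = exact_chunks(&buf).unwrap();
        assert_eq!(records.len(), 2);
        assert_eq!(hash64(&records[0]), u64::from_ne_bytes([1u8; 8]));
        // 39 bytes is not a whole number of 20-byte records.
        assert!(exact_chunks::<20>(&buf[..39]).is_none());
    }
}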
262 contrib/weave/src/main.rs Normal file
@@ -0,0 +1,262 @@
//! Weave resolves a list of roots from `releases.parquet` against `narinfo.parquet`,
|
||||
//! and then uses the reference graph from the accompanying `narinfo-references.parquet`
|
||||
//! produced by `swizzle` to collect the closure of the roots.
|
||||
//!
|
||||
//! They are written to `live_idxs.parquet`, which only has one column, representing
|
||||
//! the row numbers in `narinfo.parquet` corresponding to live paths.
|
||||
|
||||
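// Hypothetical simplification (not part of this commit) of the mark phase
// implemented below: a plain worklist traversal over the swizzled index graph,
// assuming the reference lists fit in a Vec<Vec<u32>> and the roots resolved.
fn mark_sketch(roots: &[u32], reference_idxs: &[Vec<u32>]) -> std::collections::HashSet<u32> {
    let mut seen: std::collections::HashSet<u32> = roots.iter().copied().collect();
    let mut todo: Vec<u32> = roots.to_vec();
    while let Some(parent) = todo.pop() {
        for &child in &reference_idxs[parent as usize] {
            if child != weave::INDEX_NULL && seen.insert(child) {
                todo.push(child);
            }
        }
    }
    seen
}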
use anyhow::Result;
|
||||
use hashbrown::{hash_table, HashTable};
|
||||
use rayon::prelude::*;
|
||||
use rustc_hash::FxHashSet;
|
||||
use std::{
|
||||
collections::BTreeMap,
|
||||
fs::File,
|
||||
ops::Index,
|
||||
sync::atomic::{AtomicU32, Ordering},
|
||||
};
|
||||
use tracing::{info_span, warn};
|
||||
use tracing_indicatif::span_ext::IndicatifSpanExt;
|
||||
|
||||
use polars::{
|
||||
datatypes::StaticArray,
|
||||
export::arrow::{array::UInt32Array, offset::OffsetsBuffer},
|
||||
lazy::dsl::col,
|
||||
prelude::*,
|
||||
};
|
||||
|
||||
use weave::{as_fixed_binary, hash64, INDEX_NULL};
|
||||
|
||||

#[tracing::instrument]
fn main() -> Result<()> {
    let _tracing = snix_tracing::TracingBuilder::default()
        .enable_progressbar()
        .build()?;

    // Parse the set of root store path hashes out of releases.parquet.
    let roots: PathSet32 = {
        let span = info_span!("parse_roots", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("parse roots");
        span.pb_start();

        as_fixed_binary::<20>(
            LazyFrame::scan_parquet("releases.parquet", ScanArgsParquet::default())?
                .explode([col("store_path_hash")])
                .select([col("store_path_hash")])
                .collect()?
                .column("store_path_hash")?
                .binary()?,
        )
        .flatten()
        .collect()
    };

    // Resolve each root hash to its row index in narinfo.parquet, recording it in the root's slot.
    {
        let span = info_span!("resolve_roots", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("resolve roots");
        span.pb_start();

        weave::load_ph_array()?
            .into_par_iter()
            .enumerate()
            .for_each(|(idx, h)| {
                if let Some(idx_slot) = roots.find(h) {
                    assert_eq!(
                        idx_slot.swap(idx as u32, Ordering::Relaxed),
                        INDEX_NULL,
                        "duplicate entry"
                    );
                }
            });
    }

    // Seed the worklist with all resolved roots; unresolved ones are skipped with a warning.
    let mut todo = FxHashSet::default();
    todo.reserve(roots.len());
    {
        let mut unknown_roots = 0usize;
        for (_, idx) in roots.table {
            let idx = idx.into_inner();
            if idx == INDEX_NULL {
                unknown_roots += 1;
                continue;
            }
            todo.insert(idx);
        }

        if unknown_roots != 0 {
            warn!("skipping {unknown_roots} unknown roots");
        }
    }

    // The ListChunked must outlive the ChunkedList built from it, which only borrows its
    // offset buffers and value slices; hence the extra outer binding.
    let ri_array;
    let ri_array = {
        let span = info_span!(
            "load_reference_idxs",
            indicatif.pb_show = tracing::field::Empty
        )
        .entered();
        span.pb_set_message("load reference_idxs");
        span.pb_start();

        ri_array = ParquetReader::new(File::open("narinfo-references.parquet")?)
            .finish()?
            .column("reference_idxs")?
            .list()?
            .clone();

        ChunkedList::new(ri_array.downcast_iter().map(|chunk| {
            (
                chunk.offsets(),
                chunk
                    .values()
                    .as_any()
                    .downcast_ref::<UInt32Array>()
                    .unwrap()
                    .as_slice()
                    .unwrap(),
            )
        }))
    };

    // Mark phase: breadth-first traversal of the reference graph, expanding the frontier in parallel.
    let mut seen = todo.clone();
    {
        let span = info_span!("mark", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("marking");
        span.pb_set_style(&snix_tracing::PB_PROGRESS_STYLE);

        while !todo.is_empty() {
            span.pb_set_length(seen.len() as u64);
            span.pb_set_position(seen.len().saturating_sub(todo.len()) as u64);

            // Collect the not-yet-seen children of the current frontier.
            todo = todo
                .par_iter()
                .flat_map(|&parent| {
                    if parent == INDEX_NULL {
                        return FxHashSet::default();
                    }

                    ri_array[parent as usize]
                        .iter()
                        .cloned()
                        .filter(|child| !seen.contains(child))
                        .collect::<FxHashSet<u32>>()
                })
                .collect();

            for &index in &todo {
                seen.insert(index);
            }
        }

        span.pb_set_length(seen.len() as u64);
        span.pb_set_position(seen.len() as u64);

        if seen.remove(&INDEX_NULL) {
            warn!("WARNING: missing edges");
        }
    }

    let seen = {
        let span = info_span!("gather_live", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("gathering live set");

        let mut seen: Vec<u32> = seen.into_iter().collect();
        seen.par_sort();
        seen
    };

    {
        let span = info_span!("write_output", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("writing output");
        span.pb_start();

        ParquetWriter::new(File::create("live_idxs.parquet")?).finish(&mut df! {
            "live_idx" => seen,
        }?)?;
    }

    Ok(())
}

/// A hash set of 20-byte store path hashes, where each entry carries an atomically
/// updatable row index (initially `INDEX_NULL`).
struct PathSet32 {
    table: HashTable<([u8; 20], AtomicU32)>,
}

impl PathSet32 {
    fn with_capacity(capacity: usize) -> Self {
        Self {
            table: HashTable::with_capacity(capacity),
        }
    }

    /// Insert a hash, returning `false` if it was already present.
    fn insert(&mut self, value: &[u8; 20]) -> bool {
        let hash = hash64(value);

        match self
            .table
            .entry(hash, |(x, _)| x == value, |(x, _)| hash64(x))
        {
            hash_table::Entry::Occupied(_) => false,
            hash_table::Entry::Vacant(entry) => {
                entry.insert((*value, AtomicU32::new(INDEX_NULL)));
                true
            }
        }
    }

    /// Look up a hash, returning its index slot if present.
    fn find(&self, value: &[u8; 20]) -> Option<&AtomicU32> {
        let hash = hash64(value);
        self.table
            .find(hash, |(x, _)| x == value)
            .as_ref()
            .map(|(_, x)| x)
    }

    fn len(&self) -> usize {
        self.table.len()
    }
}

impl<'a> FromIterator<&'a [u8; 20]> for PathSet32 {
    fn from_iter<T: IntoIterator<Item = &'a [u8; 20]>>(iter: T) -> Self {
        let iter = iter.into_iter();
        let mut this = Self::with_capacity(iter.size_hint().0);

        for item in iter {
            this.insert(item);
        }

        this.table.shrink_to_fit(|(x, _)| hash64(x));
        this
    }
}

/// A flat, read-only view over the per-chunk list offsets and values of a `ListChunked`,
/// indexable by global row number.
struct ChunkedList<'a, T> {
    by_offset: BTreeMap<usize, (&'a OffsetsBuffer<i64>, &'a [T])>,
}

impl<'a, T> ChunkedList<'a, T> {
    fn new(chunks: impl IntoIterator<Item = (&'a OffsetsBuffer<i64>, &'a [T])>) -> Self {
        let mut next_offset = 0usize;
        ChunkedList {
            by_offset: chunks
                .into_iter()
                .map(|(offsets, values)| {
                    let offset = next_offset;
                    next_offset = next_offset.checked_add(offsets.len_proxy()).unwrap();

                    (offset, (offsets, values))
                })
                .collect(),
        }
    }
}

impl<'a, T> Index<usize> for ChunkedList<'a, T> {
    type Output = [T];

    fn index(&self, index: usize) -> &Self::Output {
        // Find the chunk whose starting row is the greatest one not exceeding `index`,
        // then slice its values using the chunk-local offsets.
        let (&base, &(offsets, values)) = self.by_offset.range(..=index).next_back().unwrap();
        let (start, end) = offsets.start_end(index - base);
        &values[start..end]
    }
}
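
// Illustrative sketch only (not part of this change): a minimal test exercising the
// PathSet32 insert/find contract relied on by main() above. The concrete key bytes and
// the swapped-in index value are made up for the example.
#[cfg(test)]
mod path_set32_sketch {
    use super::*;

    #[test]
    fn insert_then_find_resolves_slot() {
        let mut set = PathSet32::with_capacity(1);
        let key = [0x42u8; 20];

        // First insertion succeeds, duplicates are rejected.
        assert!(set.insert(&key));
        assert!(!set.insert(&key));

        // A freshly inserted entry is unresolved (INDEX_NULL) until a row index is swapped
        // in, mirroring what the resolve_roots phase does.
        let slot = set.find(&key).expect("key should be present");
        assert_eq!(slot.swap(7, Ordering::Relaxed), INDEX_NULL);

        // Unknown keys are simply absent.
        assert!(set.find(&[0u8; 20]).is_none());
    }
}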