chore(users/edef): move to contrib

Change-Id: I1a6972fab8ada26917f29607fc401e376d634070
Florian Klink 2025-03-17 12:41:31 +00:00
parent a7916624dc
commit 403d8fc897
55 changed files with 15 additions and 17 deletions

1
contrib/.gitignore vendored Normal file

@@ -0,0 +1 @@
target

3
contrib/OWNERS Normal file

@@ -0,0 +1,3 @@
set noparent
edef

1
contrib/crunch-v2/.gitignore vendored Normal file

@@ -0,0 +1 @@
*.parquet

3193
contrib/crunch-v2/Cargo.lock generated Normal file

File diff suppressed because it is too large

11745
contrib/crunch-v2/Cargo.nix Normal file

File diff suppressed because it is too large

39
contrib/crunch-v2/Cargo.toml Normal file

@@ -0,0 +1,39 @@
[package]
name = "crunch-v2"
version = "0.1.0"
edition = "2021"
[workspace]
members = ["."]
[dependencies]
anyhow = { version = "1.0.75", features = ["backtrace"] }
lazy_static = "1.4.0"
bstr = "1.8.0"
bytes = "1.6.1"
futures = "0.3.29"
tokio = { version = "1.37.0", features = ["full"] }
rusoto_core = { version = "0.48.0", default-features = false, features = ["hyper-rustls"] }
rusoto_s3 = { version = "0.48.0", default-features = false, features = ["rustls"] }
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
sled = "0.34.7"
fastcdc = "3.1.0"
blake3 = "1.5.0"
sha2 = { version = "0.10.8", features = ["asm"] }
digest = "0.10.7"
bzip2 = "0.4.4"
xz2 = "0.1.7"
zstd = "0.13.0"
prost = "0.12.2"
polars = { version = "0.35.4", default-features = false, features = ["parquet", "lazy", "sql", "dtype-struct"] }
indicatif = "0.17.7"
clap = { version = "4.4.18", features = ["derive"] }
[build-dependencies]
prost-build = "0.12.2"

1
contrib/crunch-v2/OWNERS Normal file

@@ -0,0 +1 @@
edef

6
contrib/crunch-v2/build.rs Normal file

@@ -0,0 +1,6 @@
use std::io::Result;
fn main() -> Result<()> {
prost_build::compile_protos(&["protos/flatstore.proto"], &["protos/"])?;
Ok(())
}

15
contrib/crunch-v2/default.nix Normal file

@@ -0,0 +1,15 @@
{ pkgs, depot, lib, ... }:
(pkgs.callPackage ./Cargo.nix {
defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
crunch-v2 = prev: {
src = depot.snix.utils.filterRustCrateSrc rec {
root = prev.src.origSrc;
extraFileset = lib.fileset.fileFilter (f: f.hasExt "proto") root;
};
nativeBuildInputs = [ pkgs.protobuf ];
};
};
}).rootCrate.build.overrideAttrs {
meta.ci.extraSteps.crate2nix-check = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
}

38
contrib/crunch-v2/protos/flatstore.proto Normal file

@@ -0,0 +1,38 @@
syntax = "proto3";
package snix.flatstore.v1;
message Path {
bytes nar_hash = 1;
oneof node {
DirectoryNode directory = 2;
FileNode file = 3;
SymlinkNode symlink = 4;
}
}
message DirectoryNode {
bytes name = 1;
repeated DirectoryNode directories = 2;
repeated FileNode files = 3;
repeated SymlinkNode symlinks = 4;
}
message FileNode {
bytes name = 1;
bytes hash = 2;
repeated Chunk chunks = 3;
bool executable = 4;
}
message Chunk {
bytes hash = 1;
uint32 size = 2;
uint32 size_compressed = 3;
}
message SymlinkNode {
bytes name = 1;
bytes target = 2;
}
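For orientation, here is a minimal sketch (illustrative only, not part of this commit) of how these messages are used from Rust. It assumes the prost-generated module is exposed as `crunch_v2::proto`, as the `include!` in the crate's library source further below does; all hash values are placeholders.

```rust
use crunch_v2::proto::{self, path::Node};
use prost::Message;

fn main() {
    // A NAR containing a single file, flattened into one FileNode with one chunk.
    let path = proto::Path {
        nar_hash: vec![0u8; 32], // SHA-256 of the decompressed NAR (placeholder)
        node: Some(Node::File(proto::FileNode {
            name: vec![], // the root node carries an empty name
            hash: vec![0u8; 32], // BLAKE3 of the whole file (placeholder)
            chunks: vec![proto::Chunk {
                hash: vec![0u8; 32], // BLAKE3 of the chunk (placeholder)
                size: 42,
                size_compressed: 40,
            }],
            executable: false,
        })),
    };

    // Encode for storage in sled, then decode it back.
    let bytes = path.encode_to_vec();
    assert_eq!(proto::Path::decode(&bytes[..]).unwrap(), path);
}
```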


@@ -0,0 +1,155 @@
//! This tool lossily converts a Sled database produced by crunch-v2 into a Parquet file for analysis.
//! The resulting `crunch.parquet` has columns `file_hash`, `nar_hash`, and `chunk`.
//! The first two are SHA-256 hashes of the compressed file and the NAR it decompresses to.
//! `chunk` is a struct array corresponding to [crunch_v2::proto::Chunk] messages.
//! They are concatenated without any additional structure, so nothing but the chunk list is preserved.
use anyhow::Result;
use clap::Parser;
use indicatif::{ProgressBar, ProgressStyle};
use std::fs::File;
use std::path::PathBuf;
use crunch_v2::proto::{self, path::Node};
use prost::Message;
use polars::{
chunked_array::builder::AnonymousOwnedListBuilder,
prelude::{
df, BinaryChunkedBuilder, ChunkedBuilder, DataFrame, DataType, Field, ListBuilderTrait,
NamedFrom, ParquetWriter, PrimitiveChunkedBuilder, Series, UInt32Type,
},
series::IntoSeries,
};
#[derive(Parser)]
struct Args {
/// Path to the sled database that's read from.
#[clap(default_value = "crunch.db")]
infile: PathBuf,
/// Path to the resulting parquet file that's written.
#[clap(default_value = "crunch.parquet")]
outfile: PathBuf,
}
fn main() -> Result<()> {
let args = Args::parse();
let w = ParquetWriter::new(File::create(args.outfile)?);
let db: sled::Db = sled::open(&args.infile).unwrap();
let files_tree: sled::Tree = db.open_tree("files").unwrap();
let progress =
ProgressBar::new(files_tree.len() as u64).with_style(ProgressStyle::with_template(
"{elapsed_precise}/{duration_precise} {wide_bar} {pos}/{len}",
)?);
let mut frame = FrameBuilder::new();
for entry in &files_tree {
let (file_hash, pb) = entry?;
frame.push(
file_hash[..].try_into().unwrap(),
proto::Path::decode(&pb[..])?,
);
progress.inc(1);
}
w.finish(&mut frame.finish())?;
Ok(())
}
struct FrameBuilder {
file_hash: BinaryChunkedBuilder,
nar_hash: BinaryChunkedBuilder,
chunk: AnonymousOwnedListBuilder,
}
impl FrameBuilder {
fn new() -> Self {
Self {
file_hash: BinaryChunkedBuilder::new("file_hash", 0, 0),
nar_hash: BinaryChunkedBuilder::new("nar_hash", 0, 0),
chunk: AnonymousOwnedListBuilder::new(
"chunk",
0,
Some(DataType::Struct(vec![
Field::new("hash", DataType::Binary),
Field::new("size", DataType::UInt32),
Field::new("size_compressed", DataType::UInt32),
])),
),
}
}
fn push(&mut self, file_hash: [u8; 32], pb: proto::Path) {
self.file_hash.append_value(&file_hash[..]);
self.nar_hash.append_value(pb.nar_hash);
self.chunk
.append_series(&ChunkFrameBuilder::new(pb.node.unwrap()))
.unwrap();
}
fn finish(mut self) -> DataFrame {
df! {
"file_hash" => self.file_hash.finish().into_series(),
"nar_hash" => self.nar_hash.finish().into_series(),
"chunk" => self.chunk.finish().into_series()
}
.unwrap()
}
}
struct ChunkFrameBuilder {
hash: BinaryChunkedBuilder,
size: PrimitiveChunkedBuilder<UInt32Type>,
size_compressed: PrimitiveChunkedBuilder<UInt32Type>,
}
impl ChunkFrameBuilder {
fn new(node: proto::path::Node) -> Series {
let mut this = Self {
hash: BinaryChunkedBuilder::new("hash", 0, 0),
size: PrimitiveChunkedBuilder::new("size", 0),
size_compressed: PrimitiveChunkedBuilder::new("size_compressed", 0),
};
this.push(node);
this.finish()
}
fn push(&mut self, node: Node) {
match node {
Node::Directory(node) => {
for node in node.files {
self.push(Node::File(node));
}
for node in node.directories {
self.push(Node::Directory(node));
}
}
Node::File(node) => {
for chunk in node.chunks {
self.hash.append_value(&chunk.hash);
self.size.append_value(chunk.size);
self.size_compressed.append_value(chunk.size_compressed);
}
}
Node::Symlink(_) => {}
}
}
fn finish(self) -> Series {
df! {
"hash" => self.hash.finish().into_series(),
"size" => self.size.finish().into_series(),
"size_compressed" => self.size_compressed.finish().into_series()
}
.unwrap()
.into_struct("chunk")
.into_series()
}
}
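The doc comment above describes the layout of `crunch.parquet`. Illustrative only (not part of this commit): a downstream query could sum the chunk sizes before and after zstd to estimate the overall compression ratio, assuming the polars version and features pinned in Cargo.toml above.

```rust
use polars::prelude::*;

fn main() -> PolarsResult<()> {
    let totals = LazyFrame::scan_parquet("crunch.parquet", ScanArgsParquet::default())?
        // One row per chunk instead of one chunk list per NAR.
        .explode([col("chunk")])
        .select([
            col("chunk")
                .struct_()
                .field_by_name("size")
                .cast(DataType::UInt64)
                .sum()
                .alias("uncompressed"),
            col("chunk")
                .struct_()
                .field_by_name("size_compressed")
                .cast(DataType::UInt64)
                .sum()
                .alias("compressed"),
        ])
        .collect()?;
    println!("{totals}");
    Ok(())
}
```

Note that this counts every chunk occurrence; estimating deduplication additionally requires grouping by the chunk `hash` first.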


@@ -0,0 +1,3 @@
pub mod proto {
include!(concat!(env!("OUT_DIR"), "/snix.flatstore.v1.rs"));
}


@@ -0,0 +1,309 @@
//! This is a tool for ingesting subsets of cache.nixos.org into its own flattened castore format.
//! Currently, produced chunks are not preserved, and this purely serves as a way of measuring
//! compression/deduplication ratios for various chunking and compression parameters.
//!
//! NARs to be ingested are read from `ingest.parquet`, and filtered by an SQL expression provided as a program argument.
//! The `file_hash` column should contain SHA-256 hashes of the compressed data, corresponding to the `FileHash` narinfo field.
//! The `compression` column should contain either `"bzip2"` or `"xz"`, corresponding to the `Compression` narinfo field.
//! Additional columns are ignored, but can be used by the SQL filter expression.
//!
//! flatstore protobufs are written to a sled database named `crunch.db`, addressed by file hash.
use crunch_v2::proto;
mod remote;
use anyhow::Result;
use clap::Parser;
use futures::{stream, StreamExt, TryStreamExt};
use indicatif::{ProgressBar, ProgressStyle};
use std::{
io::{self, BufRead, Read, Write},
path::PathBuf,
ptr,
};
use polars::{
prelude::{col, LazyFrame, ScanArgsParquet},
sql::sql_expr,
};
use fastcdc::v2020::{ChunkData, StreamCDC};
use nix_compat::nar::reader as nar;
use digest::Digest;
use prost::Message;
use sha2::Sha256;
#[derive(Parser)]
struct Args {
/// Path to an existing parquet file.
/// The `file_hash` column should contain SHA-256 hashes of the compressed
/// data, corresponding to the `FileHash` narinfo field.
/// The `compression` column should contain either `"bzip2"` or `"xz"`,
/// corresponding to the `Compression` narinfo field.
/// Additional columns are ignored, but can be used by the SQL filter expression.
#[clap(long, default_value = "ingest.parquet")]
infile: PathBuf,
/// SQL filter expression selecting which rows of the parquet file to ingest.
filter: String,
/// Average chunk size for FastCDC, in KiB.
/// The minimum chunk size is half this value and the maximum is double it.
#[clap(long, default_value_t = 256)]
avg_chunk_size: u32,
/// Path to the sled database where results are written (flatstore
/// protobufs, addressed by file hash).
#[clap(long, default_value = "crunch.db")]
outfile: PathBuf,
}
#[tokio::main]
async fn main() -> Result<()> {
let args = Args::parse();
let filter = sql_expr(args.filter)?;
let avg_chunk_size = args.avg_chunk_size * 1024;
let df = LazyFrame::scan_parquet(&args.infile, ScanArgsParquet::default())?
.filter(filter)
.select([col("file_hash"), col("compression")])
.drop_nulls(None)
.collect()?;
let progress = ProgressBar::new(df.height() as u64).with_style(ProgressStyle::with_template(
"{elapsed_precise}/{duration_precise} {wide_bar} {pos}/{len}",
)?);
let file_hash = df
.column("file_hash")?
.binary()?
.into_iter()
.map(|h| -> [u8; 32] { h.unwrap().try_into().unwrap() });
let compression = df
.column("compression")?
.utf8()?
.into_iter()
.map(|c| c.unwrap());
let db: sled::Db = sled::open(args.outfile).unwrap();
let files_tree = db.open_tree("files").unwrap();
let res = stream::iter(file_hash.zip(compression))
.map(Ok)
.try_for_each_concurrent(Some(16), |(file_hash, compression)| {
let progress = progress.clone();
let files_tree = files_tree.clone();
async move {
if files_tree.contains_key(&file_hash)? {
progress.inc(1);
return Ok(());
}
let reader = remote::nar(file_hash, compression).await?;
tokio::task::spawn_blocking(move || {
let mut reader = Sha256Reader::from(reader);
let path =
ingest(nar::open(&mut reader)?, vec![], avg_chunk_size).map(|node| {
proto::Path {
nar_hash: reader.finalize().as_slice().into(),
node: Some(node),
}
})?;
files_tree.insert(file_hash, path.encode_to_vec())?;
progress.inc(1);
Ok::<_, anyhow::Error>(())
})
.await?
}
})
.await;
let flush = files_tree.flush_async().await;
res?;
flush?;
Ok(())
}
fn ingest(node: nar::Node, name: Vec<u8>, avg_chunk_size: u32) -> Result<proto::path::Node> {
match node {
nar::Node::Symlink { target } => Ok(proto::path::Node::Symlink(proto::SymlinkNode {
name,
target,
})),
nar::Node::Directory(mut reader) => {
let mut directories = vec![];
let mut files = vec![];
let mut symlinks = vec![];
while let Some(node) = reader.next()? {
match ingest(node.node, node.name.to_owned(), avg_chunk_size)? {
proto::path::Node::Directory(node) => {
directories.push(node);
}
proto::path::Node::File(node) => {
files.push(node);
}
proto::path::Node::Symlink(node) => {
symlinks.push(node);
}
}
}
Ok(proto::path::Node::Directory(proto::DirectoryNode {
name,
directories,
files,
symlinks,
}))
}
nar::Node::File { executable, reader } => {
let mut reader = B3Reader::from(reader);
let mut chunks = vec![];
for chunk in StreamCDC::new(
&mut reader,
avg_chunk_size / 2,
avg_chunk_size,
avg_chunk_size * 2,
) {
let ChunkData {
length: size, data, ..
} = chunk?;
let hash = blake3::hash(&data);
let size_compressed = zstd_size(&data, 9);
chunks.push(proto::Chunk {
hash: hash.as_bytes().as_slice().into(),
size: size.try_into().unwrap(),
size_compressed: size_compressed.try_into().unwrap(),
});
}
Ok(proto::path::Node::File(proto::FileNode {
name,
hash: reader.finalize().as_bytes().as_slice().into(),
chunks,
executable,
}))
}
}
}
struct Sha256Reader<R> {
inner: R,
hasher: Sha256,
buf: *const [u8],
}
const ZERO_BUF: *const [u8] = ptr::slice_from_raw_parts(1 as *const u8, 0);
unsafe impl<R: Send> Send for Sha256Reader<R> {}
impl<R> From<R> for Sha256Reader<R> {
fn from(value: R) -> Self {
Self {
inner: value,
hasher: Sha256::new(),
buf: ZERO_BUF,
}
}
}
impl<R: Read> Read for Sha256Reader<R> {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
self.buf = ZERO_BUF;
let n = self.inner.read(buf)?;
self.hasher.update(&buf[..n]);
Ok(n)
}
}
impl<R: BufRead> BufRead for Sha256Reader<R> {
fn fill_buf(&mut self) -> io::Result<&[u8]> {
self.buf = ZERO_BUF;
let buf = self.inner.fill_buf()?;
self.buf = buf as *const [u8];
Ok(buf)
}
fn consume(&mut self, amt: usize) {
// UNSAFETY: This assumes that `R::consume` doesn't invalidate the buffer.
// That's not a sound assumption in general, though it is likely to hold.
// TODO(edef): refactor this codebase to write a fresh NAR for verification purposes
// we already buffer full chunks, so there's no pressing need to reuse the input buffers
unsafe {
let (head, buf) = (*self.buf).split_at(amt);
self.buf = buf as *const [u8];
self.hasher.update(head);
self.inner.consume(amt);
}
}
}
impl<R> Sha256Reader<R> {
fn finalize(self) -> [u8; 32] {
self.hasher.finalize().into()
}
}
struct B3Reader<R> {
inner: R,
hasher: blake3::Hasher,
}
impl<R> From<R> for B3Reader<R> {
fn from(value: R) -> Self {
Self {
inner: value,
hasher: blake3::Hasher::new(),
}
}
}
impl<R: Read> Read for B3Reader<R> {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
let n = self.inner.read(buf)?;
self.hasher.update(&buf[..n]);
Ok(n)
}
}
impl<R> B3Reader<R> {
fn finalize(self) -> blake3::Hash {
self.hasher.finalize()
}
}
fn zstd_size(data: &[u8], level: i32) -> u64 {
let mut w = zstd::Encoder::new(CountingWriter::default(), level).unwrap();
w.write_all(&data).unwrap();
let CountingWriter(size) = w.finish().unwrap();
size
}
#[derive(Default)]
struct CountingWriter(u64);
impl Write for CountingWriter {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.0 += buf.len() as u64;
Ok(buf.len())
}
fn flush(&mut self) -> io::Result<()> {
Ok(())
}
}
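The tool above expects an `ingest.parquet` with a binary `file_hash` column (SHA-256 of the compressed NAR, the narinfo `FileHash`) and a string `compression` column. Illustrative only (not part of this commit): a minimal such file could be produced with the same polars version like this; the hash is a placeholder.

```rust
use polars::prelude::*;
use std::fs::File;

fn main() -> PolarsResult<()> {
    // One row: a 32-byte placeholder FileHash and its Compression value.
    let mut file_hash = BinaryChunkedBuilder::new("file_hash", 1, 32);
    file_hash.append_value(&[0u8; 32]);

    let mut df = df! {
        "file_hash" => file_hash.finish().into_series(),
        "compression" => ["xz"]
    }?;
    ParquetWriter::new(File::create("ingest.parquet")?).finish(&mut df)?;
    Ok(())
}
```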


@@ -0,0 +1,211 @@
use std::{
cmp,
io::{self, BufRead, BufReader, Read},
pin::Pin,
task::{self, Poll},
};
use anyhow::{bail, Result};
use bytes::{Buf, Bytes};
use futures::{future::BoxFuture, Future, FutureExt, Stream, StreamExt};
use lazy_static::lazy_static;
use tokio::runtime::Handle;
use nix_compat::nixbase32;
use rusoto_core::{ByteStream, Region};
use rusoto_s3::{GetObjectOutput, GetObjectRequest, S3Client, S3};
use bzip2::read::BzDecoder;
use xz2::read::XzDecoder;
lazy_static! {
static ref S3_CLIENT: S3Client = S3Client::new(Region::UsEast1);
}
const BUCKET: &str = "nix-cache";
pub async fn nar(
file_hash: [u8; 32],
compression: &str,
) -> Result<Box<BufReader<dyn Read + Send>>> {
let (extension, decompress): (&'static str, fn(_) -> Box<_>) = match compression {
"bzip2" => ("bz2", decompress_bz2),
"xz" => ("xz", decompress_xz),
_ => bail!("unknown compression: {compression}"),
};
Ok(decompress(
FileStream::new(FileKey {
file_hash,
extension,
})
.await?
.into(),
))
}
fn decompress_xz(reader: FileStreamReader) -> Box<BufReader<dyn Read + Send>> {
Box::new(BufReader::new(XzDecoder::new(reader)))
}
fn decompress_bz2(reader: FileStreamReader) -> Box<BufReader<dyn Read + Send>> {
Box::new(BufReader::new(BzDecoder::new(reader)))
}
struct FileStreamReader {
inner: FileStream,
buffer: Bytes,
}
impl From<FileStream> for FileStreamReader {
fn from(value: FileStream) -> Self {
FileStreamReader {
inner: value,
buffer: Bytes::new(),
}
}
}
impl Read for FileStreamReader {
fn read(&mut self, dst: &mut [u8]) -> io::Result<usize> {
let src = self.fill_buf()?;
let n = cmp::min(src.len(), dst.len());
dst[..n].copy_from_slice(&src[..n]);
self.consume(n);
Ok(n)
}
}
impl BufRead for FileStreamReader {
fn fill_buf(&mut self) -> io::Result<&[u8]> {
if !self.buffer.is_empty() {
return Ok(&self.buffer);
}
self.buffer = Handle::current()
.block_on(self.inner.next())
.transpose()?
.unwrap_or_default();
Ok(&self.buffer)
}
fn consume(&mut self, cnt: usize) {
self.buffer.advance(cnt);
}
}
struct FileKey {
file_hash: [u8; 32],
extension: &'static str,
}
impl FileKey {
fn get(
&self,
offset: u64,
e_tag: Option<&str>,
) -> impl Future<Output = io::Result<GetObjectOutput>> + Send + 'static {
let input = GetObjectRequest {
bucket: BUCKET.to_string(),
key: format!(
"nar/{}.nar.{}",
nixbase32::encode(&self.file_hash),
self.extension
),
if_match: e_tag.map(str::to_owned),
range: Some(format!("bytes={}-", offset)),
..Default::default()
};
async {
S3_CLIENT
.get_object(input)
.await
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))
}
}
}
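// Streams an S3 object as `Bytes` chunks; if the body stream errors or ends before
// `length` bytes have been delivered, the remainder is re-requested from the current
// offset, pinned to the original ETag via If-Match.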
struct FileStream {
key: FileKey,
e_tag: String,
offset: u64,
length: u64,
inner: FileStreamState,
}
enum FileStreamState {
Response(BoxFuture<'static, io::Result<GetObjectOutput>>),
Body(ByteStream),
Eof,
}
impl FileStream {
pub async fn new(key: FileKey) -> io::Result<Self> {
let resp = key.get(0, None).await?;
Ok(FileStream {
key,
e_tag: resp.e_tag.unwrap(),
offset: 0,
length: resp.content_length.unwrap().try_into().unwrap(),
inner: FileStreamState::Body(resp.body.unwrap()),
})
}
}
macro_rules! poll {
($expr:expr) => {
match $expr {
Poll::Pending => {
return Poll::Pending;
}
Poll::Ready(value) => value,
}
};
}
impl Stream for FileStream {
type Item = io::Result<Bytes>;
fn poll_next(self: Pin<&mut Self>, cx: &mut task::Context) -> Poll<Option<Self::Item>> {
let this = self.get_mut();
let chunk = loop {
match &mut this.inner {
FileStreamState::Response(resp) => match poll!(resp.poll_unpin(cx)) {
Err(err) => {
this.inner = FileStreamState::Eof;
return Poll::Ready(Some(Err(err)));
}
Ok(resp) => {
this.inner = FileStreamState::Body(resp.body.unwrap());
}
},
FileStreamState::Body(body) => match poll!(body.poll_next_unpin(cx)) {
None | Some(Err(_)) => {
this.inner = FileStreamState::Response(
this.key.get(this.offset, Some(&this.e_tag)).boxed(),
);
}
Some(Ok(chunk)) => {
break chunk;
}
},
FileStreamState::Eof => {
return Poll::Ready(None);
}
}
};
this.offset += chunk.len() as u64;
if this.offset >= this.length {
this.inner = FileStreamState::Eof;
}
Poll::Ready(Some(Ok(chunk)))
}
}

2
contrib/fetchroots/.gitignore vendored Normal file

@@ -0,0 +1,2 @@
/target
/roots.parquet

3417
contrib/fetchroots/Cargo.lock generated Normal file

File diff suppressed because it is too large

12830
contrib/fetchroots/Cargo.nix Normal file

File diff suppressed because it is too large

23
contrib/fetchroots/Cargo.toml Normal file

@@ -0,0 +1,23 @@
[package]
name = "fetchroots"
version = "0.0.0"
edition = "2021"
[workspace]
members = ["."]
[dependencies]
anyhow = { version = "1.0.80", features = ["backtrace"] }
aws-config = "1.1.6"
aws-sdk-s3 = "1.16.0"
bytes = "1.5.0"
bytes-utils = "0.1.4"
bzip2 = "0.4.4"
chrono = "0.4.34"
futures = "0.3.30"
indicatif = "0.17.8"
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
polars = { version = "0.36.2", features = ["parquet"] }
rayon = "1.8.1"
tokio = { version = "1.36.0", features = ["full"] }
xz2 = "0.1.7"

36
contrib/fetchroots/README.md Normal file

@@ -0,0 +1,36 @@
# fetchroots
> This tool is part of a suite of tools built to manage cache.nixos.org.
Its purpose is to build an index of all the GC roots of the channels.nixos.org
releases. The result is then combined with the output of other tools.
It does this by:
1. Listing all the release files in the bucket.
2. Fetching the GC roots data for each release.
3. Writing them to a local Parquet file.
## Getting started
In order to run this, you'll need AWS SSO credentials from the NixOS Infra team.
Get the creds from https://nixos.awsapps.com/start/ -> LBNixOS_Dev_PDX -> AWSReadOnlyAccess.
Run `mg run`; you should see a progress bar.
Congrats, you now have a `roots.parquet` file. You can now load it with python, polars-rs, or clickhouse.
## `roots.parquet` file format
* `key` (`String`): the release, eg `nixos/22.11-small/nixos-22.11.513.563dc6476b8`
* `timestamp` (`DateTime`): the timestamp of the GC roots file for this release
* `store_path_hash` (`List[Binary]`): hash part of the store paths rooted by this release
## Development
When the Cargo.lock changes, run `mg run //tools:crate2nix-generate`.
To build the project, run `mg build`.
To get a dev environment, run `nix-shell -p cargo`.
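Illustrative only (not part of this commit): reading `roots.parquet` back from Rust with the polars version pinned above, counting releases and distinct rooted store path hashes.

```rust
use polars::prelude::*;
use std::fs::File;

fn main() -> PolarsResult<()> {
    let df = ParquetReader::new(File::open("roots.parquet")?).finish()?;
    // One row per release; `store_path_hash` is a list of 20-byte hash parts.
    let distinct = df.column("store_path_hash")?.explode()?.unique()?.len();
    println!("{} releases, {} distinct store path hashes", df.height(), distinct);
    Ok(())
}
```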

11
contrib/fetchroots/default.nix Normal file

@@ -0,0 +1,11 @@
{ pkgs, depot, ... }:
(pkgs.callPackage ./Cargo.nix {
defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
fetchroots = prev: {
src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
};
};
}).rootCrate.build.overrideAttrs {
meta.ci.extraSteps.crate2nix = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
}


@@ -0,0 +1,257 @@
//! Fetch all[^1] GC roots from releases.nixos.org into a `roots.parquet` file.
//!
//! The resulting Parquet has three columns:
//!
//! * `key` (`String`): the release, eg `nixos/22.11-small/nixos-22.11.513.563dc6476b8`
//! * `timestamp` (`DateTime`): the timestamp of the GC roots file for this release
//! * `store_path_hash` (`List[Binary]`): hash part of the store paths rooted by this release
//!
//! [^1]: some roots are truly ancient, and aren't compatible with Nix 1.x
use anyhow::Result;
use std::{
collections::BTreeMap,
fs::File,
io::{BufRead, Read},
sync::Arc,
time::SystemTime,
};
use aws_config::Region;
use aws_sdk_s3::operation::get_object::builders::GetObjectFluentBuilder;
use bytes::{Buf, Bytes};
use bytes_utils::SegmentedBuf;
use chrono::{DateTime, Utc};
use nix_compat::nixbase32;
use polars::prelude::*;
use tokio::{
sync::Semaphore,
task::{block_in_place, JoinSet},
};
#[derive(Debug)]
struct Meta {
format: Format,
e_tag: String,
last_modified: DateTime<Utc>,
}
#[tokio::main]
async fn main() {
let sdk_config = aws_config::load_defaults(aws_config::BehaviorVersion::v2023_11_09())
.await
.into_builder()
.region(Region::from_static("eu-west-1"))
.build();
let s3 = aws_sdk_s3::Client::new(&sdk_config);
let mut keys: BTreeMap<String, Meta> = {
let pages = s3
.list_objects_v2()
.bucket("nix-releases")
.into_paginator()
.send()
.try_collect()
.await
.unwrap();
let objects = pages.into_iter().flat_map(|page| {
assert_eq!(page.prefix().unwrap_or_default(), "");
assert!(page.common_prefixes.is_none());
page.contents.unwrap_or_default()
});
let mut prev_key = String::new();
objects
.filter_map(|obj| {
let key = obj.key().unwrap();
assert!(&*prev_key < key);
key.clone_into(&mut prev_key);
let (key, tail) = key.rsplit_once('/')?;
// Our preference order happens to match lexicographical order,
// and listings are returned in lexicographical order.
let format = match tail {
"MANIFEST" => Format::Manifest,
"MANIFEST.bz2" => Format::ManifestBz,
"store-paths.xz" => Format::StorePathsXz,
_ => return None,
};
Some((
key.to_owned(),
Meta {
format,
e_tag: obj.e_tag.unwrap(),
last_modified: SystemTime::try_from(obj.last_modified.unwrap())
.unwrap()
.into(),
},
))
})
.collect()
};
// These releases are so old they don't even use nixbase32 store paths.
for key in [
"nix/nix-0.6",
"nix/nix-0.6.1",
"nix/nix-0.7",
"nix/nix-0.8",
"nixpkgs/nixpkgs-0.5",
"nixpkgs/nixpkgs-0.5.1",
"nixpkgs/nixpkgs-0.6",
"nixpkgs/nixpkgs-0.7",
"nixpkgs/nixpkgs-0.8",
"nixpkgs/nixpkgs-0.9",
"nixpkgs/nixpkgs-0.10",
"nixpkgs/nixpkgs-0.11",
] {
assert!(keys.remove(key).is_some());
}
let mut js = JoinSet::new();
let sem = Arc::new(Semaphore::new(16));
let bar = indicatif::ProgressBar::new(keys.len() as u64);
for (root, meta) in keys {
let sem = sem.clone();
let s3 = s3.clone();
js.spawn(async move {
let _permit = sem.acquire().await.unwrap();
let body = get_object(
s3.get_object()
.bucket("nix-releases")
.key(format!("{root}/{}", meta.format.as_str()))
.if_match(meta.e_tag),
)
.await
.unwrap()
.reader();
let ph_array = block_in_place(|| meta.format.to_ph_array(body).rechunk());
df! {
"key" => [root],
"timestamp" => [meta.last_modified.naive_utc()],
"store_path_hash" => ph_array.into_series().implode().unwrap()
}
.unwrap()
});
}
let mut writer = ParquetWriter::new(File::create("roots.parquet").unwrap())
.batched(&Schema::from_iter([
Field::new("key", DataType::String),
Field::new(
"timestamp",
DataType::Datetime(TimeUnit::Milliseconds, None),
),
Field::new(
"store_path_hash",
DataType::List(Box::new(DataType::Binary)),
),
]))
.unwrap();
while let Some(df) = js.join_next().await.transpose().unwrap() {
block_in_place(|| writer.write_batch(&df)).unwrap();
bar.inc(1);
}
writer.finish().unwrap();
}
#[derive(Debug)]
enum Format {
Manifest,
ManifestBz,
StorePathsXz,
}
impl Format {
fn as_str(&self) -> &'static str {
match self {
Format::Manifest => "MANIFEST",
Format::ManifestBz => "MANIFEST.bz2",
Format::StorePathsXz => "store-paths.xz",
}
}
fn to_ph_array(&self, mut body: impl BufRead) -> BinaryChunked {
match self {
Format::Manifest | Format::ManifestBz => {
let mut buf = String::new();
match self {
Format::Manifest => {
body.read_to_string(&mut buf).unwrap();
}
Format::ManifestBz => {
bzip2::bufread::BzDecoder::new(body)
.read_to_string(&mut buf)
.unwrap();
}
_ => unreachable!(),
}
let buf = buf
.strip_prefix("version {\n ManifestVersion: 3\n}\n")
.unwrap();
BinaryChunked::from_iter_values(
"store_path_hash",
buf.split_terminator("}\n").map(|chunk| -> [u8; 20] {
let chunk = chunk.strip_prefix("patch ").unwrap_or(chunk);
let line = chunk.strip_prefix("{\n StorePath: /nix/store/").unwrap();
nixbase32::decode_fixed(&line[..32]).unwrap()
}),
)
}
Format::StorePathsXz => {
let mut buf = String::new();
xz2::bufread::XzDecoder::new(body)
.read_to_string(&mut buf)
.unwrap();
BinaryChunked::from_iter_values(
"store_path_hash",
buf.split_terminator('\n').map(|line| -> [u8; 20] {
let line = line.strip_prefix("/nix/store/").unwrap();
nixbase32::decode_fixed(&line[..32]).unwrap()
}),
)
}
}
}
}
async fn get_object(request: GetObjectFluentBuilder) -> Result<SegmentedBuf<Bytes>> {
// if we don't constrain the ETag, we might experience read skew
assert!(request.get_if_match().is_some(), "if_match must be set");
let mut buf: SegmentedBuf<Bytes> = SegmentedBuf::new();
let mut resp = request.clone().send().await?;
let content_length: usize = resp.content_length.unwrap().try_into().unwrap();
loop {
while let Ok(Some(chunk)) = resp.body.try_next().await {
buf.push(chunk);
}
if buf.remaining() >= content_length {
assert_eq!(buf.remaining(), content_length, "got excess bytes");
break Ok(buf);
}
resp = request
.clone()
.range(format!("bytes={}-", buf.remaining()))
.send()
.await?;
assert_ne!(resp.content_range, None);
}
}

2304
contrib/narinfo2parquet/Cargo.lock generated Normal file

File diff suppressed because it is too large

contrib/narinfo2parquet/Cargo.nix Normal file

File diff suppressed because it is too large

28
contrib/narinfo2parquet/Cargo.toml Normal file

@@ -0,0 +1,28 @@
[package]
name = "narinfo2parquet"
version = "0.1.0"
edition = "2021"
# We can't join the //tvix workspace, because that locks zstd
# at an ancient version, which is incompatible with polars
[workspace]
members = ["."]
[dependencies]
anyhow = { version = "1.0.75", features = ["backtrace"] }
jemallocator = "0.5.4"
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
tempfile-fast = "0.3.4"
zstd = "0.13.0"
# See https://github.com/pola-rs/polars/issues/19157
hashbrown = { version = "0.14.5", features = ["raw"] }
[dependencies.polars]
version = "0.36.2"
default-features = false
features = [
"parquet",
"polars-io",
"dtype-categorical"
]

1
contrib/narinfo2parquet/OWNERS Normal file

@@ -0,0 +1 @@
edef

11
contrib/narinfo2parquet/default.nix Normal file

@@ -0,0 +1,11 @@
{ pkgs, depot, ... }:
(pkgs.callPackage ./Cargo.nix {
defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
narinfo2parquet = prev: {
src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
};
};
}).rootCrate.build.overrideAttrs {
meta.ci.extraSteps.crate2nix = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
}


@@ -0,0 +1,264 @@
//! narinfo2parquet operates on a narinfo.zst directory produced by turbofetch.
//! It takes the name of a segment file in `narinfo.zst` and writes a Parquet file
//! with the same name into the `narinfo.pq` directory.
//!
//! Run it under GNU Parallel for parallelism:
//! ```shell
//! mkdir narinfo.pq && ls narinfo.zst | parallel --bar 'narinfo2parquet {}'
//! ```
use anyhow::{bail, Context, Result};
use jemallocator::Jemalloc;
use nix_compat::{
narinfo::{self, NarInfo},
nixbase32,
};
use polars::{io::parquet::ParquetWriter, prelude::*};
use std::{
fs::{self, File},
io::{self, BufRead, BufReader, Read},
path::Path,
};
use tempfile_fast::PersistableTempFile;
#[global_allocator]
static GLOBAL: Jemalloc = Jemalloc;
fn main() -> Result<()> {
let file_name = std::env::args().nth(1).expect("file name missing");
let input_path = Path::new("narinfo.zst").join(&file_name);
let output_path = Path::new("narinfo.pq").join(&file_name);
match fs::metadata(&output_path) {
Err(e) if e.kind() == io::ErrorKind::NotFound => {}
Err(e) => bail!(e),
Ok(_) => bail!("output path already exists: {output_path:?}"),
}
let reader = File::open(input_path).and_then(zstd::Decoder::new)?;
let mut frame = FrameBuilder::default();
for_each(reader, |s| {
let entry = NarInfo::parse(&s).with_context(|| format!("couldn't parse entry:\n{s}"))?;
frame.push(&entry);
Ok(())
})?;
let mut frame = frame.finish();
let mut writer = PersistableTempFile::new_in(output_path.parent().unwrap())?;
ParquetWriter::new(&mut writer)
.with_compression(ParquetCompression::Gzip(None))
.with_statistics(true)
.finish(frame.align_chunks())?;
writer
.persist_noclobber(output_path)
.map_err(|e| e.error)
.context("couldn't commit output file")?;
Ok(())
}
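// Splits the concatenated narinfo text into one entry per `StorePath:` line and
// calls `f` once per entry; blank separator lines are dropped.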
fn for_each(reader: impl Read, mut f: impl FnMut(&str) -> Result<()>) -> Result<()> {
let mut reader = BufReader::new(reader);
let mut group = String::new();
loop {
let prev_len = group.len();
if prev_len > 1024 * 1024 {
bail!("excessively large segment");
}
reader.read_line(&mut group)?;
let (prev, line) = group.split_at(prev_len);
// EOF
if line.is_empty() {
break;
}
// skip empty line
if line == "\n" {
group.pop().unwrap();
continue;
}
if !prev.is_empty() && line.starts_with("StorePath:") {
f(prev)?;
group.drain(..prev_len);
}
}
if !group.is_empty() {
f(&group)?;
}
Ok(())
}
/// [FrameBuilder] builds a [DataFrame] out of [NarInfo]s.
/// The exact format is still in flux.
///
/// # Example
///
/// ```no_run
/// |narinfos: &[NarInfo]| -> DataFrame {
/// let mut frame_builder = FrameBuilder::default();
/// narinfos.iter().for_each(|n| frame_builder.push(n));
/// frame_builder.finish()
/// }
/// ```
struct FrameBuilder {
store_path_hash_str: StringChunkedBuilder,
store_path_hash: BinaryChunkedBuilder,
store_path_name: StringChunkedBuilder,
deriver_hash_str: StringChunkedBuilder,
deriver_hash: BinaryChunkedBuilder,
deriver_name: StringChunkedBuilder,
nar_hash: BinaryChunkedBuilder,
nar_size: PrimitiveChunkedBuilder<UInt64Type>,
references: ListBinaryChunkedBuilder,
ca_algo: CategoricalChunkedBuilder<'static>,
ca_hash: BinaryChunkedBuilder,
signature: BinaryChunkedBuilder,
file_hash: BinaryChunkedBuilder,
file_size: PrimitiveChunkedBuilder<UInt64Type>,
compression: CategoricalChunkedBuilder<'static>,
quirk_references_out_of_order: BooleanChunkedBuilder,
quirk_nar_hash_hex: BooleanChunkedBuilder,
}
impl Default for FrameBuilder {
fn default() -> Self {
Self {
store_path_hash_str: StringChunkedBuilder::new("store_path_hash_str", 0, 0),
store_path_hash: BinaryChunkedBuilder::new("store_path_hash", 0, 0),
store_path_name: StringChunkedBuilder::new("store_path_name", 0, 0),
deriver_hash_str: StringChunkedBuilder::new("deriver_hash_str", 0, 0),
deriver_hash: BinaryChunkedBuilder::new("deriver_hash", 0, 0),
deriver_name: StringChunkedBuilder::new("deriver_name", 0, 0),
nar_hash: BinaryChunkedBuilder::new("nar_hash", 0, 0),
nar_size: PrimitiveChunkedBuilder::new("nar_size", 0),
references: ListBinaryChunkedBuilder::new("references", 0, 0),
signature: BinaryChunkedBuilder::new("signature", 0, 0),
ca_algo: CategoricalChunkedBuilder::new("ca_algo", 0, CategoricalOrdering::Lexical),
ca_hash: BinaryChunkedBuilder::new("ca_hash", 0, 0),
file_hash: BinaryChunkedBuilder::new("file_hash", 0, 0),
file_size: PrimitiveChunkedBuilder::new("file_size", 0),
compression: CategoricalChunkedBuilder::new(
"compression",
0,
CategoricalOrdering::Lexical,
),
quirk_references_out_of_order: BooleanChunkedBuilder::new(
"quirk_references_out_of_order",
0,
),
quirk_nar_hash_hex: BooleanChunkedBuilder::new("quirk_nar_hash_hex", 0),
}
}
}
impl FrameBuilder {
fn push(&mut self, entry: &NarInfo) {
self.store_path_hash_str
.append_value(nixbase32::encode(entry.store_path.digest()));
self.store_path_hash.append_value(entry.store_path.digest());
self.store_path_name.append_value(entry.store_path.name());
if let Some(deriver) = &entry.deriver {
self.deriver_hash_str
.append_value(nixbase32::encode(deriver.digest()));
self.deriver_hash.append_value(deriver.digest());
self.deriver_name.append_value(deriver.name());
} else {
self.deriver_hash_str.append_null();
self.deriver_hash.append_null();
self.deriver_name.append_null();
}
self.nar_hash.append_value(&entry.nar_hash);
self.nar_size.append_value(entry.nar_size);
self.references
.append_values_iter(entry.references.iter().map(|r| r.digest().as_slice()));
assert!(entry.signatures.len() <= 1);
self.signature
.append_option(entry.signatures.get(0).map(|sig| {
assert_eq!(sig.name(), &"cache.nixos.org-1");
sig.bytes()
}));
if let Some(ca) = &entry.ca {
self.ca_algo.append_value(ca.algo_str());
self.ca_hash.append_value(ca.hash().digest_as_bytes());
} else {
self.ca_algo.append_null();
self.ca_hash.append_null();
}
let file_hash = entry.file_hash.as_ref().unwrap();
let file_size = entry.file_size.unwrap();
self.file_hash.append_value(file_hash);
self.file_size.append_value(file_size);
let (compression, extension) = match entry.compression {
Some("bzip2") => ("bzip2", "bz2"),
Some("xz") => ("xz", "xz"),
Some("zstd") => ("zstd", "zst"),
x => panic!("unknown compression algorithm: {x:?}"),
};
self.compression.append_value(compression);
let mut file_name = nixbase32::encode(file_hash);
file_name.push_str(".nar.");
file_name.push_str(extension);
assert_eq!(entry.url.strip_prefix("nar/").unwrap(), file_name);
{
use narinfo::Flags;
self.quirk_references_out_of_order
.append_value(entry.flags.contains(Flags::REFERENCES_OUT_OF_ORDER));
self.quirk_nar_hash_hex
.append_value(entry.flags.contains(Flags::NAR_HASH_HEX));
let quirks = Flags::REFERENCES_OUT_OF_ORDER | Flags::NAR_HASH_HEX;
let unknown_flags = entry.flags.difference(quirks);
assert!(
unknown_flags.is_empty(),
"rejecting flags: {unknown_flags:?}"
);
}
}
fn finish(mut self) -> DataFrame {
df! {
"store_path_hash_str" => self.store_path_hash_str.finish().into_series(),
"store_path_hash" => self.store_path_hash.finish().into_series(),
"store_path_name" => self.store_path_name.finish().into_series(),
"deriver_hash_str" => self.deriver_hash_str.finish().into_series(),
"deriver_hash" => self.deriver_hash.finish().into_series(),
"deriver_name" => self.deriver_name.finish().into_series(),
"nar_hash" => self.nar_hash.finish().into_series(),
"nar_size" => self.nar_size.finish().into_series(),
"references" => self.references.finish().into_series(),
"signature" => self.signature.finish().into_series(),
"ca_algo" => self.ca_algo.finish().into_series(),
"ca_hash" => self.ca_hash.finish().into_series(),
"file_hash" => self.file_hash.finish().into_series(),
"file_size" => self.file_size.finish().into_series(),
"compression" => self.compression.finish().into_series(),
"quirk_references_out_of_order" => self.quirk_references_out_of_order.finish().into_series(),
"quirk_nar_hash_hex" => self.quirk_nar_hash_hex.finish().into_series()
}
.unwrap()
}
}
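Illustrative only (not part of this commit): a quick sanity check over one of the resulting Parquet segments, using the polars version pinned above. The segment name is passed on the command line, since the file names are whatever turbofetch produced.

```rust
use polars::prelude::*;
use std::fs::File;

fn main() -> PolarsResult<()> {
    let file_name = std::env::args().nth(1).expect("segment name missing");
    let path = std::path::Path::new("narinfo.pq").join(file_name);

    let df = ParquetReader::new(File::open(path)?).finish()?;
    println!("{} narinfos, columns: {:?}", df.height(), df.get_column_names());
    println!("compression values:\n{}", df.column("compression")?.unique()?);
    Ok(())
}
```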

5
contrib/refscan/.gitignore vendored Normal file

@@ -0,0 +1,5 @@
# SPDX-FileCopyrightText: edef <edef@edef.eu>
# SPDX-License-Identifier: CC0-1.0
/target
**/*.rs.bk

7
contrib/refscan/Cargo.lock generated Normal file

@@ -0,0 +1,7 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "refscan"
version = "0.1.0"


@@ -0,0 +1,3 @@
SPDX-FileCopyrightText: edef <edef@edef.eu>
SPDX-License-Identifier: CC0-1.0

10
contrib/refscan/Cargo.toml Normal file

@@ -0,0 +1,10 @@
# SPDX-FileCopyrightText: edef <edef@edef.eu>
# SPDX-License-Identifier: MPL-2.0
[package]
name = "refscan"
version = "0.1.0"
authors = ["edef <edef@edef.eu>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html


@@ -0,0 +1,121 @@
Creative Commons Legal Code
CC0 1.0 Universal
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
HEREUNDER.
Statement of Purpose
The laws of most jurisdictions throughout the world automatically confer
exclusive Copyright and Related Rights (defined below) upon the creator
and subsequent owner(s) (each and all, an "owner") of an original work of
authorship and/or a database (each, a "Work").
Certain owners wish to permanently relinquish those rights to a Work for
the purpose of contributing to a commons of creative, cultural and
scientific works ("Commons") that the public can reliably and without fear
of later claims of infringement build upon, modify, incorporate in other
works, reuse and redistribute as freely as possible in any form whatsoever
and for any purposes, including without limitation commercial purposes.
These owners may contribute to the Commons to promote the ideal of a free
culture and the further production of creative, cultural and scientific
works, or to gain reputation or greater distribution for their Work in
part through the use and efforts of others.
For these and/or other purposes and motivations, and without any
expectation of additional consideration or compensation, the person
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
is an owner of Copyright and Related Rights in the Work, voluntarily
elects to apply CC0 to the Work and publicly distribute the Work under its
terms, with knowledge of his or her Copyright and Related Rights in the
Work and the meaning and intended legal effect of CC0 on those rights.
1. Copyright and Related Rights. A Work made available under CC0 may be
protected by copyright and related or neighboring rights ("Copyright and
Related Rights"). Copyright and Related Rights include, but are not
limited to, the following:
i. the right to reproduce, adapt, distribute, perform, display,
communicate, and translate a Work;
ii. moral rights retained by the original author(s) and/or performer(s);
iii. publicity and privacy rights pertaining to a person's image or
likeness depicted in a Work;
iv. rights protecting against unfair competition in regards to a Work,
subject to the limitations in paragraph 4(a), below;
v. rights protecting the extraction, dissemination, use and reuse of data
in a Work;
vi. database rights (such as those arising under Directive 96/9/EC of the
European Parliament and of the Council of 11 March 1996 on the legal
protection of databases, and under any national implementation
thereof, including any amended or successor version of such
directive); and
vii. other similar, equivalent or corresponding rights throughout the
world based on applicable law or treaty, and any national
implementations thereof.
2. Waiver. To the greatest extent permitted by, but not in contravention
of, applicable law, Affirmer hereby overtly, fully, permanently,
irrevocably and unconditionally waives, abandons, and surrenders all of
Affirmer's Copyright and Related Rights and associated claims and causes
of action, whether now known or unknown (including existing as well as
future claims and causes of action), in the Work (i) in all territories
worldwide, (ii) for the maximum duration provided by applicable law or
treaty (including future time extensions), (iii) in any current or future
medium and for any number of copies, and (iv) for any purpose whatsoever,
including without limitation commercial, advertising or promotional
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
member of the public at large and to the detriment of Affirmer's heirs and
successors, fully intending that such Waiver shall not be subject to
revocation, rescission, cancellation, termination, or any other legal or
equitable action to disrupt the quiet enjoyment of the Work by the public
as contemplated by Affirmer's express Statement of Purpose.
3. Public License Fallback. Should any part of the Waiver for any reason
be judged legally invalid or ineffective under applicable law, then the
Waiver shall be preserved to the maximum extent permitted taking into
account Affirmer's express Statement of Purpose. In addition, to the
extent the Waiver is so judged Affirmer hereby grants to each affected
person a royalty-free, non transferable, non sublicensable, non exclusive,
irrevocable and unconditional license to exercise Affirmer's Copyright and
Related Rights in the Work (i) in all territories worldwide, (ii) for the
maximum duration provided by applicable law or treaty (including future
time extensions), (iii) in any current or future medium and for any number
of copies, and (iv) for any purpose whatsoever, including without
limitation commercial, advertising or promotional purposes (the
"License"). The License shall be deemed effective as of the date CC0 was
applied by Affirmer to the Work. Should any part of the License for any
reason be judged legally invalid or ineffective under applicable law, such
partial invalidity or ineffectiveness shall not invalidate the remainder
of the License, and in such case Affirmer hereby affirms that he or she
will not (i) exercise any of his or her remaining Copyright and Related
Rights in the Work or (ii) assert any associated claims and causes of
action with respect to the Work, in either case contrary to Affirmer's
express Statement of Purpose.
4. Limitations and Disclaimers.
a. No trademark or patent rights held by Affirmer are waived, abandoned,
surrendered, licensed or otherwise affected by this document.
b. Affirmer offers the Work as-is and makes no representations or
warranties of any kind concerning the Work, express, implied,
statutory or otherwise, including without limitation warranties of
title, merchantability, fitness for a particular purpose, non
infringement, or the absence of latent or other defects, accuracy, or
the present or absence of errors, whether or not discoverable, all to
the greatest extent permissible under applicable law.
c. Affirmer disclaims responsibility for clearing rights of other persons
that may apply to the Work or any use thereof, including without
limitation any person's Copyright and Related Rights in the Work.
Further, Affirmer disclaims responsibility for obtaining any necessary
consents, permissions or other rights required for any use of the
Work.
d. Affirmer understands and acknowledges that Creative Commons is not a
party to this document and has no duty or obligation with respect to
this CC0 or use of the Work.


@@ -0,0 +1,373 @@
Mozilla Public License Version 2.0
==================================
1. Definitions
--------------
1.1. "Contributor"
means each individual or legal entity that creates, contributes to
the creation of, or owns Covered Software.
1.2. "Contributor Version"
means the combination of the Contributions of others (if any) used
by a Contributor and that particular Contributor's Contribution.
1.3. "Contribution"
means Covered Software of a particular Contributor.
1.4. "Covered Software"
means Source Code Form to which the initial Contributor has attached
the notice in Exhibit A, the Executable Form of such Source Code
Form, and Modifications of such Source Code Form, in each case
including portions thereof.
1.5. "Incompatible With Secondary Licenses"
means
(a) that the initial Contributor has attached the notice described
in Exhibit B to the Covered Software; or
(b) that the Covered Software was made available under the terms of
version 1.1 or earlier of the License, but not also under the
terms of a Secondary License.
1.6. "Executable Form"
means any form of the work other than Source Code Form.
1.7. "Larger Work"
means a work that combines Covered Software with other material, in
a separate file or files, that is not Covered Software.
1.8. "License"
means this document.
1.9. "Licensable"
means having the right to grant, to the maximum extent possible,
whether at the time of the initial grant or subsequently, any and
all of the rights conveyed by this License.
1.10. "Modifications"
means any of the following:
(a) any file in Source Code Form that results from an addition to,
deletion from, or modification of the contents of Covered
Software; or
(b) any new file in Source Code Form that contains any Covered
Software.
1.11. "Patent Claims" of a Contributor
means any patent claim(s), including without limitation, method,
process, and apparatus claims, in any patent Licensable by such
Contributor that would be infringed, but for the grant of the
License, by the making, using, selling, offering for sale, having
made, import, or transfer of either its Contributions or its
Contributor Version.
1.12. "Secondary License"
means either the GNU General Public License, Version 2.0, the GNU
Lesser General Public License, Version 2.1, the GNU Affero General
Public License, Version 3.0, or any later versions of those
licenses.
1.13. "Source Code Form"
means the form of the work preferred for making modifications.
1.14. "You" (or "Your")
means an individual or a legal entity exercising rights under this
License. For legal entities, "You" includes any entity that
controls, is controlled by, or is under common control with You. For
purposes of this definition, "control" means (a) the power, direct
or indirect, to cause the direction or management of such entity,
whether by contract or otherwise, or (b) ownership of more than
fifty percent (50%) of the outstanding shares or beneficial
ownership of such entity.
2. License Grants and Conditions
--------------------------------
2.1. Grants
Each Contributor hereby grants You a world-wide, royalty-free,
non-exclusive license:
(a) under intellectual property rights (other than patent or trademark)
Licensable by such Contributor to use, reproduce, make available,
modify, display, perform, distribute, and otherwise exploit its
Contributions, either on an unmodified basis, with Modifications, or
as part of a Larger Work; and
(b) under Patent Claims of such Contributor to make, use, sell, offer
for sale, have made, import, and otherwise transfer either its
Contributions or its Contributor Version.
2.2. Effective Date
The licenses granted in Section 2.1 with respect to any Contribution
become effective for each Contribution on the date the Contributor first
distributes such Contribution.
2.3. Limitations on Grant Scope
The licenses granted in this Section 2 are the only rights granted under
this License. No additional rights or licenses will be implied from the
distribution or licensing of Covered Software under this License.
Notwithstanding Section 2.1(b) above, no patent license is granted by a
Contributor:
(a) for any code that a Contributor has removed from Covered Software;
or
(b) for infringements caused by: (i) Your and any other third party's
modifications of Covered Software, or (ii) the combination of its
Contributions with other software (except as part of its Contributor
Version); or
(c) under Patent Claims infringed by Covered Software in the absence of
its Contributions.
This License does not grant any rights in the trademarks, service marks,
or logos of any Contributor (except as may be necessary to comply with
the notice requirements in Section 3.4).
2.4. Subsequent Licenses
No Contributor makes additional grants as a result of Your choice to
distribute the Covered Software under a subsequent version of this
License (see Section 10.2) or under the terms of a Secondary License (if
permitted under the terms of Section 3.3).
2.5. Representation
Each Contributor represents that the Contributor believes its
Contributions are its original creation(s) or it has sufficient rights
to grant the rights to its Contributions conveyed by this License.
2.6. Fair Use
This License is not intended to limit any rights You have under
applicable copyright doctrines of fair use, fair dealing, or other
equivalents.
2.7. Conditions
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
in Section 2.1.
3. Responsibilities
-------------------
3.1. Distribution of Source Form
All distribution of Covered Software in Source Code Form, including any
Modifications that You create or to which You contribute, must be under
the terms of this License. You must inform recipients that the Source
Code Form of the Covered Software is governed by the terms of this
License, and how they can obtain a copy of this License. You may not
attempt to alter or restrict the recipients' rights in the Source Code
Form.
3.2. Distribution of Executable Form
If You distribute Covered Software in Executable Form then:
(a) such Covered Software must also be made available in Source Code
Form, as described in Section 3.1, and You must inform recipients of
the Executable Form how they can obtain a copy of such Source Code
Form by reasonable means in a timely manner, at a charge no more
than the cost of distribution to the recipient; and
(b) You may distribute such Executable Form under the terms of this
License, or sublicense it under different terms, provided that the
license for the Executable Form does not attempt to limit or alter
the recipients' rights in the Source Code Form under this License.
3.3. Distribution of a Larger Work
You may create and distribute a Larger Work under terms of Your choice,
provided that You also comply with the requirements of this License for
the Covered Software. If the Larger Work is a combination of Covered
Software with a work governed by one or more Secondary Licenses, and the
Covered Software is not Incompatible With Secondary Licenses, this
License permits You to additionally distribute such Covered Software
under the terms of such Secondary License(s), so that the recipient of
the Larger Work may, at their option, further distribute the Covered
Software under the terms of either this License or such Secondary
License(s).
3.4. Notices
You may not remove or alter the substance of any license notices
(including copyright notices, patent notices, disclaimers of warranty,
or limitations of liability) contained within the Source Code Form of
the Covered Software, except that You may alter any license notices to
the extent required to remedy known factual inaccuracies.
3.5. Application of Additional Terms
You may choose to offer, and to charge a fee for, warranty, support,
indemnity or liability obligations to one or more recipients of Covered
Software. However, You may do so only on Your own behalf, and not on
behalf of any Contributor. You must make it absolutely clear that any
such warranty, support, indemnity, or liability obligation is offered by
You alone, and You hereby agree to indemnify every Contributor for any
liability incurred by such Contributor as a result of warranty, support,
indemnity or liability terms You offer. You may include additional
disclaimers of warranty and limitations of liability specific to any
jurisdiction.
4. Inability to Comply Due to Statute or Regulation
---------------------------------------------------
If it is impossible for You to comply with any of the terms of this
License with respect to some or all of the Covered Software due to
statute, judicial order, or regulation then You must: (a) comply with
the terms of this License to the maximum extent possible; and (b)
describe the limitations and the code they affect. Such description must
be placed in a text file included with all distributions of the Covered
Software under this License. Except to the extent prohibited by statute
or regulation, such description must be sufficiently detailed for a
recipient of ordinary skill to be able to understand it.
5. Termination
--------------
5.1. The rights granted under this License will terminate automatically
if You fail to comply with any of its terms. However, if You become
compliant, then the rights granted under this License from a particular
Contributor are reinstated (a) provisionally, unless and until such
Contributor explicitly and finally terminates Your grants, and (b) on an
ongoing basis, if such Contributor fails to notify You of the
non-compliance by some reasonable means prior to 60 days after You have
come back into compliance. Moreover, Your grants from a particular
Contributor are reinstated on an ongoing basis if such Contributor
notifies You of the non-compliance by some reasonable means, this is the
first time You have received notice of non-compliance with this License
from such Contributor, and You become compliant prior to 30 days after
Your receipt of the notice.
5.2. If You initiate litigation against any entity by asserting a patent
infringement claim (excluding declaratory judgment actions,
counter-claims, and cross-claims) alleging that a Contributor Version
directly or indirectly infringes any patent, then the rights granted to
You by any and all Contributors for the Covered Software under Section
2.1 of this License shall terminate.
5.3. In the event of termination under Sections 5.1 or 5.2 above, all
end user license agreements (excluding distributors and resellers) which
have been validly granted by You or Your distributors under this License
prior to termination shall survive termination.
************************************************************************
* *
* 6. Disclaimer of Warranty *
* ------------------------- *
* *
* Covered Software is provided under this License on an "as is" *
* basis, without warranty of any kind, either expressed, implied, or *
* statutory, including, without limitation, warranties that the *
* Covered Software is free of defects, merchantable, fit for a *
* particular purpose or non-infringing. The entire risk as to the *
* quality and performance of the Covered Software is with You. *
* Should any Covered Software prove defective in any respect, You *
* (not any Contributor) assume the cost of any necessary servicing, *
* repair, or correction. This disclaimer of warranty constitutes an *
* essential part of this License. No use of any Covered Software is *
* authorized under this License except under this disclaimer. *
* *
************************************************************************
************************************************************************
* *
* 7. Limitation of Liability *
* -------------------------- *
* *
* Under no circumstances and under no legal theory, whether tort *
* (including negligence), contract, or otherwise, shall any *
* Contributor, or anyone who distributes Covered Software as *
* permitted above, be liable to You for any direct, indirect, *
* special, incidental, or consequential damages of any character *
* including, without limitation, damages for lost profits, loss of *
* goodwill, work stoppage, computer failure or malfunction, or any *
* and all other commercial damages or losses, even if such party *
* shall have been informed of the possibility of such damages. This *
* limitation of liability shall not apply to liability for death or *
* personal injury resulting from such party's negligence to the *
* extent applicable law prohibits such limitation. Some *
* jurisdictions do not allow the exclusion or limitation of *
* incidental or consequential damages, so this exclusion and *
* limitation may not apply to You. *
* *
************************************************************************
8. Litigation
-------------
Any litigation relating to this License may be brought only in the
courts of a jurisdiction where the defendant maintains its principal
place of business and such litigation shall be governed by laws of that
jurisdiction, without reference to its conflict-of-law provisions.
Nothing in this Section shall prevent a party's ability to bring
cross-claims or counter-claims.
9. Miscellaneous
----------------
This License represents the complete agreement concerning the subject
matter hereof. If any provision of this License is held to be
unenforceable, such provision shall be reformed only to the extent
necessary to make it enforceable. Any law or regulation which provides
that the language of a contract shall be construed against the drafter
shall not be used to construe this License against a Contributor.
10. Versions of the License
---------------------------
10.1. New Versions
Mozilla Foundation is the license steward. Except as provided in Section
10.3, no one other than the license steward has the right to modify or
publish new versions of this License. Each version will be given a
distinguishing version number.
10.2. Effect of New Versions
You may distribute the Covered Software under the terms of the version
of the License under which You originally received the Covered Software,
or under the terms of any subsequent version published by the license
steward.
10.3. Modified Versions
If you create software not governed by this License, and you want to
create a new license for such software, you may create and use a
modified version of this License if you rename the license and remove
any references to the name of the license steward (except to note that
such modified license differs from this License).
10.4. Distributing Source Code Form that is Incompatible With Secondary
Licenses
If You choose to distribute Source Code Form that is Incompatible With
Secondary Licenses under the terms of this version of the License, the
notice described in Exhibit B of this License must be attached.
Exhibit A - Source Code Form License Notice
-------------------------------------------
This Source Code Form is subject to the terms of the Mozilla Public
License, v. 2.0. If a copy of the MPL was not distributed with this
file, You can obtain one at https://mozilla.org/MPL/2.0/.
If it is not possible or desirable to put the notice in a particular
file, then You may include the notice in a location (such as a LICENSE
file in a relevant directory) where a recipient would be likely to look
for such a notice.
You may add additional accurate notices of copyright ownership.
Exhibit B - "Incompatible With Secondary Licenses" Notice
---------------------------------------------------------
This Source Code Form is "Incompatible With Secondary Licenses", as
defined by the Mozilla Public License, v. 2.0.

154
contrib/refscan/src/lib.rs Normal file
View file

@ -0,0 +1,154 @@
// SPDX-FileCopyrightText: edef <edef@edef.eu>
// SPDX-License-Identifier: MPL-2.0
use self::simd::u8x32;
fn prefilter(haystack: u8x32) -> u32 {
let alp = haystack.gt(u8x32::splat(b'a' - 1)) & haystack.lt(u8x32::splat(b'z' + 1));
let num = haystack.gt(u8x32::splat(b'0' - 1)) & haystack.lt(u8x32::splat(b'9' + 1));
alp | num
}
/// scan_clean scans `buffer` (truncated to a multiple of 32 bytes) for potential pointers.
/// It returns `Err(&buffer[..n])`, a prefix of known pointer-free data, as soon as a
/// potential pointer is detected, or `Ok(buffer)` if the entire truncated buffer is pointer-free.
pub fn scan_clean(buffer: &[u8]) -> Result<&[u8], &[u8]> {
let buffer = {
let n = buffer.len() & !31;
&buffer[..n]
};
let mut masks = buffer
.chunks_exact(32)
.map(|chunk| prefilter(u8x32::from_slice_unaligned(chunk)))
.enumerate()
.map(|e| (e.0 * 32, e.1))
.peekable();
while let Some((offset, mask)) = masks.next() {
let peek = masks.peek().map(|x| x.1).unwrap_or(!0 >> 1);
let n = (!mask).leading_zeros() + (!peek).trailing_zeros();
if n >= 32 {
let offset = offset + mask.trailing_zeros() as usize;
return Err(&buffer[..offset]);
}
}
Ok(buffer)
}
#[cfg(test)]
mod test {
#[test]
fn scan_tail() {
let buffer = b"_xfbmj7sl2ikicym9x3yq7cms5qx1w39k";
assert_eq!(crate::scan_clean(buffer), Err(&buffer[..1]));
}
#[test]
fn scan_straddle() {
let buffer = b"________________xfbmj7sl2ikicym9x3yq7cms5qx1w39k________________";
assert_eq!(crate::scan_clean(buffer), Err(&buffer[..16]));
}
#[test]
fn scan_clean() {
let buffer = b"x_______________xfbmj7sl2ikicym9x3yq-cms5qx1w3-k________________";
assert_eq!(crate::scan_clean(buffer), Ok(&buffer[..]));
}
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod simd {
#[cfg(target_arch = "x86")]
use std::arch::x86 as arch;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64 as arch;
use {
arch::{__m256i, _mm256_cmpgt_epi8, _mm256_movemask_epi8, _mm256_set1_epi8},
std::ptr,
};
#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub struct u8x32(__m256i);
impl u8x32 {
#[inline(always)]
pub fn from_slice_unaligned(slice: &[u8]) -> Self {
assert_eq!(slice.len(), 32);
u8x32(unsafe { ptr::read_unaligned(slice.as_ptr().cast()) })
}
#[inline(always)]
pub fn splat(x: u8) -> Self {
u8x32(unsafe { _mm256_set1_epi8(x as i8) })
}
#[inline(always)]
pub fn gt(self, b: Self) -> u32 {
unsafe { _mm256_movemask_epi8(_mm256_cmpgt_epi8(self.0, b.0)) as u32 }
}
#[inline(always)]
pub fn lt(self, b: Self) -> u32 {
b.gt(self)
}
}
}
#[cfg(target_arch = "aarch64")]
mod simd {
use std::{
arch::aarch64::{
uint8x16_t as u8x16, vaddv_u8, vandq_u8, vcgtq_u8, vdupq_n_u8, vget_high_u8,
vget_low_u8, vshlq_u8,
},
mem, ptr,
};
#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
#[repr(transparent)]
pub struct u8x32([u8x16; 2]);
impl u8x32 {
#[cfg(target_endian = "little")]
#[inline(always)]
pub fn from_slice_unaligned(slice: &[u8]) -> Self {
assert_eq!(slice.len(), 32);
u8x32(unsafe { ptr::read_unaligned(slice.as_ptr().cast()) })
}
#[inline(always)]
pub fn splat(x: u8) -> Self {
u8x32(unsafe {
let x = vdupq_n_u8(x);
[x, x]
})
}
#[inline(always)]
pub fn gt(&self, b: Self) -> u32 {
let u8x32([al, ah]) = *self;
let u8x32([bl, bh]) = b;
fn f(a: u8x16, b: u8x16) -> u32 {
unsafe {
let c = vshlq_u8(
vandq_u8(vdupq_n_u8(0x80), vcgtq_u8(a, b)),
mem::transmute([
-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0i8,
]),
);
(vaddv_u8(vget_low_u8(c)) as u32) << 0 | (vaddv_u8(vget_high_u8(c)) as u32) << 8
}
}
f(al, bl) << 0 | f(ah, bh) << 16
}
#[inline(always)]
pub fn lt(self, b: Self) -> u32 {
b.gt(self)
}
}
}
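For reference, the bitmask the SIMD prefilter computes can be written as a plain scalar loop. This is an illustrative sketch only (`prefilter_scalar` is not part of the crate): bit i of the result is set exactly when byte i of the 32-byte chunk is an ASCII lowercase letter or digit, matching the AVX2 and NEON paths above for ASCII input.
// Scalar reference for the SIMD prefilter (sketch, not part of the crate).
fn prefilter_scalar(chunk: &[u8; 32]) -> u32 {
    let mut mask = 0u32;
    for (i, &b) in chunk.iter().enumerate() {
        if b.is_ascii_lowercase() || b.is_ascii_digit() {
            mask |= 1 << i;
        }
    }
    mask
}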

View file

@ -0,0 +1,58 @@
// SPDX-FileCopyrightText: edef <edef@edef.eu>
// SPDX-License-Identifier: MPL-2.0
use std::{
collections::BTreeSet as Set,
convert::TryInto,
io::{self, Read},
str,
};
fn main() {
let max_refs: Set<[u8; 32]> = include_str!("../testdata/maxrefs")
.lines()
.map(|l| l.as_bytes().try_into().unwrap())
.collect();
let input = {
let stdin = io::stdin();
let mut buffer = Vec::new();
stdin.lock().read_to_end(&mut buffer).unwrap();
buffer
};
let base = input.as_ptr() as usize;
let mut input: &[u8] = &input;
while input.len() >= 32 {
match refscan::scan_clean(&input) {
Ok(buffer) | Err(buffer) => {
let n = buffer.len();
input = &input[n..];
}
}
let buffer = {
let idx = input.iter().position(|x| match x {
b'a'..=b'z' | b'0'..=b'9' => false,
_ => true,
});
idx.map(|idx| &input[..idx]).unwrap_or(input)
};
for chunk in buffer.windows(32) {
let offset = (chunk.as_ptr() as usize) - base;
let chunk = {
let mut fixed = [0u8; 32];
fixed.copy_from_slice(chunk);
fixed
};
if max_refs.contains(&chunk) {
let seen = unsafe { str::from_utf8_unchecked(&chunk) };
println!("{} {}", seen, offset);
}
}
let n = buffer.len();
input = &input[n..];
}
}

6
contrib/refscan/testdata/.gitignore vendored Normal file
View file

@ -0,0 +1,6 @@
# SPDX-FileCopyrightText: edef <edef@edef.eu>
# SPDX-License-Identifier: CC0-1.0
/maxrefs
/nar
/result

8
contrib/refscan/testdata/generate.sh vendored Executable file
View file

@ -0,0 +1,8 @@
#! /usr/bin/env bash
# SPDX-FileCopyrightText: edef <edef@edef.eu>
# SPDX-License-Identifier: CC0-1.0
set -euo pipefail
drv=$(nix-instantiate '<nixpkgs>' -A ghc)
nix --extra-experimental-features nix-command show-derivation -r "$drv" | jq -r '.[] | .outputs[].path, .inputSrcs[]' | sort -u | cut -d/ -f4 | cut -d- -f1 > maxrefs
nix-store --dump "$(nix-build "$drv")" > nar

1779
contrib/turbofetch/Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

7145
contrib/turbofetch/Cargo.nix Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,28 @@
[package]
name = "turbofetch"
version = "0.1.0"
edition = "2021"
# We don't join the //snix workspace, as this is fairly cache.nixos.org-specific.
[workspace]
members = ["."]
[dependencies]
aws_lambda_events = { version = "0.11.1", default-features = false, features = ["lambda_function_urls"] }
bytes = "1.5.0"
data-encoding = "2.6.0"
futures = { version = "0.3.30", default-features = false, features = ["std"] }
httparse = "1.8.0"
hyper = { version = "0.14.27", default-features = false }
lambda_runtime = "0.8.2"
magic-buffer = "0.1.1"
rusoto_core = { version = "0.48.0", features = ["rustls"], default-features = false }
rusoto_s3 = { version = "0.48.0", features = ["rustls"], default-features = false }
serde_json = "1.0.108"
serde = { version = "1.0.190", features = ["derive"] }
tokio = { version = "1.33.0", features = ["full"] }
tower = "0.4.13"
# TODO(edef): zstd = "0.13.0"
zstd = "0.9.0"
tracing-subscriber = { version = "0.3.17", features = ["json"] }
tracing = "0.1.40"

View file

@ -0,0 +1 @@
edef

View file

@ -0,0 +1,11 @@
{ pkgs, depot, ... }:
(pkgs.callPackage ./Cargo.nix {
defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
turbofetch = prev: {
src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
};
};
}).rootCrate.build.overrideAttrs {
meta.ci.extraSteps.crate2nix-check = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
}

5
contrib/turbofetch/deploy.sh Executable file
View file

@ -0,0 +1,5 @@
#! /usr/bin/env nix-shell
#! nix-shell -i "bash -e"
#! nix-shell -p cargo-lambda
cargo lambda build --release
cargo lambda deploy

View file

@ -0,0 +1,83 @@
use magic_buffer::MagicBuffer;
use std::cell::Cell;
/// Buffer is a FIFO queue for bytes, built on a ring buffer.
/// It always provides contiguous slices for both the readable and writable parts,
/// using an underlying buffer that is "mirrored" in virtual memory.
pub struct Buffer {
buffer: MagicBuffer,
/// first readable byte
head: Cell<usize>,
/// first writable byte
tail: usize,
}
impl Buffer {
/// Allocate a fresh buffer, with the specified capacity.
/// The buffer can contain at most `capacity - 1` bytes.
/// The capacity must be a power of two, and at least [Buffer::min_capacity].
pub fn new(capacity: usize) -> Buffer {
Buffer {
// MagicBuffer::new verifies that `capacity` is a power of two,
// and at least MagicBuffer::min_len().
buffer: MagicBuffer::new(capacity).unwrap(),
// `head == tail` means the buffer is empty.
// In order to ensure that this remains unambiguous,
// the buffer can only be filled with capacity-1 bytes.
head: Cell::new(0),
tail: 0,
}
}
/// Returns the minimum buffer capacity.
/// This depends on the operating system and architecture.
pub fn min_capacity() -> usize {
MagicBuffer::min_len()
}
/// Return the capacity of the buffer.
/// This is equal to `self.data().len() + self.space().len() + 1`.
pub fn capacity(&self) -> usize {
self.buffer.len()
}
/// Return the valid, readable data in the buffer.
pub fn data(&self) -> &[u8] {
let len = self.buffer.len();
let head = self.head.get();
if head <= self.tail {
&self.buffer[head..self.tail]
} else {
&self.buffer[head..self.tail + len]
}
}
/// Mark `read_len` bytes of the readable data as consumed, freeing the space.
pub fn consume(&self, read_len: usize) {
debug_assert!(read_len <= self.data().len());
let mut head = self.head.get();
head += read_len;
head &= self.buffer.len() - 1;
self.head.set(head);
}
/// Return the empty, writable space in the buffer.
pub fn space(&mut self) -> &mut [u8] {
let len = self.buffer.len();
let head = self.head.get();
if head <= self.tail {
&mut self.buffer[self.tail..head + len - 1]
} else {
&mut self.buffer[self.tail..head - 1]
}
}
/// Mark `written_len` bytes of the writable space as valid, readable data.
pub fn commit(&mut self, written_len: usize) {
debug_assert!(written_len <= self.space().len());
self.tail += written_len;
self.tail &= self.buffer.len() - 1;
}
}
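To make the commit/consume cycle concrete, here is a minimal usage sketch; `roundtrip` is a hypothetical helper, not part of the file, and assumes the crate is linked as `turbofetch`.
use turbofetch::Buffer;
// Stage bytes in the writable slice, commit them, then read and consume them.
fn roundtrip(buf: &mut Buffer) {
    let msg: &[u8] = b"hello";
    buf.space()[..msg.len()].copy_from_slice(msg); // write into the free space
    buf.commit(msg.len());                         // mark those bytes readable
    assert_eq!(buf.data(), msg);                   // contiguous readable view
    buf.consume(msg.len());                        // release them again
    assert!(buf.data().is_empty());
}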

View file

@ -0,0 +1,103 @@
use std::{mem::MaybeUninit, str};
use tokio::io::{self, AsyncRead, AsyncReadExt};
pub use buffer::Buffer;
mod buffer;
/// Read as much data into `buffer` as possible.
/// Returns [io::ErrorKind::OutOfMemory] if the buffer is already full.
async fn slurp(buffer: &mut Buffer, sock: &mut (impl AsyncRead + Unpin)) -> io::Result<()> {
match buffer.space() {
[] => Err(io::Error::new(io::ErrorKind::OutOfMemory, "buffer filled")),
buf => {
let n = sock.read(buf).await?;
if n == 0 {
return Err(io::ErrorKind::UnexpectedEof.into());
}
buffer.commit(n);
Ok(())
}
}
}
fn get_content_length(headers: &[httparse::Header]) -> io::Result<u64> {
for header in headers {
if header.name == "Transfer-Encoding" {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"Transfer-Encoding is unsupported",
));
}
if header.name == "Content-Length" {
return str::from_utf8(header.value)
.ok()
.and_then(|v| v.parse().ok())
.ok_or_else(|| {
io::Error::new(io::ErrorKind::InvalidData, "invalid Content-Length")
});
}
}
Err(io::Error::new(
io::ErrorKind::InvalidData,
"Content-Length missing",
))
}
/// Read an HTTP response from `sock` using `buffer`, returning the response body.
/// Returns an error if anything but 200 OK is received.
///
/// The buffer must have enough space to contain the entire response body.
/// If there is not enough space, [io::ErrorKind::OutOfMemory] is returned.
///
/// The HTTP response must use `Content-Length`, without `Transfer-Encoding`.
pub async fn parse_response<'a>(
sock: &mut (impl AsyncRead + Unpin),
buffer: &'a mut Buffer,
) -> io::Result<&'a [u8]> {
let body_len = loop {
let mut headers = [MaybeUninit::uninit(); 16];
let mut response = httparse::Response::new(&mut []);
let status = httparse::ParserConfig::default()
.parse_response_with_uninit_headers(&mut response, buffer.data(), &mut headers)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
if let httparse::Status::Complete(n) = status {
buffer.consume(n);
let code = response.code.unwrap();
if code != 200 {
return Err(io::Error::new(
io::ErrorKind::Other,
format!("HTTP response {code}"),
));
}
break get_content_length(response.headers)?;
}
slurp(buffer, sock).await?;
};
let buf_len = buffer.space().len() + buffer.data().len();
if body_len > buf_len as u64 {
return Err(io::Error::new(
io::ErrorKind::OutOfMemory,
"HTTP response body does not fit in buffer",
));
}
let body_len = body_len as usize;
while buffer.data().len() < body_len {
slurp(buffer, sock).await?;
}
let data = buffer.data();
buffer.consume(body_len);
Ok(&data[..body_len])
}
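A quick way to see the parser's contract is to run it against a canned HTTP/1.1 response held in memory; this is a hypothetical check, relying only on the public items above and on tokio's AsyncRead impl for byte slices.
// Hypothetical standalone check of parse_response (not part of the crate).
#[tokio::main]
async fn main() -> std::io::Result<()> {
    let mut sock: &[u8] = b"HTTP/1.1 200 OK\r\nContent-Length: 5\r\n\r\nhello";
    let mut buffer = turbofetch::Buffer::new(turbofetch::Buffer::min_capacity());
    let body = turbofetch::parse_response(&mut sock, &mut buffer).await?;
    assert_eq!(body, &b"hello"[..]);
    Ok(())
}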

View file

@ -0,0 +1,220 @@
//! turbofetch is a high-performance bulk S3 object aggregator.
//!
//! It operates on two S3 buckets: a source bucket (nix-cache), and a
//! work bucket defined at runtime. The work bucket contains a job file
//! consisting of concatenated 32-character keys, representing narinfo
//! files in the source bucket, without the `.narinfo` suffix or any
//! other separators.
//!
//! Each run of turbofetch processes a half-open range of indices from the
//! job file, and outputs a zstd stream of concatenated objects, without
//! additional separators and in no particular order. These segment files
//! are written into the work bucket, named for the range of indices they
//! cover. `/narinfo.zst/000000000c380d40-000000000c385b60` covers the 20k
//! objects `[0xc380d40, 0xc385b60) = [205000000, 205020000)`. Empirically,
//! segment files of 20k objects achieve a compression ratio of 4.7x.
//!
//! Reassembly is left to narinfo2parquet, which interprets StorePath lines.
//!
//! TODO(edef): any retries/error handling whatsoever
//! Currently, it fails an entire range if anything goes wrong, and doesn't
//! write any output.
use bytes::Bytes;
use futures::{stream::FuturesUnordered, Stream, TryStreamExt};
use rusoto_core::ByteStream;
use rusoto_s3::{GetObjectRequest, PutObjectRequest, S3Client, S3};
use serde::Deserialize;
use std::{io::Write, mem, ops::Range, ptr};
use tokio::{
io::{self, AsyncReadExt, AsyncWriteExt},
net::TcpStream,
};
/// Fetch a group of keys, streaming concatenated chunks as they arrive from S3.
/// `keys` must be a slice from the job file. Any network error at all fails the
/// entire batch, and there is no rate limiting.
fn fetch(keys: &[[u8; 32]]) -> impl Stream<Item = io::Result<Bytes>> {
// S3 supports only HTTP/1.1, but we can ease the pain somewhat by using
// HTTP pipelining. It terminates the TCP connection after receiving 100
// requests, so we chunk the keys up accordingly, and make one connection
// for each chunk.
keys.chunks(100)
.map(|chunk| {
const PREFIX: &[u8] = b"GET /nix-cache/";
const SUFFIX: &[u8] = b".narinfo HTTP/1.1\nHost: s3.amazonaws.com\n\n";
const LENGTH: usize = PREFIX.len() + 32 + SUFFIX.len();
let mut request = Vec::with_capacity(LENGTH * 100);
for key in chunk {
request.extend_from_slice(PREFIX);
request.extend_from_slice(key);
request.extend_from_slice(SUFFIX);
}
(request, chunk.len())
})
.map(|(request, n)| async move {
let (mut read, mut write) = TcpStream::connect("s3.amazonaws.com:80")
.await?
.into_split();
let _handle = tokio::spawn(async move {
let request = request;
write.write_all(&request).await
});
let mut buffer = turbofetch::Buffer::new(512 * 1024);
let mut bodies = vec![];
for _ in 0..n {
let body = turbofetch::parse_response(&mut read, &mut buffer).await?;
bodies.extend_from_slice(body);
}
Ok::<_, io::Error>(Bytes::from(bodies))
})
.collect::<FuturesUnordered<_>>()
}
/// Retrieve a range of keys from the job file.
async fn get_range(
s3: &'static S3Client,
bucket: String,
key: String,
range: Range<u64>,
) -> io::Result<Box<[[u8; 32]]>> {
let resp = s3
.get_object(GetObjectRequest {
bucket,
key,
range: Some(format!("bytes={}-{}", range.start * 32, range.end * 32 - 1)),
..GetObjectRequest::default()
})
.await
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
let mut body = vec![];
resp.body
.ok_or(io::ErrorKind::InvalidData)?
.into_async_read()
.read_to_end(&mut body)
.await?;
let body = exact_chunks(body.into_boxed_slice()).ok_or(io::ErrorKind::InvalidData)?;
Ok(body)
}
fn exact_chunks(mut buf: Box<[u8]>) -> Option<Box<[[u8; 32]]>> {
// SAFETY: We ensure that `buf.len()` is a multiple of 32, and there are no alignment requirements.
unsafe {
let ptr = buf.as_mut_ptr();
let len = buf.len();
if len % 32 != 0 {
return None;
}
let ptr = ptr as *mut [u8; 32];
let len = len / 32;
mem::forget(buf);
Some(Box::from_raw(ptr::slice_from_raw_parts_mut(ptr, len)))
}
}
// TODO(edef): factor this out into a separate entry point
#[tokio::main(flavor = "current_thread")]
async fn main() -> Result<(), lambda_runtime::Error> {
let s3 = S3Client::new(rusoto_core::Region::UsEast1);
let s3 = &*Box::leak(Box::new(s3));
tracing_subscriber::fmt()
.json()
.with_max_level(tracing::Level::INFO)
// this needs to be set to remove duplicated information in the log.
.with_current_span(false)
// this needs to be set to false, otherwise ANSI color codes will
// show up in a confusing manner in CloudWatch logs.
.with_ansi(false)
// disabling time is handy because CloudWatch will add the ingestion time.
.without_time()
// remove the name of the function from every log entry
.with_target(false)
.init();
lambda_runtime::run(lambda_runtime::service_fn(|event| func(s3, event))).await
}
/// Lambda request body
#[derive(Debug, Deserialize)]
struct Params {
work_bucket: String,
job_file: String,
start: u64,
end: u64,
}
#[tracing::instrument(skip(s3, event), fields(req_id = %event.context.request_id))]
async fn func(
s3: &'static S3Client,
event: lambda_runtime::LambdaEvent<
aws_lambda_events::lambda_function_urls::LambdaFunctionUrlRequest,
>,
) -> Result<&'static str, lambda_runtime::Error> {
let mut params = event.payload.body.ok_or("no body")?;
if event.payload.is_base64_encoded {
params = String::from_utf8(data_encoding::BASE64.decode(params.as_bytes())?)?;
}
let params: Params = serde_json::from_str(&params)?;
if params.start >= params.end {
return Err("nope".into());
}
let keys = get_range(
s3,
params.work_bucket.clone(),
params.job_file.to_owned(),
params.start..params.end,
)
.await?;
let zchunks = fetch(&keys)
.try_fold(
Box::new(zstd::Encoder::new(vec![], zstd::DEFAULT_COMPRESSION_LEVEL).unwrap()),
|mut w, buf| {
w.write_all(&buf).unwrap();
async { Ok(w) }
},
)
.await?;
let zchunks = to_byte_stream(zchunks.finish().unwrap());
tracing::info!("we got to put_object");
s3.put_object(PutObjectRequest {
bucket: params.work_bucket,
key: format!("narinfo.zst/{:016x}-{:016x}", params.start, params.end),
body: Some(zchunks),
..Default::default()
})
.await
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
tracing::info!("… and it worked!");
Ok("OK")
}
fn to_byte_stream(buffer: Vec<u8>) -> ByteStream {
let size_hint = buffer.len();
ByteStream::new_with_size(
futures::stream::once(async { Ok(buffer.into()) }),
size_hint,
)
}
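The segment naming described in the module comment can be checked mechanically. This small standalone sketch uses a hypothetical `segment_key` helper that mirrors the `format!` call in `func` above, and reproduces the documented example range.
// Sketch: reproduce the segment naming from the module comment (hypothetical helper).
fn segment_key(range: std::ops::Range<u64>) -> String {
    format!("narinfo.zst/{:016x}-{:016x}", range.start, range.end)
}
fn main() {
    // 0xc380d40..0xc385b60 is the documented 20k-object example range.
    assert_eq!(0xc380d40u64, 205_000_000);
    assert_eq!(0xc385b60u64, 205_020_000);
    assert_eq!(
        segment_key(205_000_000..205_020_000),
        "narinfo.zst/000000000c380d40-000000000c385b60"
    );
}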

2511
contrib/weave/Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

9641
contrib/weave/Cargo.nix Normal file

File diff suppressed because it is too large Load diff

23
contrib/weave/Cargo.toml Normal file
View file

@ -0,0 +1,23 @@
[package]
name = "weave"
version = "0.1.0"
edition = "2021"
[workspace]
members = ["."]
# TODO(edef): cut down on required features, this is kind of a grab bag right now
[dependencies]
anyhow = { version = "1.0.79", features = ["backtrace"] }
hashbrown = "0.14.3"
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
safer_owning_ref = "0.5.0"
rayon = "1.8.1"
rustc-hash = "2.0.0"
snix-tracing = { version = "0.1.0", path = "../../snix/tracing" }
tracing = "0.1.40"
tracing-indicatif = "0.3.6"
[dependencies.polars]
version = "0.36.2"
features = ["parquet", "lazy", "streaming"]

1
contrib/weave/OWNERS Normal file
View file

@ -0,0 +1 @@
edef

11
contrib/weave/default.nix Normal file
View file

@ -0,0 +1,11 @@
{ pkgs, depot, ... }:
(pkgs.callPackage ./Cargo.nix {
defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
weave = prev: {
src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
};
};
}).rootCrate.build.overrideAttrs {
meta.ci.extraSteps.crate2nix-check = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
}

View file

@ -0,0 +1,118 @@
//! Swizzle reads a `narinfo.parquet` file, usually produced by `narinfo2parquet`.
//!
//! It swizzles the reference list, ie it converts the references from absolute,
//! global identifiers (store path hashes) to indices into the `store_path_hash`
//! column (ie, row numbers), so that we can later walk the reference graph
//! efficiently.
//!
//! Path hashes are represented as non-null, 20-byte `Binary` values.
//! The indices are represented as 32-bit unsigned integers, with in-band nulls
//! represented by [INDEX_NULL] (the all-1 bit pattern), to permit swizzling
//! partial datasets.
//!
//! In essence, it converts from names to pointers, so that `weave` can simply
//! chase pointers to trace the live set. This replaces an `O(log(n))` lookup
//! with `O(1)` indexing, and produces a much denser representation that actually
//! fits in memory.
//!
//! The in-memory representation is at least 80% smaller, and the indices compress
//! well in Parquet due to both temporal locality of reference and the power law
//! distribution of reference "popularity".
//!
//! Only two columns are read from `narinfo.parquet`:
//!
//! * `store_path_hash :: PathHash`
//! * `references :: List[PathHash]`
//!
//! Output is written to `narinfo-references.parquet` in the form of a single
//! `List[u32]` column, `reference_idxs`.
//!
//! This file is inherently bound to the corresponding `narinfo.parquet`,
//! since it essentially contains pointers into that file.
use anyhow::Result;
use hashbrown::HashTable;
use polars::{
lazy::dsl::{col, SpecialEq},
prelude::*,
};
use tracing::info_span;
use tracing_indicatif::span_ext::IndicatifSpanExt as _;
use weave::{as_fixed_binary, hash64, leak, load_ph_array, INDEX_NULL};
#[tracing::instrument]
fn main() -> Result<()> {
let _tracing = snix_tracing::TracingBuilder::default()
.enable_progressbar()
.build()?;
let ph_array: &'static [[u8; 20]] = leak(load_ph_array()?);
// TODO(edef): re-parallelise this
// We originally parallelised on chunks, but ph_array is only a single chunk, due to how Parquet loading works.
// TODO(edef): outline the 64-bit hash prefix? it's an indirection, but it saves ~2G of memory
let ph_map: &'static HashTable<(u64, u32)> = {
let span = info_span!("ph_map", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("build index");
span.pb_start();
let mut ph_map = HashTable::with_capacity(ph_array.len());
for (offset, item) in ph_array.iter().enumerate() {
let offset = offset as u32;
let hash = hash64(item);
ph_map.insert_unique(hash, (hash, offset), |&(hash, _)| hash);
}
&*Box::leak(Box::new(ph_map))
};
let ph_to_idx = |key: &[u8; 20]| -> u32 {
let hash = hash64(key);
ph_map
.find(hash, |&(candidate_hash, candidate_index)| {
candidate_hash == hash && &ph_array[candidate_index as usize] == key
})
.map(|&(_, index)| index)
.unwrap_or(INDEX_NULL)
};
{
let span = info_span!("swizzle_refs", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("swizzle references");
span.pb_start();
LazyFrame::scan_parquet("narinfo.parquet", ScanArgsParquet::default())?
.with_column(
col("references")
.map(
move |series: Series| -> PolarsResult<Option<Series>> {
Ok(Some(
series
.list()?
.apply_to_inner(&|series: Series| -> PolarsResult<Series> {
let series = series.binary()?;
let mut out: Vec<u32> = Vec::with_capacity(series.len());
out.extend(
as_fixed_binary(series).flatten().map(ph_to_idx),
);
Ok(Series::from_vec("reference_idxs", out))
})?
.into_series(),
))
},
SpecialEq::from_type(DataType::List(DataType::UInt32.into())),
)
.alias("reference_idxs"),
)
.select([col("reference_idxs")])
.with_streaming(true)
.sink_parquet(
"narinfo-references.parquet".into(),
ParquetWriteOptions::default(),
)?;
};
Ok(())
}
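Stripped of Polars and the hash64-keyed table, the swizzle step is just a name-to-row-number lookup. The following conceptual sketch (a hypothetical `swizzle` function, sequential and memory-naive compared to the pipeline above) shows the same transformation on plain vectors.
use std::collections::HashMap;
const INDEX_NULL: u32 = !0;
// Map each 20-byte store path hash to its row number, then translate every
// reference list into an index list; unknown references become INDEX_NULL.
fn swizzle(rows: &[[u8; 20]], references: &[Vec<[u8; 20]>]) -> Vec<Vec<u32>> {
    let index: HashMap<&[u8; 20], u32> = rows
        .iter()
        .enumerate()
        .map(|(i, h)| (h, i as u32))
        .collect();
    references
        .iter()
        .map(|refs| {
            refs.iter()
                .map(|h| index.get(h).copied().unwrap_or(INDEX_NULL))
                .collect()
        })
        .collect()
}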

133
contrib/weave/src/lib.rs Normal file
View file

@ -0,0 +1,133 @@
use anyhow::Result;
use owning_ref::{ArcRef, OwningRef};
use rayon::prelude::*;
use std::{
fs::File,
mem,
ops::{Deref, Range},
slice,
sync::Arc,
};
use tracing_indicatif::span_ext::IndicatifSpanExt as _;
use polars::{
datatypes::BinaryChunked,
export::arrow::array::BinaryArray,
prelude::{ParquetReader, SerReader},
};
/// A shared `[[u8; N]]` backed by a Polars [Buffer].
pub type FixedBytes<const N: usize> =
ArcRef<'static, polars::export::arrow::buffer::Bytes<u8>, [[u8; N]]>;
pub const INDEX_NULL: u32 = !0;
/// A terrific hash function, turning 20 bytes of cryptographic hash
/// into 8 bytes of cryptographic hash.
pub fn hash64(h: &[u8; 20]) -> u64 {
let mut buf = [0; 8];
buf.copy_from_slice(&h[..8]);
u64::from_ne_bytes(buf)
}
pub fn leak<O, T: ?Sized>(r: OwningRef<Arc<O>, T>) -> &T {
// SAFETY: Either `ptr` points into the `Arc`, which lives until `r` is dropped,
// or it points at something else entirely which lives at least as long.
unsafe {
let ptr: *const T = r.deref();
mem::forget(r);
&*ptr
}
}
/// Read a dense `store_path_hash` array from `narinfo.parquet`,
/// returning it as an owned [FixedBytes].
#[tracing::instrument(fields(indicatif.pb_show = tracing::field::Empty))]
pub fn load_ph_array() -> Result<FixedBytes<20>> {
let span = tracing::Span::current();
span.pb_set_message("load store_path_hash");
span.pb_start();
// TODO(edef): this could use a further pushdown, since polars is more hindrance than help here
// We know this has to fit in memory (we can't mmap it without further encoding constraints),
// and we want a single `Vec<[u8; 20]>` of the data.
let ph_array = into_fixed_binary_rechunk::<20>(
ParquetReader::new(File::open("narinfo.parquet").unwrap())
.with_columns(Some(vec!["store_path_hash".into()]))
.set_rechunk(true)
.finish()?
.column("store_path_hash")?
.binary()?,
);
u32::try_from(ph_array.len()).expect("dataset exceeds 2^32");
Ok(ph_array)
}
/// Iterator over `&[[u8; N]]` from a dense [BinaryChunked].
pub fn as_fixed_binary<const N: usize>(
chunked: &BinaryChunked,
) -> impl DoubleEndedIterator<Item = &[[u8; N]]> {
chunked.downcast_iter().map(|array| {
let range = assert_fixed_dense::<N>(array);
exact_chunks(&array.values()[range]).unwrap()
})
}
/// Convert a dense [BinaryChunked] into a single chunk as [FixedBytes],
/// without taking a reference to the offsets array and validity bitmap.
fn into_fixed_binary_rechunk<const N: usize>(chunked: &BinaryChunked) -> FixedBytes<N> {
let chunked = chunked.rechunk();
let mut iter = chunked.downcast_iter();
let array = iter.next().unwrap();
assert!(iter.next().is_none());
let (buf, off, len) = {
let range = assert_fixed_dense::<N>(array);
array.values().clone().sliced(range.start, range.len())
}
.into_inner();
ArcRef::new(buf).map(|bytes| exact_chunks(&bytes[off..off + len]).unwrap())
}
/// Ensures that the supplied Arrow array consists of densely packed bytestrings of length `N`.
/// In other words, ensure that it is free of nulls, and that the offsets have a fixed stride of `N`.
#[must_use = "only the range returned is guaranteed to be conformant"]
fn assert_fixed_dense<const N: usize>(array: &BinaryArray<i64>) -> Range<usize> {
let null_count = array.validity().map_or(0, |bits| bits.unset_bits());
if null_count > 0 {
panic!("null values present");
}
let offsets = array.offsets();
let length_check = offsets
.as_slice()
.par_windows(2)
.all(|w| (w[1] - w[0]) == N as i64);
if !length_check {
panic!("lengths are inconsistent");
}
(*offsets.first() as usize)..(*offsets.last() as usize)
}
fn exact_chunks<const K: usize>(buf: &[u8]) -> Option<&[[u8; K]]> {
// SAFETY: We ensure that `buf.len()` is a multiple of K, and there are no alignment requirements.
unsafe {
let ptr = buf.as_ptr();
let len = buf.len();
if len % K != 0 {
return None;
}
let ptr = ptr as *mut [u8; K];
let len = len / K;
Some(slice::from_raw_parts(ptr, len))
}
}
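As a small aside, the "terrific hash function" above is just a native-endian prefix read; here is a hypothetical standalone check of that equivalence (not part of the crate).
// hash64 reinterprets the first 8 bytes of the 20-byte path hash as a
// native-endian u64.
fn main() {
    let h: [u8; 20] = *b"0123456789abcdefghij";
    let prefix = u64::from_ne_bytes(h[..8].try_into().unwrap());
    assert_eq!(weave::hash64(&h), prefix);
}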

262
contrib/weave/src/main.rs Normal file
View file

@ -0,0 +1,262 @@
//! Weave resolves a list of roots from `releases.parquet` against `narinfo.parquet`,
//! and then uses the reference graph from the accompanying `narinfo-references.parquet`
//! produced by `swizzle` to collect the closure of the roots.
//!
//! They are written to `live_idxs.parquet`, which only has one column, representing
//! the row numbers in `narinfo.parquet` corresponding to live paths.
use anyhow::Result;
use hashbrown::{hash_table, HashTable};
use rayon::prelude::*;
use rustc_hash::FxHashSet;
use std::{
collections::BTreeMap,
fs::File,
ops::Index,
sync::atomic::{AtomicU32, Ordering},
};
use tracing::{info_span, warn};
use tracing_indicatif::span_ext::IndicatifSpanExt;
use polars::{
datatypes::StaticArray,
export::arrow::{array::UInt32Array, offset::OffsetsBuffer},
lazy::dsl::col,
prelude::*,
};
use weave::{as_fixed_binary, hash64, INDEX_NULL};
#[tracing::instrument]
fn main() -> Result<()> {
let _tracing = snix_tracing::TracingBuilder::default()
.enable_progressbar()
.build()?;
let roots: PathSet32 = {
let span = info_span!("parse_roots", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("parse roots");
span.pb_start();
as_fixed_binary::<20>(
LazyFrame::scan_parquet("releases.parquet", ScanArgsParquet::default())?
.explode([col("store_path_hash")])
.select([col("store_path_hash")])
.collect()?
.column("store_path_hash")?
.binary()?,
)
.flatten()
.collect()
};
{
let span = info_span!("resolve_roots", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("resolve roots");
span.pb_start();
weave::load_ph_array()?
.into_par_iter()
.enumerate()
.for_each(|(idx, h)| {
if let Some(idx_slot) = roots.find(h) {
assert_eq!(
idx_slot.swap(idx as u32, Ordering::Relaxed),
INDEX_NULL,
"duplicate entry"
);
}
});
}
let mut todo = FxHashSet::default();
todo.reserve(roots.len());
{
let mut unknown_roots = 0usize;
for (_, idx) in roots.table {
let idx = idx.into_inner();
if idx == INDEX_NULL {
unknown_roots += 1;
continue;
}
todo.insert(idx);
}
if unknown_roots != 0 {
warn!("skipping {unknown_roots} unknown roots");
}
}
let ri_array;
let ri_array = {
let span = info_span!(
"load_reference_idxs",
indicatif.pb_show = tracing::field::Empty
)
.entered();
span.pb_set_message("load reference_idxs");
span.pb_start();
ri_array = ParquetReader::new(File::open("narinfo-references.parquet")?)
.finish()?
.column("reference_idxs")?
.list()?
.clone();
ChunkedList::new(ri_array.downcast_iter().map(|chunk| {
(
chunk.offsets(),
chunk
.values()
.as_any()
.downcast_ref::<UInt32Array>()
.unwrap()
.as_slice()
.unwrap(),
)
}))
};
let mut seen = todo.clone();
{
let span = info_span!("mark", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("marking");
span.pb_set_style(&snix_tracing::PB_PROGRESS_STYLE);
while !todo.is_empty() {
span.pb_set_length(seen.len() as u64);
span.pb_set_position(seen.len().saturating_sub(todo.len()) as u64);
todo = todo
.par_iter()
.flat_map(|&parent| {
if parent == INDEX_NULL {
return FxHashSet::default();
}
ri_array[parent as usize]
.iter()
.cloned()
.filter(|child| !seen.contains(child))
.collect::<FxHashSet<u32>>()
})
.collect();
for &index in &todo {
seen.insert(index);
}
}
span.pb_set_length(seen.len() as u64);
span.pb_set_position(seen.len() as u64);
if seen.remove(&INDEX_NULL) {
warn!("WARNING: missing edges");
}
}
let seen = {
let span = info_span!("gather_live", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("gathering live set");
let mut seen: Vec<u32> = seen.into_iter().collect();
seen.par_sort();
seen
};
{
let span = info_span!("write_output", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("writing output");
span.pb_start();
ParquetWriter::new(File::create("live_idxs.parquet")?).finish(&mut df! {
"live_idx" => seen,
}?)?;
}
Ok(())
}
struct PathSet32 {
table: HashTable<([u8; 20], AtomicU32)>,
}
impl PathSet32 {
fn with_capacity(capacity: usize) -> Self {
Self {
table: HashTable::with_capacity(capacity),
}
}
fn insert(&mut self, value: &[u8; 20]) -> bool {
let hash = hash64(value);
match self
.table
.entry(hash, |(x, _)| x == value, |(x, _)| hash64(x))
{
hash_table::Entry::Occupied(_) => false,
hash_table::Entry::Vacant(entry) => {
entry.insert((*value, AtomicU32::new(INDEX_NULL)));
true
}
}
}
fn find(&self, value: &[u8; 20]) -> Option<&AtomicU32> {
let hash = hash64(value);
self.table
.find(hash, |(x, _)| x == value)
.as_ref()
.map(|(_, x)| x)
}
fn len(&self) -> usize {
self.table.len()
}
}
impl<'a> FromIterator<&'a [u8; 20]> for PathSet32 {
fn from_iter<T: IntoIterator<Item = &'a [u8; 20]>>(iter: T) -> Self {
let iter = iter.into_iter();
let mut this = Self::with_capacity(iter.size_hint().0);
for item in iter {
this.insert(item);
}
this.table.shrink_to_fit(|(x, _)| hash64(x));
this
}
}
struct ChunkedList<'a, T> {
by_offset: BTreeMap<usize, (&'a OffsetsBuffer<i64>, &'a [T])>,
}
impl<'a, T> ChunkedList<'a, T> {
fn new(chunks: impl IntoIterator<Item = (&'a OffsetsBuffer<i64>, &'a [T])>) -> Self {
let mut next_offset = 0usize;
ChunkedList {
by_offset: chunks
.into_iter()
.map(|(offsets, values)| {
let offset = next_offset;
next_offset = next_offset.checked_add(offsets.len_proxy()).unwrap();
(offset, (offsets, values))
})
.collect(),
}
}
}
impl<'a, T> Index<usize> for ChunkedList<'a, T> {
type Output = [T];
fn index(&self, index: usize) -> &Self::Output {
let (&base, &(offsets, values)) = self.by_offset.range(..=index).next_back().unwrap();
let (start, end) = offsets.start_end(index - base);
&values[start..end]
}
}
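The mark loop above expands the frontier in parallel with rayon; the same closure can be computed with a plain sequential worklist. This conceptual sketch (a hypothetical `closure` helper operating on already-swizzled adjacency lists) shows the shape of the algorithm.
use std::collections::HashSet;
// Sequential sketch of the mark phase: expand a worklist of row indices over
// the swizzled adjacency lists until no new rows are reachable.
// (The real code additionally treats INDEX_NULL as a missing edge: it skips
// such parents and warns if any were encountered.)
fn closure(roots: &[u32], references: &[Vec<u32>]) -> HashSet<u32> {
    let mut seen: HashSet<u32> = roots.iter().copied().collect();
    let mut todo: Vec<u32> = roots.to_vec();
    while let Some(parent) = todo.pop() {
        for &child in &references[parent as usize] {
            if seen.insert(child) {
                todo.push(child);
            }
        }
    }
    seen
}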