chore(users/edef): move to contrib
Change-Id: I1a6972fab8ada26917f29607fc401e376d634070
This commit is contained in:
parent
a7916624dc
commit
403d8fc897
55 changed files with 15 additions and 17 deletions
1
contrib/.gitignore
vendored
Normal file
1
contrib/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
target
|
||||
3
contrib/OWNERS
Normal file
3
contrib/OWNERS
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
set noparent
|
||||
|
||||
edef
|
||||
1
contrib/crunch-v2/.gitignore
vendored
Normal file
1
contrib/crunch-v2/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
*.parquet
|
||||
3193
contrib/crunch-v2/Cargo.lock
generated
Normal file
3193
contrib/crunch-v2/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
11745
contrib/crunch-v2/Cargo.nix
Normal file
11745
contrib/crunch-v2/Cargo.nix
Normal file
File diff suppressed because it is too large
Load diff
39
contrib/crunch-v2/Cargo.toml
Normal file
39
contrib/crunch-v2/Cargo.toml
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
[package]
|
||||
name = "crunch-v2"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[workspace]
|
||||
members = ["."]
|
||||
|
||||
[dependencies]
|
||||
anyhow = { version = "1.0.75", features = ["backtrace"] }
|
||||
lazy_static = "1.4.0"
|
||||
|
||||
bstr = "1.8.0"
|
||||
bytes = "1.6.1"
|
||||
|
||||
futures = "0.3.29"
|
||||
tokio = { version = "1.37.0", features = ["full"] }
|
||||
|
||||
rusoto_core = { version = "0.48.0", default-features = false, features = ["hyper-rustls"] }
|
||||
rusoto_s3 = { version = "0.48.0", default-features = false, features = ["rustls"] }
|
||||
|
||||
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
|
||||
sled = "0.34.7"
|
||||
|
||||
fastcdc = "3.1.0"
|
||||
blake3 = "1.5.0"
|
||||
sha2 = { version = "0.10.8", features = ["asm"] }
|
||||
digest = "0.10.7"
|
||||
|
||||
bzip2 = "0.4.4"
|
||||
xz2 = "0.1.7"
|
||||
zstd = "0.13.0"
|
||||
prost = "0.12.2"
|
||||
polars = { version = "0.35.4", default-features = false, features = ["parquet", "lazy", "sql", "dtype-struct"] }
|
||||
indicatif = "0.17.7"
|
||||
clap = { version = "4.4.18", features = ["derive"] }
|
||||
|
||||
[build-dependencies]
|
||||
prost-build = "0.12.2"
|
||||
1
contrib/crunch-v2/OWNERS
Normal file
1
contrib/crunch-v2/OWNERS
Normal file
|
|
@ -0,0 +1 @@
|
|||
edef
|
||||
6
contrib/crunch-v2/build.rs
Normal file
6
contrib/crunch-v2/build.rs
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
use std::io::Result;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
prost_build::compile_protos(&["protos/flatstore.proto"], &["protos/"])?;
|
||||
Ok(())
|
||||
}
|
||||
15
contrib/crunch-v2/default.nix
Normal file
15
contrib/crunch-v2/default.nix
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
{ pkgs, depot, lib, ... }:
|
||||
|
||||
(pkgs.callPackage ./Cargo.nix {
|
||||
defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
|
||||
crunch-v2 = prev: {
|
||||
src = depot.snix.utils.filterRustCrateSrc rec {
|
||||
root = prev.src.origSrc;
|
||||
extraFileset = lib.fileset.fileFilter (f: f.hasExt "proto") root;
|
||||
};
|
||||
nativeBuildInputs = [ pkgs.protobuf ];
|
||||
};
|
||||
};
|
||||
}).rootCrate.build.overrideAttrs {
|
||||
meta.ci.extraSteps.crate2nix-check = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
|
||||
}
|
||||
38
contrib/crunch-v2/protos/flatstore.proto
Normal file
38
contrib/crunch-v2/protos/flatstore.proto
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
syntax = "proto3";
|
||||
|
||||
package snix.flatstore.v1;
|
||||
|
||||
message Path {
|
||||
bytes nar_hash = 1;
|
||||
|
||||
oneof node {
|
||||
DirectoryNode directory = 2;
|
||||
FileNode file = 3;
|
||||
SymlinkNode symlink = 4;
|
||||
}
|
||||
}
|
||||
|
||||
message DirectoryNode {
|
||||
bytes name = 1;
|
||||
repeated DirectoryNode directories = 2;
|
||||
repeated FileNode files = 3;
|
||||
repeated SymlinkNode symlinks = 4;
|
||||
}
|
||||
|
||||
message FileNode {
|
||||
bytes name = 1;
|
||||
bytes hash = 2;
|
||||
repeated Chunk chunks = 3;
|
||||
bool executable = 4;
|
||||
}
|
||||
|
||||
message Chunk {
|
||||
bytes hash = 1;
|
||||
uint32 size = 2;
|
||||
uint32 size_compressed = 3;
|
||||
}
|
||||
|
||||
message SymlinkNode {
|
||||
bytes name = 1;
|
||||
bytes target = 2;
|
||||
}
|
||||
155
contrib/crunch-v2/src/bin/extract.rs
Normal file
155
contrib/crunch-v2/src/bin/extract.rs
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
//! This tool lossily converts a Sled database produced by crunch-v2 into a Parquet file for analysis.
|
||||
//! The resulting `crunch.parquet` has columns file_hash`, `nar_hash`, and `chunk`.
|
||||
//! The first two are SHA-256 hashes of the compressed file and the NAR it decompresses to.
|
||||
//! `chunk` is a struct array corresponding to [crunch_v2::proto::Chunk] messages.
|
||||
//! They are concatenated without any additional structure, so nothing but the chunk list is preserved.
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use std::fs::File;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crunch_v2::proto::{self, path::Node};
|
||||
use prost::Message;
|
||||
|
||||
use polars::{
|
||||
chunked_array::builder::AnonymousOwnedListBuilder,
|
||||
prelude::{
|
||||
df, BinaryChunkedBuilder, ChunkedBuilder, DataFrame, DataType, Field, ListBuilderTrait,
|
||||
NamedFrom, ParquetWriter, PrimitiveChunkedBuilder, Series, UInt32Type,
|
||||
},
|
||||
series::IntoSeries,
|
||||
};
|
||||
|
||||
#[derive(Parser)]
|
||||
struct Args {
|
||||
/// Path to the sled database that's read from.
|
||||
#[clap(default_value = "crunch.db")]
|
||||
infile: PathBuf,
|
||||
|
||||
/// Path to the resulting parquet file that's written.
|
||||
#[clap(default_value = "crunch.parquet")]
|
||||
outfile: PathBuf,
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
let w = ParquetWriter::new(File::create(args.outfile)?);
|
||||
|
||||
let db: sled::Db = sled::open(&args.infile).unwrap();
|
||||
let files_tree: sled::Tree = db.open_tree("files").unwrap();
|
||||
|
||||
let progress =
|
||||
ProgressBar::new(files_tree.len() as u64).with_style(ProgressStyle::with_template(
|
||||
"{elapsed_precise}/{duration_precise} {wide_bar} {pos}/{len}",
|
||||
)?);
|
||||
|
||||
let mut frame = FrameBuilder::new();
|
||||
for entry in &files_tree {
|
||||
let (file_hash, pb) = entry?;
|
||||
frame.push(
|
||||
file_hash[..].try_into().unwrap(),
|
||||
proto::Path::decode(&pb[..])?,
|
||||
);
|
||||
progress.inc(1);
|
||||
}
|
||||
|
||||
w.finish(&mut frame.finish())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct FrameBuilder {
|
||||
file_hash: BinaryChunkedBuilder,
|
||||
nar_hash: BinaryChunkedBuilder,
|
||||
chunk: AnonymousOwnedListBuilder,
|
||||
}
|
||||
|
||||
impl FrameBuilder {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
file_hash: BinaryChunkedBuilder::new("file_hash", 0, 0),
|
||||
nar_hash: BinaryChunkedBuilder::new("nar_hash", 0, 0),
|
||||
chunk: AnonymousOwnedListBuilder::new(
|
||||
"chunk",
|
||||
0,
|
||||
Some(DataType::Struct(vec![
|
||||
Field::new("hash", DataType::Binary),
|
||||
Field::new("size", DataType::UInt32),
|
||||
Field::new("size_compressed", DataType::UInt32),
|
||||
])),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
fn push(&mut self, file_hash: [u8; 32], pb: proto::Path) {
|
||||
self.file_hash.append_value(&file_hash[..]);
|
||||
self.nar_hash.append_value(pb.nar_hash);
|
||||
self.chunk
|
||||
.append_series(&ChunkFrameBuilder::new(pb.node.unwrap()))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
fn finish(mut self) -> DataFrame {
|
||||
df! {
|
||||
"file_hash" => self.file_hash.finish().into_series(),
|
||||
"nar_hash" => self.nar_hash.finish().into_series(),
|
||||
"chunk" => self.chunk.finish().into_series()
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
struct ChunkFrameBuilder {
|
||||
hash: BinaryChunkedBuilder,
|
||||
size: PrimitiveChunkedBuilder<UInt32Type>,
|
||||
size_compressed: PrimitiveChunkedBuilder<UInt32Type>,
|
||||
}
|
||||
|
||||
impl ChunkFrameBuilder {
|
||||
fn new(node: proto::path::Node) -> Series {
|
||||
let mut this = Self {
|
||||
hash: BinaryChunkedBuilder::new("hash", 0, 0),
|
||||
size: PrimitiveChunkedBuilder::new("size", 0),
|
||||
size_compressed: PrimitiveChunkedBuilder::new("size_compressed", 0),
|
||||
};
|
||||
|
||||
this.push(node);
|
||||
this.finish()
|
||||
}
|
||||
|
||||
fn push(&mut self, node: Node) {
|
||||
match node {
|
||||
Node::Directory(node) => {
|
||||
for node in node.files {
|
||||
self.push(Node::File(node));
|
||||
}
|
||||
|
||||
for node in node.directories {
|
||||
self.push(Node::Directory(node));
|
||||
}
|
||||
}
|
||||
Node::File(node) => {
|
||||
for chunk in node.chunks {
|
||||
self.hash.append_value(&chunk.hash);
|
||||
self.size.append_value(chunk.size);
|
||||
self.size_compressed.append_value(chunk.size_compressed);
|
||||
}
|
||||
}
|
||||
Node::Symlink(_) => {}
|
||||
}
|
||||
}
|
||||
|
||||
fn finish(self) -> Series {
|
||||
df! {
|
||||
"hash" => self.hash.finish().into_series(),
|
||||
"size" => self.size.finish().into_series(),
|
||||
"size_compressed" => self.size_compressed.finish().into_series()
|
||||
}
|
||||
.unwrap()
|
||||
.into_struct("chunk")
|
||||
.into_series()
|
||||
}
|
||||
}
|
||||
3
contrib/crunch-v2/src/lib.rs
Normal file
3
contrib/crunch-v2/src/lib.rs
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
pub mod proto {
|
||||
include!(concat!(env!("OUT_DIR"), "/snix.flatstore.v1.rs"));
|
||||
}
|
||||
309
contrib/crunch-v2/src/main.rs
Normal file
309
contrib/crunch-v2/src/main.rs
Normal file
|
|
@ -0,0 +1,309 @@
|
|||
//! This is a tool for ingesting subsets of cache.nixos.org into its own flattened castore format.
|
||||
//! Currently, produced chunks are not preserved, and this purely serves as a way of measuring
|
||||
//! compression/deduplication ratios for various chunking and compression parameters.
|
||||
//!
|
||||
//! NARs to be ingested are read from `ingest.parquet`, and filtered by an SQL expression provided as a program argument.
|
||||
//! The `file_hash` column should contain SHA-256 hashes of the compressed data, corresponding to the `FileHash` narinfo field.
|
||||
//! The `compression` column should contain either `"bzip2"` or `"xz"`, corresponding to the `Compression` narinfo field.
|
||||
//! Additional columns are ignored, but can be used by the SQL filter expression.
|
||||
//!
|
||||
//! flatstore protobufs are written to a sled database named `crunch.db`, addressed by file hash.
|
||||
|
||||
use crunch_v2::proto;
|
||||
|
||||
mod remote;
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use futures::{stream, StreamExt, TryStreamExt};
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use std::{
|
||||
io::{self, BufRead, Read, Write},
|
||||
path::PathBuf,
|
||||
ptr,
|
||||
};
|
||||
|
||||
use polars::{
|
||||
prelude::{col, LazyFrame, ScanArgsParquet},
|
||||
sql::sql_expr,
|
||||
};
|
||||
|
||||
use fastcdc::v2020::{ChunkData, StreamCDC};
|
||||
use nix_compat::nar::reader as nar;
|
||||
|
||||
use digest::Digest;
|
||||
use prost::Message;
|
||||
use sha2::Sha256;
|
||||
|
||||
#[derive(Parser)]
|
||||
struct Args {
|
||||
/// Path to an existing parquet file.
|
||||
/// The `file_hash` column should contain SHA-256 hashes of the compressed
|
||||
/// data, corresponding to the `FileHash` narinfo field.
|
||||
/// The `compression` column should contain either `"bzip2"` or `"xz"`,
|
||||
/// corresponding to the `Compression` narinfo field.
|
||||
/// Additional columns are ignored, but can be used by the SQL filter expression.
|
||||
#[clap(long, default_value = "ingest.parquet")]
|
||||
infile: PathBuf,
|
||||
|
||||
/// Filter expression to filter elements in the parquet file for.
|
||||
filter: String,
|
||||
|
||||
/// Average chunk size for FastCDC, in KiB.
|
||||
/// min value is half, max value double of that number.
|
||||
#[clap(long, default_value_t = 256)]
|
||||
avg_chunk_size: u32,
|
||||
|
||||
/// Path to the sled database where results are written to (flatstore
|
||||
/// protobufs, addressed by file hash).
|
||||
#[clap(long, default_value = "crunch.db")]
|
||||
outfile: PathBuf,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
let filter = sql_expr(args.filter)?;
|
||||
let avg_chunk_size = args.avg_chunk_size * 1024;
|
||||
|
||||
let df = LazyFrame::scan_parquet(&args.infile, ScanArgsParquet::default())?
|
||||
.filter(filter)
|
||||
.select([col("file_hash"), col("compression")])
|
||||
.drop_nulls(None)
|
||||
.collect()?;
|
||||
|
||||
let progress = ProgressBar::new(df.height() as u64).with_style(ProgressStyle::with_template(
|
||||
"{elapsed_precise}/{duration_precise} {wide_bar} {pos}/{len}",
|
||||
)?);
|
||||
|
||||
let file_hash = df
|
||||
.column("file_hash")?
|
||||
.binary()?
|
||||
.into_iter()
|
||||
.map(|h| -> [u8; 32] { h.unwrap().try_into().unwrap() });
|
||||
|
||||
let compression = df
|
||||
.column("compression")?
|
||||
.utf8()?
|
||||
.into_iter()
|
||||
.map(|c| c.unwrap());
|
||||
|
||||
let db: sled::Db = sled::open(args.outfile).unwrap();
|
||||
let files_tree = db.open_tree("files").unwrap();
|
||||
|
||||
let res = stream::iter(file_hash.zip(compression))
|
||||
.map(Ok)
|
||||
.try_for_each_concurrent(Some(16), |(file_hash, compression)| {
|
||||
let progress = progress.clone();
|
||||
let files_tree = files_tree.clone();
|
||||
async move {
|
||||
if files_tree.contains_key(&file_hash)? {
|
||||
progress.inc(1);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let reader = remote::nar(file_hash, compression).await?;
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let mut reader = Sha256Reader::from(reader);
|
||||
|
||||
let path =
|
||||
ingest(nar::open(&mut reader)?, vec![], avg_chunk_size).map(|node| {
|
||||
proto::Path {
|
||||
nar_hash: reader.finalize().as_slice().into(),
|
||||
node: Some(node),
|
||||
}
|
||||
})?;
|
||||
|
||||
files_tree.insert(file_hash, path.encode_to_vec())?;
|
||||
progress.inc(1);
|
||||
|
||||
Ok::<_, anyhow::Error>(())
|
||||
})
|
||||
.await?
|
||||
}
|
||||
})
|
||||
.await;
|
||||
|
||||
let flush = files_tree.flush_async().await;
|
||||
|
||||
res?;
|
||||
flush?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ingest(node: nar::Node, name: Vec<u8>, avg_chunk_size: u32) -> Result<proto::path::Node> {
|
||||
match node {
|
||||
nar::Node::Symlink { target } => Ok(proto::path::Node::Symlink(proto::SymlinkNode {
|
||||
name,
|
||||
target,
|
||||
})),
|
||||
|
||||
nar::Node::Directory(mut reader) => {
|
||||
let mut directories = vec![];
|
||||
let mut files = vec![];
|
||||
let mut symlinks = vec![];
|
||||
|
||||
while let Some(node) = reader.next()? {
|
||||
match ingest(node.node, node.name.to_owned(), avg_chunk_size)? {
|
||||
proto::path::Node::Directory(node) => {
|
||||
directories.push(node);
|
||||
}
|
||||
proto::path::Node::File(node) => {
|
||||
files.push(node);
|
||||
}
|
||||
proto::path::Node::Symlink(node) => {
|
||||
symlinks.push(node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(proto::path::Node::Directory(proto::DirectoryNode {
|
||||
name,
|
||||
directories,
|
||||
files,
|
||||
symlinks,
|
||||
}))
|
||||
}
|
||||
|
||||
nar::Node::File { executable, reader } => {
|
||||
let mut reader = B3Reader::from(reader);
|
||||
let mut chunks = vec![];
|
||||
|
||||
for chunk in StreamCDC::new(
|
||||
&mut reader,
|
||||
avg_chunk_size / 2,
|
||||
avg_chunk_size,
|
||||
avg_chunk_size * 2,
|
||||
) {
|
||||
let ChunkData {
|
||||
length: size, data, ..
|
||||
} = chunk?;
|
||||
|
||||
let hash = blake3::hash(&data);
|
||||
let size_compressed = zstd_size(&data, 9);
|
||||
|
||||
chunks.push(proto::Chunk {
|
||||
hash: hash.as_bytes().as_slice().into(),
|
||||
size: size.try_into().unwrap(),
|
||||
size_compressed: size_compressed.try_into().unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(proto::path::Node::File(proto::FileNode {
|
||||
name,
|
||||
hash: reader.finalize().as_bytes().as_slice().into(),
|
||||
chunks,
|
||||
executable,
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct Sha256Reader<R> {
|
||||
inner: R,
|
||||
hasher: Sha256,
|
||||
buf: *const [u8],
|
||||
}
|
||||
|
||||
const ZERO_BUF: *const [u8] = ptr::slice_from_raw_parts(1 as *const u8, 0);
|
||||
|
||||
unsafe impl<R: Send> Send for Sha256Reader<R> {}
|
||||
|
||||
impl<R> From<R> for Sha256Reader<R> {
|
||||
fn from(value: R) -> Self {
|
||||
Self {
|
||||
inner: value,
|
||||
hasher: Sha256::new(),
|
||||
buf: ZERO_BUF,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: Read> Read for Sha256Reader<R> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
self.buf = ZERO_BUF;
|
||||
let n = self.inner.read(buf)?;
|
||||
self.hasher.update(&buf[..n]);
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: BufRead> BufRead for Sha256Reader<R> {
|
||||
fn fill_buf(&mut self) -> io::Result<&[u8]> {
|
||||
self.buf = ZERO_BUF;
|
||||
let buf = self.inner.fill_buf()?;
|
||||
self.buf = buf as *const [u8];
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
fn consume(&mut self, amt: usize) {
|
||||
// UNSAFETY: This assumes that `R::consume` doesn't invalidate the buffer.
|
||||
// That's not a sound assumption in general, though it is likely to hold.
|
||||
// TODO(edef): refactor this codebase to write a fresh NAR for verification purposes
|
||||
// we already buffer full chunks, so there's no pressing need to reuse the input buffers
|
||||
unsafe {
|
||||
let (head, buf) = (*self.buf).split_at(amt);
|
||||
self.buf = buf as *const [u8];
|
||||
self.hasher.update(head);
|
||||
self.inner.consume(amt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<R> Sha256Reader<R> {
|
||||
fn finalize(self) -> [u8; 32] {
|
||||
self.hasher.finalize().into()
|
||||
}
|
||||
}
|
||||
|
||||
struct B3Reader<R> {
|
||||
inner: R,
|
||||
hasher: blake3::Hasher,
|
||||
}
|
||||
|
||||
impl<R> From<R> for B3Reader<R> {
|
||||
fn from(value: R) -> Self {
|
||||
Self {
|
||||
inner: value,
|
||||
hasher: blake3::Hasher::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: Read> Read for B3Reader<R> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
let n = self.inner.read(buf)?;
|
||||
self.hasher.update(&buf[..n]);
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
impl<R> B3Reader<R> {
|
||||
fn finalize(self) -> blake3::Hash {
|
||||
self.hasher.finalize()
|
||||
}
|
||||
}
|
||||
|
||||
fn zstd_size(data: &[u8], level: i32) -> u64 {
|
||||
let mut w = zstd::Encoder::new(CountingWriter::default(), level).unwrap();
|
||||
w.write_all(&data).unwrap();
|
||||
let CountingWriter(size) = w.finish().unwrap();
|
||||
size
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct CountingWriter(u64);
|
||||
|
||||
impl Write for CountingWriter {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
self.0 += buf.len() as u64;
|
||||
Ok(buf.len())
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
211
contrib/crunch-v2/src/remote.rs
Normal file
211
contrib/crunch-v2/src/remote.rs
Normal file
|
|
@ -0,0 +1,211 @@
|
|||
use std::{
|
||||
cmp,
|
||||
io::{self, BufRead, BufReader, Read},
|
||||
pin::Pin,
|
||||
task::{self, Poll},
|
||||
};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::{Buf, Bytes};
|
||||
use futures::{future::BoxFuture, Future, FutureExt, Stream, StreamExt};
|
||||
use lazy_static::lazy_static;
|
||||
use tokio::runtime::Handle;
|
||||
|
||||
use nix_compat::nixbase32;
|
||||
|
||||
use rusoto_core::{ByteStream, Region};
|
||||
use rusoto_s3::{GetObjectOutput, GetObjectRequest, S3Client, S3};
|
||||
|
||||
use bzip2::read::BzDecoder;
|
||||
use xz2::read::XzDecoder;
|
||||
|
||||
lazy_static! {
|
||||
static ref S3_CLIENT: S3Client = S3Client::new(Region::UsEast1);
|
||||
}
|
||||
|
||||
const BUCKET: &str = "nix-cache";
|
||||
|
||||
pub async fn nar(
|
||||
file_hash: [u8; 32],
|
||||
compression: &str,
|
||||
) -> Result<Box<BufReader<dyn Read + Send>>> {
|
||||
let (extension, decompress): (&'static str, fn(_) -> Box<_>) = match compression {
|
||||
"bzip2" => ("bz2", decompress_bz2),
|
||||
"xz" => ("xz", decompress_xz),
|
||||
_ => bail!("unknown compression: {compression}"),
|
||||
};
|
||||
|
||||
Ok(decompress(
|
||||
FileStream::new(FileKey {
|
||||
file_hash,
|
||||
extension,
|
||||
})
|
||||
.await?
|
||||
.into(),
|
||||
))
|
||||
}
|
||||
|
||||
fn decompress_xz(reader: FileStreamReader) -> Box<BufReader<dyn Read + Send>> {
|
||||
Box::new(BufReader::new(XzDecoder::new(reader)))
|
||||
}
|
||||
|
||||
fn decompress_bz2(reader: FileStreamReader) -> Box<BufReader<dyn Read + Send>> {
|
||||
Box::new(BufReader::new(BzDecoder::new(reader)))
|
||||
}
|
||||
|
||||
struct FileStreamReader {
|
||||
inner: FileStream,
|
||||
buffer: Bytes,
|
||||
}
|
||||
|
||||
impl From<FileStream> for FileStreamReader {
|
||||
fn from(value: FileStream) -> Self {
|
||||
FileStreamReader {
|
||||
inner: value,
|
||||
buffer: Bytes::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Read for FileStreamReader {
|
||||
fn read(&mut self, dst: &mut [u8]) -> io::Result<usize> {
|
||||
let src = self.fill_buf()?;
|
||||
let n = cmp::min(src.len(), dst.len());
|
||||
dst[..n].copy_from_slice(&src[..n]);
|
||||
self.consume(n);
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
impl BufRead for FileStreamReader {
|
||||
fn fill_buf(&mut self) -> io::Result<&[u8]> {
|
||||
if !self.buffer.is_empty() {
|
||||
return Ok(&self.buffer);
|
||||
}
|
||||
|
||||
self.buffer = Handle::current()
|
||||
.block_on(self.inner.next())
|
||||
.transpose()?
|
||||
.unwrap_or_default();
|
||||
|
||||
Ok(&self.buffer)
|
||||
}
|
||||
|
||||
fn consume(&mut self, cnt: usize) {
|
||||
self.buffer.advance(cnt);
|
||||
}
|
||||
}
|
||||
|
||||
struct FileKey {
|
||||
file_hash: [u8; 32],
|
||||
extension: &'static str,
|
||||
}
|
||||
|
||||
impl FileKey {
|
||||
fn get(
|
||||
&self,
|
||||
offset: u64,
|
||||
e_tag: Option<&str>,
|
||||
) -> impl Future<Output = io::Result<GetObjectOutput>> + Send + 'static {
|
||||
let input = GetObjectRequest {
|
||||
bucket: BUCKET.to_string(),
|
||||
key: format!(
|
||||
"nar/{}.nar.{}",
|
||||
nixbase32::encode(&self.file_hash),
|
||||
self.extension
|
||||
),
|
||||
if_match: e_tag.map(str::to_owned),
|
||||
range: Some(format!("bytes {}-", offset + 1)),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
async {
|
||||
S3_CLIENT
|
||||
.get_object(input)
|
||||
.await
|
||||
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct FileStream {
|
||||
key: FileKey,
|
||||
e_tag: String,
|
||||
offset: u64,
|
||||
length: u64,
|
||||
inner: FileStreamState,
|
||||
}
|
||||
|
||||
enum FileStreamState {
|
||||
Response(BoxFuture<'static, io::Result<GetObjectOutput>>),
|
||||
Body(ByteStream),
|
||||
Eof,
|
||||
}
|
||||
|
||||
impl FileStream {
|
||||
pub async fn new(key: FileKey) -> io::Result<Self> {
|
||||
let resp = key.get(0, None).await?;
|
||||
|
||||
Ok(FileStream {
|
||||
key,
|
||||
e_tag: resp.e_tag.unwrap(),
|
||||
offset: 0,
|
||||
length: resp.content_length.unwrap().try_into().unwrap(),
|
||||
inner: FileStreamState::Body(resp.body.unwrap()),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! poll {
|
||||
($expr:expr) => {
|
||||
match $expr {
|
||||
Poll::Pending => {
|
||||
return Poll::Pending;
|
||||
}
|
||||
Poll::Ready(value) => value,
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
impl Stream for FileStream {
|
||||
type Item = io::Result<Bytes>;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, cx: &mut task::Context) -> Poll<Option<Self::Item>> {
|
||||
let this = self.get_mut();
|
||||
|
||||
let chunk = loop {
|
||||
match &mut this.inner {
|
||||
FileStreamState::Response(resp) => match poll!(resp.poll_unpin(cx)) {
|
||||
Err(err) => {
|
||||
this.inner = FileStreamState::Eof;
|
||||
return Poll::Ready(Some(Err(err)));
|
||||
}
|
||||
Ok(resp) => {
|
||||
this.inner = FileStreamState::Body(resp.body.unwrap());
|
||||
}
|
||||
},
|
||||
FileStreamState::Body(body) => match poll!(body.poll_next_unpin(cx)) {
|
||||
None | Some(Err(_)) => {
|
||||
this.inner = FileStreamState::Response(
|
||||
this.key.get(this.offset, Some(&this.e_tag)).boxed(),
|
||||
);
|
||||
}
|
||||
Some(Ok(chunk)) => {
|
||||
break chunk;
|
||||
}
|
||||
},
|
||||
FileStreamState::Eof => {
|
||||
return Poll::Ready(None);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
this.offset += chunk.len() as u64;
|
||||
|
||||
if this.offset >= this.length {
|
||||
this.inner = FileStreamState::Eof;
|
||||
}
|
||||
|
||||
Poll::Ready(Some(Ok(chunk)))
|
||||
}
|
||||
}
|
||||
2
contrib/fetchroots/.gitignore
vendored
Normal file
2
contrib/fetchroots/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
/target
|
||||
/roots.parquet
|
||||
3417
contrib/fetchroots/Cargo.lock
generated
Normal file
3417
contrib/fetchroots/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
12830
contrib/fetchroots/Cargo.nix
Normal file
12830
contrib/fetchroots/Cargo.nix
Normal file
File diff suppressed because it is too large
Load diff
23
contrib/fetchroots/Cargo.toml
Normal file
23
contrib/fetchroots/Cargo.toml
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
[package]
|
||||
name = "fetchroots"
|
||||
version = "0.0.0"
|
||||
edition = "2021"
|
||||
|
||||
[workspace]
|
||||
members = ["."]
|
||||
|
||||
[dependencies]
|
||||
anyhow = { version = "1.0.80", features = ["backtrace"] }
|
||||
aws-config = "1.1.6"
|
||||
aws-sdk-s3 = "1.16.0"
|
||||
bytes = "1.5.0"
|
||||
bytes-utils = "0.1.4"
|
||||
bzip2 = "0.4.4"
|
||||
chrono = "0.4.34"
|
||||
futures = "0.3.30"
|
||||
indicatif = "0.17.8"
|
||||
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
|
||||
polars = { version = "0.36.2", features = ["parquet"] }
|
||||
rayon = "1.8.1"
|
||||
tokio = { version = "1.36.0", features = ["full"] }
|
||||
xz2 = "0.1.7"
|
||||
36
contrib/fetchroots/README.md
Normal file
36
contrib/fetchroots/README.md
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
# fetchroots
|
||||
|
||||
> This tool is part of a suite of tools built to manage cache.nixos.org.
|
||||
|
||||
This tool's purpose is to build an index of all the GC roots from the
|
||||
channels.nixos.org releases. The result is then combined with other tools.
|
||||
|
||||
It does this by:
|
||||
1. Listing all the release files in the bucket.
|
||||
2. Getting the data for each of the release.
|
||||
3. Putting them in a local parquet file.
|
||||
|
||||
## Getting started
|
||||
|
||||
In order to run this, you'll need AWS SSO credentials from the NixOS Infra team.
|
||||
|
||||
Get the creds from https://nixos.awsapps.com/start/ -> LBNixOS_Dev_PDX -> AWSReadOnlyAccess.
|
||||
|
||||
Run `mg run`, you should see a progress bar.
|
||||
|
||||
Congrats, you now have a `roots.parquet` file. You can now load it with python polars-rs or clickhouse.
|
||||
|
||||
## `roots.parquet` file format
|
||||
|
||||
* `key` (`String`): the release, eg `nixos/22.11-small/nixos-22.11.513.563dc6476b8`
|
||||
* `timestamp` (`DateTime`): the timestamp of the GC roots file for this release
|
||||
* `store_path_hash` (`List[Binary]`): hash part of the store paths rooted by this release
|
||||
|
||||
## Development
|
||||
|
||||
When the Cargo.lock changes, run `mg run //tools:crate2nix-generate`.
|
||||
|
||||
To build the project, run `mg build`.
|
||||
|
||||
To get a dev environment, run `nix-shell -p cargo`.
|
||||
|
||||
11
contrib/fetchroots/default.nix
Normal file
11
contrib/fetchroots/default.nix
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
{ pkgs, depot, ... }:
|
||||
|
||||
(pkgs.callPackage ./Cargo.nix {
|
||||
defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
|
||||
fetchroots = prev: {
|
||||
src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
|
||||
};
|
||||
};
|
||||
}).rootCrate.build.overrideAttrs {
|
||||
meta.ci.extraSteps.crate2nix = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
|
||||
}
|
||||
257
contrib/fetchroots/src/main.rs
Normal file
257
contrib/fetchroots/src/main.rs
Normal file
|
|
@ -0,0 +1,257 @@
|
|||
//! Fetch all[^1] GC roots from releases.nixos.org into a `roots.parquet` file.
|
||||
//!
|
||||
//! The resulting Parquet has three columns:
|
||||
//!
|
||||
//! * `key` (`String`): the release, eg `nixos/22.11-small/nixos-22.11.513.563dc6476b8`
|
||||
//! * `timestamp` (`DateTime`): the timestamp of the GC roots file for this release
|
||||
//! * `store_path_hash` (`List[Binary]`): hash part of the store paths rooted by this release
|
||||
//!
|
||||
//! [^1]: some roots are truly ancient, and aren't compatible with Nix 1.x
|
||||
|
||||
use anyhow::Result;
|
||||
use std::{
|
||||
collections::BTreeMap,
|
||||
fs::File,
|
||||
io::{BufRead, Read},
|
||||
sync::Arc,
|
||||
time::SystemTime,
|
||||
};
|
||||
|
||||
use aws_config::Region;
|
||||
use aws_sdk_s3::operation::get_object::builders::GetObjectFluentBuilder;
|
||||
use bytes::{Buf, Bytes};
|
||||
use bytes_utils::SegmentedBuf;
|
||||
use chrono::{DateTime, Utc};
|
||||
use nix_compat::nixbase32;
|
||||
use polars::prelude::*;
|
||||
use tokio::{
|
||||
sync::Semaphore,
|
||||
task::{block_in_place, JoinSet},
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Meta {
|
||||
format: Format,
|
||||
e_tag: String,
|
||||
last_modified: DateTime<Utc>,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let sdk_config = aws_config::load_defaults(aws_config::BehaviorVersion::v2023_11_09())
|
||||
.await
|
||||
.into_builder()
|
||||
.region(Region::from_static("eu-west-1"))
|
||||
.build();
|
||||
|
||||
let s3 = aws_sdk_s3::Client::new(&sdk_config);
|
||||
|
||||
let mut keys: BTreeMap<String, Meta> = {
|
||||
let pages = s3
|
||||
.list_objects_v2()
|
||||
.bucket("nix-releases")
|
||||
.into_paginator()
|
||||
.send()
|
||||
.try_collect()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let objects = pages.into_iter().flat_map(|page| {
|
||||
assert_eq!(page.prefix().unwrap_or_default(), "");
|
||||
assert!(page.common_prefixes.is_none());
|
||||
page.contents.unwrap_or_default()
|
||||
});
|
||||
|
||||
let mut prev_key = String::new();
|
||||
objects
|
||||
.filter_map(|obj| {
|
||||
let key = obj.key().unwrap();
|
||||
|
||||
assert!(&*prev_key < key);
|
||||
key.clone_into(&mut prev_key);
|
||||
|
||||
let (key, tail) = key.rsplit_once('/')?;
|
||||
// Our preference order happens to match lexicographical order,
|
||||
// and listings are returned in lexicographical order.
|
||||
let format = match tail {
|
||||
"MANIFEST" => Format::Manifest,
|
||||
"MANIFEST.bz2" => Format::ManifestBz,
|
||||
"store-paths.xz" => Format::StorePathsXz,
|
||||
_ => return None,
|
||||
};
|
||||
|
||||
Some((
|
||||
key.to_owned(),
|
||||
Meta {
|
||||
format,
|
||||
e_tag: obj.e_tag.unwrap(),
|
||||
last_modified: SystemTime::try_from(obj.last_modified.unwrap())
|
||||
.unwrap()
|
||||
.into(),
|
||||
},
|
||||
))
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
|
||||
// These releases are so old they don't even use nixbase32 store paths.
|
||||
for key in [
|
||||
"nix/nix-0.6",
|
||||
"nix/nix-0.6.1",
|
||||
"nix/nix-0.7",
|
||||
"nix/nix-0.8",
|
||||
"nixpkgs/nixpkgs-0.5",
|
||||
"nixpkgs/nixpkgs-0.5.1",
|
||||
"nixpkgs/nixpkgs-0.6",
|
||||
"nixpkgs/nixpkgs-0.7",
|
||||
"nixpkgs/nixpkgs-0.8",
|
||||
"nixpkgs/nixpkgs-0.9",
|
||||
"nixpkgs/nixpkgs-0.10",
|
||||
"nixpkgs/nixpkgs-0.11",
|
||||
] {
|
||||
assert!(keys.remove(key).is_some());
|
||||
}
|
||||
|
||||
let mut js = JoinSet::new();
|
||||
let sem = Arc::new(Semaphore::new(16));
|
||||
|
||||
let bar = indicatif::ProgressBar::new(keys.len() as u64);
|
||||
for (root, meta) in keys {
|
||||
let sem = sem.clone();
|
||||
let s3 = s3.clone();
|
||||
|
||||
js.spawn(async move {
|
||||
let _permit = sem.acquire().await.unwrap();
|
||||
|
||||
let body = get_object(
|
||||
s3.get_object()
|
||||
.bucket("nix-releases")
|
||||
.key(format!("{root}/{}", meta.format.as_str()))
|
||||
.if_match(meta.e_tag),
|
||||
)
|
||||
.await
|
||||
.unwrap()
|
||||
.reader();
|
||||
|
||||
let ph_array = block_in_place(|| meta.format.to_ph_array(body).rechunk());
|
||||
df! {
|
||||
"key" => [root],
|
||||
"timestamp" => [meta.last_modified.naive_utc()],
|
||||
"store_path_hash" => ph_array.into_series().implode().unwrap()
|
||||
}
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
let mut writer = ParquetWriter::new(File::create("roots.parquet").unwrap())
|
||||
.batched(&Schema::from_iter([
|
||||
Field::new("key", DataType::String),
|
||||
Field::new(
|
||||
"timestamp",
|
||||
DataType::Datetime(TimeUnit::Milliseconds, None),
|
||||
),
|
||||
Field::new(
|
||||
"store_path_hash",
|
||||
DataType::List(Box::new(DataType::Binary)),
|
||||
),
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
while let Some(df) = js.join_next().await.transpose().unwrap() {
|
||||
block_in_place(|| writer.write_batch(&df)).unwrap();
|
||||
bar.inc(1);
|
||||
}
|
||||
|
||||
writer.finish().unwrap();
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum Format {
|
||||
Manifest,
|
||||
ManifestBz,
|
||||
StorePathsXz,
|
||||
}
|
||||
|
||||
impl Format {
|
||||
fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Format::Manifest => "MANIFEST",
|
||||
Format::ManifestBz => "MANIFEST.bz2",
|
||||
Format::StorePathsXz => "store-paths.xz",
|
||||
}
|
||||
}
|
||||
|
||||
fn to_ph_array(&self, mut body: impl BufRead) -> BinaryChunked {
|
||||
match self {
|
||||
Format::Manifest | Format::ManifestBz => {
|
||||
let mut buf = String::new();
|
||||
match self {
|
||||
Format::Manifest => {
|
||||
body.read_to_string(&mut buf).unwrap();
|
||||
}
|
||||
Format::ManifestBz => {
|
||||
bzip2::bufread::BzDecoder::new(body)
|
||||
.read_to_string(&mut buf)
|
||||
.unwrap();
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
let buf = buf
|
||||
.strip_prefix("version {\n ManifestVersion: 3\n}\n")
|
||||
.unwrap();
|
||||
|
||||
BinaryChunked::from_iter_values(
|
||||
"store_path_hash",
|
||||
buf.split_terminator("}\n").map(|chunk| -> [u8; 20] {
|
||||
let chunk = chunk.strip_prefix("patch ").unwrap_or(chunk);
|
||||
let line = chunk.strip_prefix("{\n StorePath: /nix/store/").unwrap();
|
||||
nixbase32::decode_fixed(&line[..32]).unwrap()
|
||||
}),
|
||||
)
|
||||
}
|
||||
Format::StorePathsXz => {
|
||||
let mut buf = String::new();
|
||||
xz2::bufread::XzDecoder::new(body)
|
||||
.read_to_string(&mut buf)
|
||||
.unwrap();
|
||||
|
||||
BinaryChunked::from_iter_values(
|
||||
"store_path_hash",
|
||||
buf.split_terminator('\n').map(|line| -> [u8; 20] {
|
||||
let line = line.strip_prefix("/nix/store/").unwrap();
|
||||
nixbase32::decode_fixed(&line[..32]).unwrap()
|
||||
}),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_object(request: GetObjectFluentBuilder) -> Result<SegmentedBuf<Bytes>> {
|
||||
// if we don't constrain the ETag, we might experience read skew
|
||||
assert!(request.get_if_match().is_some(), "if_match must be set");
|
||||
|
||||
let mut buf: SegmentedBuf<Bytes> = SegmentedBuf::new();
|
||||
let mut resp = request.clone().send().await?;
|
||||
let content_length: usize = resp.content_length.unwrap().try_into().unwrap();
|
||||
|
||||
loop {
|
||||
while let Ok(Some(chunk)) = resp.body.try_next().await {
|
||||
buf.push(chunk);
|
||||
}
|
||||
|
||||
if buf.remaining() >= content_length {
|
||||
assert_eq!(buf.remaining(), content_length, "got excess bytes");
|
||||
break Ok(buf);
|
||||
}
|
||||
|
||||
resp = request
|
||||
.clone()
|
||||
.range(format!("bytes={}-", buf.remaining()))
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
assert_ne!(resp.content_range, None);
|
||||
}
|
||||
}
|
||||
2304
contrib/narinfo2parquet/Cargo.lock
generated
Normal file
2304
contrib/narinfo2parquet/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
9308
contrib/narinfo2parquet/Cargo.nix
Normal file
9308
contrib/narinfo2parquet/Cargo.nix
Normal file
File diff suppressed because it is too large
Load diff
28
contrib/narinfo2parquet/Cargo.toml
Normal file
28
contrib/narinfo2parquet/Cargo.toml
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
[package]
|
||||
name = "narinfo2parquet"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# We can't join the //tvix workspace, because that locks zstd
|
||||
# at an ancient version, which is incompatible with polars
|
||||
[workspace]
|
||||
members = ["."]
|
||||
|
||||
[dependencies]
|
||||
anyhow = { version = "1.0.75", features = ["backtrace"] }
|
||||
jemallocator = "0.5.4"
|
||||
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
|
||||
tempfile-fast = "0.3.4"
|
||||
zstd = "0.13.0"
|
||||
|
||||
# See https://github.com/pola-rs/polars/issues/19157
|
||||
hashbrown = { version = "0.14.5", features = ["raw"] }
|
||||
|
||||
[dependencies.polars]
|
||||
version = "0.36.2"
|
||||
default-features = false
|
||||
features = [
|
||||
"parquet",
|
||||
"polars-io",
|
||||
"dtype-categorical"
|
||||
]
|
||||
1
contrib/narinfo2parquet/OWNERS
Normal file
1
contrib/narinfo2parquet/OWNERS
Normal file
|
|
@ -0,0 +1 @@
|
|||
edef
|
||||
11
contrib/narinfo2parquet/default.nix
Normal file
11
contrib/narinfo2parquet/default.nix
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
{ pkgs, depot, ... }:
|
||||
|
||||
(pkgs.callPackage ./Cargo.nix {
|
||||
defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
|
||||
narinfo2parquet = prev: {
|
||||
src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
|
||||
};
|
||||
};
|
||||
}).rootCrate.build.overrideAttrs {
|
||||
meta.ci.extraSteps.crate2nix = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
|
||||
}
|
||||
264
contrib/narinfo2parquet/src/main.rs
Normal file
264
contrib/narinfo2parquet/src/main.rs
Normal file
|
|
@ -0,0 +1,264 @@
|
|||
//! narinfo2parquet operates on a narinfo.zst directory produced by turbofetch.
|
||||
//! It takes the name of a segment file in `narinfo.zst` and writes a Parquet file
|
||||
//! with the same name into the `narinfo.pq` directory.
|
||||
//!
|
||||
//! Run it under GNU Parallel for parallelism:
|
||||
//! ```shell
|
||||
//! mkdir narinfo.pq && ls narinfo.zst | parallel --bar 'narinfo2parquet {}'
|
||||
//! ```
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use jemallocator::Jemalloc;
|
||||
use nix_compat::{
|
||||
narinfo::{self, NarInfo},
|
||||
nixbase32,
|
||||
};
|
||||
use polars::{io::parquet::ParquetWriter, prelude::*};
|
||||
use std::{
|
||||
fs::{self, File},
|
||||
io::{self, BufRead, BufReader, Read},
|
||||
path::Path,
|
||||
};
|
||||
use tempfile_fast::PersistableTempFile;
|
||||
|
||||
#[global_allocator]
|
||||
static GLOBAL: Jemalloc = Jemalloc;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let file_name = std::env::args().nth(1).expect("file name missing");
|
||||
let input_path = Path::new("narinfo.zst").join(&file_name);
|
||||
let output_path = Path::new("narinfo.pq").join(&file_name);
|
||||
|
||||
match fs::metadata(&output_path) {
|
||||
Err(e) if e.kind() == io::ErrorKind::NotFound => {}
|
||||
Err(e) => bail!(e),
|
||||
Ok(_) => bail!("output path already exists: {output_path:?}"),
|
||||
}
|
||||
|
||||
let reader = File::open(input_path).and_then(zstd::Decoder::new)?;
|
||||
let mut frame = FrameBuilder::default();
|
||||
|
||||
for_each(reader, |s| {
|
||||
let entry = NarInfo::parse(&s).context("couldn't parse entry:\n{s}")?;
|
||||
frame.push(&entry);
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
let mut frame = frame.finish();
|
||||
let mut writer = PersistableTempFile::new_in(output_path.parent().unwrap())?;
|
||||
|
||||
ParquetWriter::new(&mut writer)
|
||||
.with_compression(ParquetCompression::Gzip(None))
|
||||
.with_statistics(true)
|
||||
.finish(frame.align_chunks())?;
|
||||
|
||||
writer
|
||||
.persist_noclobber(output_path)
|
||||
.map_err(|e| e.error)
|
||||
.context("couldn't commit output file")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn for_each(reader: impl Read, mut f: impl FnMut(&str) -> Result<()>) -> Result<()> {
|
||||
let mut reader = BufReader::new(reader);
|
||||
let mut group = String::new();
|
||||
loop {
|
||||
let prev_len = group.len();
|
||||
|
||||
if prev_len > 1024 * 1024 {
|
||||
bail!("excessively large segment");
|
||||
}
|
||||
|
||||
reader.read_line(&mut group)?;
|
||||
let (prev, line) = group.split_at(prev_len);
|
||||
|
||||
// EOF
|
||||
if line.is_empty() {
|
||||
break;
|
||||
}
|
||||
|
||||
// skip empty line
|
||||
if line == "\n" {
|
||||
group.pop().unwrap();
|
||||
continue;
|
||||
}
|
||||
|
||||
if !prev.is_empty() && line.starts_with("StorePath:") {
|
||||
f(prev)?;
|
||||
group.drain(..prev_len);
|
||||
}
|
||||
}
|
||||
|
||||
if !group.is_empty() {
|
||||
f(&group)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// [FrameBuilder] builds a [DataFrame] out of [NarInfo]s.
|
||||
/// The exact format is still in flux.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
/// |narinfos: &[NarInfo]| -> DataFrame {
|
||||
/// let frame_builder = FrameBuilder::default();
|
||||
/// narinfos.for_each(|n| frame_builder.push(n));
|
||||
/// frame_builder.finish()
|
||||
/// }
|
||||
/// ```
|
||||
struct FrameBuilder {
|
||||
store_path_hash_str: StringChunkedBuilder,
|
||||
store_path_hash: BinaryChunkedBuilder,
|
||||
store_path_name: StringChunkedBuilder,
|
||||
deriver_hash_str: StringChunkedBuilder,
|
||||
deriver_hash: BinaryChunkedBuilder,
|
||||
deriver_name: StringChunkedBuilder,
|
||||
nar_hash: BinaryChunkedBuilder,
|
||||
nar_size: PrimitiveChunkedBuilder<UInt64Type>,
|
||||
references: ListBinaryChunkedBuilder,
|
||||
ca_algo: CategoricalChunkedBuilder<'static>,
|
||||
ca_hash: BinaryChunkedBuilder,
|
||||
signature: BinaryChunkedBuilder,
|
||||
file_hash: BinaryChunkedBuilder,
|
||||
file_size: PrimitiveChunkedBuilder<UInt64Type>,
|
||||
compression: CategoricalChunkedBuilder<'static>,
|
||||
quirk_references_out_of_order: BooleanChunkedBuilder,
|
||||
quirk_nar_hash_hex: BooleanChunkedBuilder,
|
||||
}
|
||||
|
||||
impl Default for FrameBuilder {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
store_path_hash_str: StringChunkedBuilder::new("store_path_hash_str", 0, 0),
|
||||
store_path_hash: BinaryChunkedBuilder::new("store_path_hash", 0, 0),
|
||||
store_path_name: StringChunkedBuilder::new("store_path_name", 0, 0),
|
||||
deriver_hash_str: StringChunkedBuilder::new("deriver_hash_str", 0, 0),
|
||||
deriver_hash: BinaryChunkedBuilder::new("deriver_hash", 0, 0),
|
||||
deriver_name: StringChunkedBuilder::new("deriver_name", 0, 0),
|
||||
nar_hash: BinaryChunkedBuilder::new("nar_hash", 0, 0),
|
||||
nar_size: PrimitiveChunkedBuilder::new("nar_size", 0),
|
||||
references: ListBinaryChunkedBuilder::new("references", 0, 0),
|
||||
signature: BinaryChunkedBuilder::new("signature", 0, 0),
|
||||
ca_algo: CategoricalChunkedBuilder::new("ca_algo", 0, CategoricalOrdering::Lexical),
|
||||
ca_hash: BinaryChunkedBuilder::new("ca_hash", 0, 0),
|
||||
file_hash: BinaryChunkedBuilder::new("file_hash", 0, 0),
|
||||
file_size: PrimitiveChunkedBuilder::new("file_size", 0),
|
||||
compression: CategoricalChunkedBuilder::new(
|
||||
"compression",
|
||||
0,
|
||||
CategoricalOrdering::Lexical,
|
||||
),
|
||||
quirk_references_out_of_order: BooleanChunkedBuilder::new(
|
||||
"quirk_references_out_of_order",
|
||||
0,
|
||||
),
|
||||
quirk_nar_hash_hex: BooleanChunkedBuilder::new("quirk_nar_hash_hex", 0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FrameBuilder {
|
||||
fn push(&mut self, entry: &NarInfo) {
|
||||
self.store_path_hash_str
|
||||
.append_value(nixbase32::encode(entry.store_path.digest()));
|
||||
self.store_path_hash.append_value(entry.store_path.digest());
|
||||
self.store_path_name.append_value(entry.store_path.name());
|
||||
|
||||
if let Some(deriver) = &entry.deriver {
|
||||
self.deriver_hash_str
|
||||
.append_value(nixbase32::encode(deriver.digest()));
|
||||
self.deriver_hash.append_value(deriver.digest());
|
||||
self.deriver_name.append_value(deriver.name());
|
||||
} else {
|
||||
self.deriver_hash_str.append_null();
|
||||
self.deriver_hash.append_null();
|
||||
self.deriver_name.append_null();
|
||||
}
|
||||
|
||||
self.nar_hash.append_value(&entry.nar_hash);
|
||||
self.nar_size.append_value(entry.nar_size);
|
||||
|
||||
self.references
|
||||
.append_values_iter(entry.references.iter().map(|r| r.digest().as_slice()));
|
||||
|
||||
assert!(entry.signatures.len() <= 1);
|
||||
self.signature
|
||||
.append_option(entry.signatures.get(0).map(|sig| {
|
||||
assert_eq!(sig.name(), &"cache.nixos.org-1");
|
||||
sig.bytes()
|
||||
}));
|
||||
|
||||
if let Some(ca) = &entry.ca {
|
||||
self.ca_algo.append_value(ca.algo_str());
|
||||
self.ca_hash.append_value(ca.hash().digest_as_bytes());
|
||||
} else {
|
||||
self.ca_algo.append_null();
|
||||
self.ca_hash.append_null();
|
||||
}
|
||||
|
||||
let file_hash = entry.file_hash.as_ref().unwrap();
|
||||
let file_size = entry.file_size.unwrap();
|
||||
|
||||
self.file_hash.append_value(file_hash);
|
||||
self.file_size.append_value(file_size);
|
||||
|
||||
let (compression, extension) = match entry.compression {
|
||||
Some("bzip2") => ("bzip2", "bz2"),
|
||||
Some("xz") => ("xz", "xz"),
|
||||
Some("zstd") => ("zstd", "zst"),
|
||||
x => panic!("unknown compression algorithm: {x:?}"),
|
||||
};
|
||||
|
||||
self.compression.append_value(compression);
|
||||
|
||||
let mut file_name = nixbase32::encode(file_hash);
|
||||
file_name.push_str(".nar.");
|
||||
file_name.push_str(extension);
|
||||
|
||||
assert_eq!(entry.url.strip_prefix("nar/").unwrap(), file_name);
|
||||
|
||||
{
|
||||
use narinfo::Flags;
|
||||
|
||||
self.quirk_references_out_of_order
|
||||
.append_value(entry.flags.contains(Flags::REFERENCES_OUT_OF_ORDER));
|
||||
|
||||
self.quirk_nar_hash_hex
|
||||
.append_value(entry.flags.contains(Flags::NAR_HASH_HEX));
|
||||
|
||||
let quirks = Flags::REFERENCES_OUT_OF_ORDER | Flags::NAR_HASH_HEX;
|
||||
let unknown_flags = entry.flags.difference(quirks);
|
||||
|
||||
assert!(
|
||||
unknown_flags.is_empty(),
|
||||
"rejecting flags: {unknown_flags:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn finish(mut self) -> DataFrame {
|
||||
df! {
|
||||
"store_path_hash_str" => self.store_path_hash_str.finish().into_series(),
|
||||
"store_path_hash" => self.store_path_hash.finish().into_series(),
|
||||
"store_path_name" => self.store_path_name.finish().into_series(),
|
||||
"deriver_hash_str" => self.deriver_hash_str.finish().into_series(),
|
||||
"deriver_hash" => self.deriver_hash.finish().into_series(),
|
||||
"deriver_name" => self.deriver_name.finish().into_series(),
|
||||
"nar_hash" => self.nar_hash.finish().into_series(),
|
||||
"nar_size" => self.nar_size.finish().into_series(),
|
||||
"references" => self.references.finish().into_series(),
|
||||
"signature" => self.signature.finish().into_series(),
|
||||
"ca_algo" => self.ca_algo.finish().into_series(),
|
||||
"ca_hash" => self.ca_hash.finish().into_series(),
|
||||
"file_hash" => self.file_hash.finish().into_series(),
|
||||
"file_size" => self.file_size.finish().into_series(),
|
||||
"compression" => self.compression.finish().into_series(),
|
||||
"quirk_references_out_of_order" => self.quirk_references_out_of_order.finish().into_series(),
|
||||
"quirk_nar_hash_hex" => self.quirk_nar_hash_hex.finish().into_series()
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
5
contrib/refscan/.gitignore
vendored
Normal file
5
contrib/refscan/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
# SPDX-FileCopyrightText: edef <edef@edef.eu>
|
||||
# SPDX-License-Identifier: CC0-1.0
|
||||
|
||||
/target
|
||||
**/*.rs.bk
|
||||
7
contrib/refscan/Cargo.lock
generated
Normal file
7
contrib/refscan/Cargo.lock
generated
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "refscan"
|
||||
version = "0.1.0"
|
||||
3
contrib/refscan/Cargo.lock.license
Normal file
3
contrib/refscan/Cargo.lock.license
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
SPDX-FileCopyrightText: edef <edef@edef.eu>
|
||||
SPDX-License-Identifier: CC0-1.0
|
||||
|
||||
10
contrib/refscan/Cargo.toml
Normal file
10
contrib/refscan/Cargo.toml
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
# SPDX-FileCopyrightText: edef <edef@edef.eu>
|
||||
# SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
[package]
|
||||
name = "refscan"
|
||||
version = "0.1.0"
|
||||
authors = ["edef <edef@edef.eu>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
121
contrib/refscan/LICENSES/CC0-1.0.txt
Normal file
121
contrib/refscan/LICENSES/CC0-1.0.txt
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
Creative Commons Legal Code
|
||||
|
||||
CC0 1.0 Universal
|
||||
|
||||
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
|
||||
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
|
||||
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
|
||||
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
|
||||
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
|
||||
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
|
||||
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
|
||||
HEREUNDER.
|
||||
|
||||
Statement of Purpose
|
||||
|
||||
The laws of most jurisdictions throughout the world automatically confer
|
||||
exclusive Copyright and Related Rights (defined below) upon the creator
|
||||
and subsequent owner(s) (each and all, an "owner") of an original work of
|
||||
authorship and/or a database (each, a "Work").
|
||||
|
||||
Certain owners wish to permanently relinquish those rights to a Work for
|
||||
the purpose of contributing to a commons of creative, cultural and
|
||||
scientific works ("Commons") that the public can reliably and without fear
|
||||
of later claims of infringement build upon, modify, incorporate in other
|
||||
works, reuse and redistribute as freely as possible in any form whatsoever
|
||||
and for any purposes, including without limitation commercial purposes.
|
||||
These owners may contribute to the Commons to promote the ideal of a free
|
||||
culture and the further production of creative, cultural and scientific
|
||||
works, or to gain reputation or greater distribution for their Work in
|
||||
part through the use and efforts of others.
|
||||
|
||||
For these and/or other purposes and motivations, and without any
|
||||
expectation of additional consideration or compensation, the person
|
||||
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
|
||||
is an owner of Copyright and Related Rights in the Work, voluntarily
|
||||
elects to apply CC0 to the Work and publicly distribute the Work under its
|
||||
terms, with knowledge of his or her Copyright and Related Rights in the
|
||||
Work and the meaning and intended legal effect of CC0 on those rights.
|
||||
|
||||
1. Copyright and Related Rights. A Work made available under CC0 may be
|
||||
protected by copyright and related or neighboring rights ("Copyright and
|
||||
Related Rights"). Copyright and Related Rights include, but are not
|
||||
limited to, the following:
|
||||
|
||||
i. the right to reproduce, adapt, distribute, perform, display,
|
||||
communicate, and translate a Work;
|
||||
ii. moral rights retained by the original author(s) and/or performer(s);
|
||||
iii. publicity and privacy rights pertaining to a person's image or
|
||||
likeness depicted in a Work;
|
||||
iv. rights protecting against unfair competition in regards to a Work,
|
||||
subject to the limitations in paragraph 4(a), below;
|
||||
v. rights protecting the extraction, dissemination, use and reuse of data
|
||||
in a Work;
|
||||
vi. database rights (such as those arising under Directive 96/9/EC of the
|
||||
European Parliament and of the Council of 11 March 1996 on the legal
|
||||
protection of databases, and under any national implementation
|
||||
thereof, including any amended or successor version of such
|
||||
directive); and
|
||||
vii. other similar, equivalent or corresponding rights throughout the
|
||||
world based on applicable law or treaty, and any national
|
||||
implementations thereof.
|
||||
|
||||
2. Waiver. To the greatest extent permitted by, but not in contravention
|
||||
of, applicable law, Affirmer hereby overtly, fully, permanently,
|
||||
irrevocably and unconditionally waives, abandons, and surrenders all of
|
||||
Affirmer's Copyright and Related Rights and associated claims and causes
|
||||
of action, whether now known or unknown (including existing as well as
|
||||
future claims and causes of action), in the Work (i) in all territories
|
||||
worldwide, (ii) for the maximum duration provided by applicable law or
|
||||
treaty (including future time extensions), (iii) in any current or future
|
||||
medium and for any number of copies, and (iv) for any purpose whatsoever,
|
||||
including without limitation commercial, advertising or promotional
|
||||
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
|
||||
member of the public at large and to the detriment of Affirmer's heirs and
|
||||
successors, fully intending that such Waiver shall not be subject to
|
||||
revocation, rescission, cancellation, termination, or any other legal or
|
||||
equitable action to disrupt the quiet enjoyment of the Work by the public
|
||||
as contemplated by Affirmer's express Statement of Purpose.
|
||||
|
||||
3. Public License Fallback. Should any part of the Waiver for any reason
|
||||
be judged legally invalid or ineffective under applicable law, then the
|
||||
Waiver shall be preserved to the maximum extent permitted taking into
|
||||
account Affirmer's express Statement of Purpose. In addition, to the
|
||||
extent the Waiver is so judged Affirmer hereby grants to each affected
|
||||
person a royalty-free, non transferable, non sublicensable, non exclusive,
|
||||
irrevocable and unconditional license to exercise Affirmer's Copyright and
|
||||
Related Rights in the Work (i) in all territories worldwide, (ii) for the
|
||||
maximum duration provided by applicable law or treaty (including future
|
||||
time extensions), (iii) in any current or future medium and for any number
|
||||
of copies, and (iv) for any purpose whatsoever, including without
|
||||
limitation commercial, advertising or promotional purposes (the
|
||||
"License"). The License shall be deemed effective as of the date CC0 was
|
||||
applied by Affirmer to the Work. Should any part of the License for any
|
||||
reason be judged legally invalid or ineffective under applicable law, such
|
||||
partial invalidity or ineffectiveness shall not invalidate the remainder
|
||||
of the License, and in such case Affirmer hereby affirms that he or she
|
||||
will not (i) exercise any of his or her remaining Copyright and Related
|
||||
Rights in the Work or (ii) assert any associated claims and causes of
|
||||
action with respect to the Work, in either case contrary to Affirmer's
|
||||
express Statement of Purpose.
|
||||
|
||||
4. Limitations and Disclaimers.
|
||||
|
||||
a. No trademark or patent rights held by Affirmer are waived, abandoned,
|
||||
surrendered, licensed or otherwise affected by this document.
|
||||
b. Affirmer offers the Work as-is and makes no representations or
|
||||
warranties of any kind concerning the Work, express, implied,
|
||||
statutory or otherwise, including without limitation warranties of
|
||||
title, merchantability, fitness for a particular purpose, non
|
||||
infringement, or the absence of latent or other defects, accuracy, or
|
||||
the present or absence of errors, whether or not discoverable, all to
|
||||
the greatest extent permissible under applicable law.
|
||||
c. Affirmer disclaims responsibility for clearing rights of other persons
|
||||
that may apply to the Work or any use thereof, including without
|
||||
limitation any person's Copyright and Related Rights in the Work.
|
||||
Further, Affirmer disclaims responsibility for obtaining any necessary
|
||||
consents, permissions or other rights required for any use of the
|
||||
Work.
|
||||
d. Affirmer understands and acknowledges that Creative Commons is not a
|
||||
party to this document and has no duty or obligation with respect to
|
||||
this CC0 or use of the Work.
|
||||
373
contrib/refscan/LICENSES/MPL-2.0.txt
Normal file
373
contrib/refscan/LICENSES/MPL-2.0.txt
Normal file
|
|
@ -0,0 +1,373 @@
|
|||
Mozilla Public License Version 2.0
|
||||
==================================
|
||||
|
||||
1. Definitions
|
||||
--------------
|
||||
|
||||
1.1. "Contributor"
|
||||
means each individual or legal entity that creates, contributes to
|
||||
the creation of, or owns Covered Software.
|
||||
|
||||
1.2. "Contributor Version"
|
||||
means the combination of the Contributions of others (if any) used
|
||||
by a Contributor and that particular Contributor's Contribution.
|
||||
|
||||
1.3. "Contribution"
|
||||
means Covered Software of a particular Contributor.
|
||||
|
||||
1.4. "Covered Software"
|
||||
means Source Code Form to which the initial Contributor has attached
|
||||
the notice in Exhibit A, the Executable Form of such Source Code
|
||||
Form, and Modifications of such Source Code Form, in each case
|
||||
including portions thereof.
|
||||
|
||||
1.5. "Incompatible With Secondary Licenses"
|
||||
means
|
||||
|
||||
(a) that the initial Contributor has attached the notice described
|
||||
in Exhibit B to the Covered Software; or
|
||||
|
||||
(b) that the Covered Software was made available under the terms of
|
||||
version 1.1 or earlier of the License, but not also under the
|
||||
terms of a Secondary License.
|
||||
|
||||
1.6. "Executable Form"
|
||||
means any form of the work other than Source Code Form.
|
||||
|
||||
1.7. "Larger Work"
|
||||
means a work that combines Covered Software with other material, in
|
||||
a separate file or files, that is not Covered Software.
|
||||
|
||||
1.8. "License"
|
||||
means this document.
|
||||
|
||||
1.9. "Licensable"
|
||||
means having the right to grant, to the maximum extent possible,
|
||||
whether at the time of the initial grant or subsequently, any and
|
||||
all of the rights conveyed by this License.
|
||||
|
||||
1.10. "Modifications"
|
||||
means any of the following:
|
||||
|
||||
(a) any file in Source Code Form that results from an addition to,
|
||||
deletion from, or modification of the contents of Covered
|
||||
Software; or
|
||||
|
||||
(b) any new file in Source Code Form that contains any Covered
|
||||
Software.
|
||||
|
||||
1.11. "Patent Claims" of a Contributor
|
||||
means any patent claim(s), including without limitation, method,
|
||||
process, and apparatus claims, in any patent Licensable by such
|
||||
Contributor that would be infringed, but for the grant of the
|
||||
License, by the making, using, selling, offering for sale, having
|
||||
made, import, or transfer of either its Contributions or its
|
||||
Contributor Version.
|
||||
|
||||
1.12. "Secondary License"
|
||||
means either the GNU General Public License, Version 2.0, the GNU
|
||||
Lesser General Public License, Version 2.1, the GNU Affero General
|
||||
Public License, Version 3.0, or any later versions of those
|
||||
licenses.
|
||||
|
||||
1.13. "Source Code Form"
|
||||
means the form of the work preferred for making modifications.
|
||||
|
||||
1.14. "You" (or "Your")
|
||||
means an individual or a legal entity exercising rights under this
|
||||
License. For legal entities, "You" includes any entity that
|
||||
controls, is controlled by, or is under common control with You. For
|
||||
purposes of this definition, "control" means (a) the power, direct
|
||||
or indirect, to cause the direction or management of such entity,
|
||||
whether by contract or otherwise, or (b) ownership of more than
|
||||
fifty percent (50%) of the outstanding shares or beneficial
|
||||
ownership of such entity.
|
||||
|
||||
2. License Grants and Conditions
|
||||
--------------------------------
|
||||
|
||||
2.1. Grants
|
||||
|
||||
Each Contributor hereby grants You a world-wide, royalty-free,
|
||||
non-exclusive license:
|
||||
|
||||
(a) under intellectual property rights (other than patent or trademark)
|
||||
Licensable by such Contributor to use, reproduce, make available,
|
||||
modify, display, perform, distribute, and otherwise exploit its
|
||||
Contributions, either on an unmodified basis, with Modifications, or
|
||||
as part of a Larger Work; and
|
||||
|
||||
(b) under Patent Claims of such Contributor to make, use, sell, offer
|
||||
for sale, have made, import, and otherwise transfer either its
|
||||
Contributions or its Contributor Version.
|
||||
|
||||
2.2. Effective Date
|
||||
|
||||
The licenses granted in Section 2.1 with respect to any Contribution
|
||||
become effective for each Contribution on the date the Contributor first
|
||||
distributes such Contribution.
|
||||
|
||||
2.3. Limitations on Grant Scope
|
||||
|
||||
The licenses granted in this Section 2 are the only rights granted under
|
||||
this License. No additional rights or licenses will be implied from the
|
||||
distribution or licensing of Covered Software under this License.
|
||||
Notwithstanding Section 2.1(b) above, no patent license is granted by a
|
||||
Contributor:
|
||||
|
||||
(a) for any code that a Contributor has removed from Covered Software;
|
||||
or
|
||||
|
||||
(b) for infringements caused by: (i) Your and any other third party's
|
||||
modifications of Covered Software, or (ii) the combination of its
|
||||
Contributions with other software (except as part of its Contributor
|
||||
Version); or
|
||||
|
||||
(c) under Patent Claims infringed by Covered Software in the absence of
|
||||
its Contributions.
|
||||
|
||||
This License does not grant any rights in the trademarks, service marks,
|
||||
or logos of any Contributor (except as may be necessary to comply with
|
||||
the notice requirements in Section 3.4).
|
||||
|
||||
2.4. Subsequent Licenses
|
||||
|
||||
No Contributor makes additional grants as a result of Your choice to
|
||||
distribute the Covered Software under a subsequent version of this
|
||||
License (see Section 10.2) or under the terms of a Secondary License (if
|
||||
permitted under the terms of Section 3.3).
|
||||
|
||||
2.5. Representation
|
||||
|
||||
Each Contributor represents that the Contributor believes its
|
||||
Contributions are its original creation(s) or it has sufficient rights
|
||||
to grant the rights to its Contributions conveyed by this License.
|
||||
|
||||
2.6. Fair Use
|
||||
|
||||
This License is not intended to limit any rights You have under
|
||||
applicable copyright doctrines of fair use, fair dealing, or other
|
||||
equivalents.
|
||||
|
||||
2.7. Conditions
|
||||
|
||||
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
|
||||
in Section 2.1.
|
||||
|
||||
3. Responsibilities
|
||||
-------------------
|
||||
|
||||
3.1. Distribution of Source Form
|
||||
|
||||
All distribution of Covered Software in Source Code Form, including any
|
||||
Modifications that You create or to which You contribute, must be under
|
||||
the terms of this License. You must inform recipients that the Source
|
||||
Code Form of the Covered Software is governed by the terms of this
|
||||
License, and how they can obtain a copy of this License. You may not
|
||||
attempt to alter or restrict the recipients' rights in the Source Code
|
||||
Form.
|
||||
|
||||
3.2. Distribution of Executable Form
|
||||
|
||||
If You distribute Covered Software in Executable Form then:
|
||||
|
||||
(a) such Covered Software must also be made available in Source Code
|
||||
Form, as described in Section 3.1, and You must inform recipients of
|
||||
the Executable Form how they can obtain a copy of such Source Code
|
||||
Form by reasonable means in a timely manner, at a charge no more
|
||||
than the cost of distribution to the recipient; and
|
||||
|
||||
(b) You may distribute such Executable Form under the terms of this
|
||||
License, or sublicense it under different terms, provided that the
|
||||
license for the Executable Form does not attempt to limit or alter
|
||||
the recipients' rights in the Source Code Form under this License.
|
||||
|
||||
3.3. Distribution of a Larger Work
|
||||
|
||||
You may create and distribute a Larger Work under terms of Your choice,
|
||||
provided that You also comply with the requirements of this License for
|
||||
the Covered Software. If the Larger Work is a combination of Covered
|
||||
Software with a work governed by one or more Secondary Licenses, and the
|
||||
Covered Software is not Incompatible With Secondary Licenses, this
|
||||
License permits You to additionally distribute such Covered Software
|
||||
under the terms of such Secondary License(s), so that the recipient of
|
||||
the Larger Work may, at their option, further distribute the Covered
|
||||
Software under the terms of either this License or such Secondary
|
||||
License(s).
|
||||
|
||||
3.4. Notices
|
||||
|
||||
You may not remove or alter the substance of any license notices
|
||||
(including copyright notices, patent notices, disclaimers of warranty,
|
||||
or limitations of liability) contained within the Source Code Form of
|
||||
the Covered Software, except that You may alter any license notices to
|
||||
the extent required to remedy known factual inaccuracies.
|
||||
|
||||
3.5. Application of Additional Terms
|
||||
|
||||
You may choose to offer, and to charge a fee for, warranty, support,
|
||||
indemnity or liability obligations to one or more recipients of Covered
|
||||
Software. However, You may do so only on Your own behalf, and not on
|
||||
behalf of any Contributor. You must make it absolutely clear that any
|
||||
such warranty, support, indemnity, or liability obligation is offered by
|
||||
You alone, and You hereby agree to indemnify every Contributor for any
|
||||
liability incurred by such Contributor as a result of warranty, support,
|
||||
indemnity or liability terms You offer. You may include additional
|
||||
disclaimers of warranty and limitations of liability specific to any
|
||||
jurisdiction.
|
||||
|
||||
4. Inability to Comply Due to Statute or Regulation
|
||||
---------------------------------------------------
|
||||
|
||||
If it is impossible for You to comply with any of the terms of this
|
||||
License with respect to some or all of the Covered Software due to
|
||||
statute, judicial order, or regulation then You must: (a) comply with
|
||||
the terms of this License to the maximum extent possible; and (b)
|
||||
describe the limitations and the code they affect. Such description must
|
||||
be placed in a text file included with all distributions of the Covered
|
||||
Software under this License. Except to the extent prohibited by statute
|
||||
or regulation, such description must be sufficiently detailed for a
|
||||
recipient of ordinary skill to be able to understand it.
|
||||
|
||||
5. Termination
|
||||
--------------
|
||||
|
||||
5.1. The rights granted under this License will terminate automatically
|
||||
if You fail to comply with any of its terms. However, if You become
|
||||
compliant, then the rights granted under this License from a particular
|
||||
Contributor are reinstated (a) provisionally, unless and until such
|
||||
Contributor explicitly and finally terminates Your grants, and (b) on an
|
||||
ongoing basis, if such Contributor fails to notify You of the
|
||||
non-compliance by some reasonable means prior to 60 days after You have
|
||||
come back into compliance. Moreover, Your grants from a particular
|
||||
Contributor are reinstated on an ongoing basis if such Contributor
|
||||
notifies You of the non-compliance by some reasonable means, this is the
|
||||
first time You have received notice of non-compliance with this License
|
||||
from such Contributor, and You become compliant prior to 30 days after
|
||||
Your receipt of the notice.
|
||||
|
||||
5.2. If You initiate litigation against any entity by asserting a patent
|
||||
infringement claim (excluding declaratory judgment actions,
|
||||
counter-claims, and cross-claims) alleging that a Contributor Version
|
||||
directly or indirectly infringes any patent, then the rights granted to
|
||||
You by any and all Contributors for the Covered Software under Section
|
||||
2.1 of this License shall terminate.
|
||||
|
||||
5.3. In the event of termination under Sections 5.1 or 5.2 above, all
|
||||
end user license agreements (excluding distributors and resellers) which
|
||||
have been validly granted by You or Your distributors under this License
|
||||
prior to termination shall survive termination.
|
||||
|
||||
************************************************************************
|
||||
* *
|
||||
* 6. Disclaimer of Warranty *
|
||||
* ------------------------- *
|
||||
* *
|
||||
* Covered Software is provided under this License on an "as is" *
|
||||
* basis, without warranty of any kind, either expressed, implied, or *
|
||||
* statutory, including, without limitation, warranties that the *
|
||||
* Covered Software is free of defects, merchantable, fit for a *
|
||||
* particular purpose or non-infringing. The entire risk as to the *
|
||||
* quality and performance of the Covered Software is with You. *
|
||||
* Should any Covered Software prove defective in any respect, You *
|
||||
* (not any Contributor) assume the cost of any necessary servicing, *
|
||||
* repair, or correction. This disclaimer of warranty constitutes an *
|
||||
* essential part of this License. No use of any Covered Software is *
|
||||
* authorized under this License except under this disclaimer. *
|
||||
* *
|
||||
************************************************************************
|
||||
|
||||
************************************************************************
|
||||
* *
|
||||
* 7. Limitation of Liability *
|
||||
* -------------------------- *
|
||||
* *
|
||||
* Under no circumstances and under no legal theory, whether tort *
|
||||
* (including negligence), contract, or otherwise, shall any *
|
||||
* Contributor, or anyone who distributes Covered Software as *
|
||||
* permitted above, be liable to You for any direct, indirect, *
|
||||
* special, incidental, or consequential damages of any character *
|
||||
* including, without limitation, damages for lost profits, loss of *
|
||||
* goodwill, work stoppage, computer failure or malfunction, or any *
|
||||
* and all other commercial damages or losses, even if such party *
|
||||
* shall have been informed of the possibility of such damages. This *
|
||||
* limitation of liability shall not apply to liability for death or *
|
||||
* personal injury resulting from such party's negligence to the *
|
||||
* extent applicable law prohibits such limitation. Some *
|
||||
* jurisdictions do not allow the exclusion or limitation of *
|
||||
* incidental or consequential damages, so this exclusion and *
|
||||
* limitation may not apply to You. *
|
||||
* *
|
||||
************************************************************************
|
||||
|
||||
8. Litigation
|
||||
-------------
|
||||
|
||||
Any litigation relating to this License may be brought only in the
|
||||
courts of a jurisdiction where the defendant maintains its principal
|
||||
place of business and such litigation shall be governed by laws of that
|
||||
jurisdiction, without reference to its conflict-of-law provisions.
|
||||
Nothing in this Section shall prevent a party's ability to bring
|
||||
cross-claims or counter-claims.
|
||||
|
||||
9. Miscellaneous
|
||||
----------------
|
||||
|
||||
This License represents the complete agreement concerning the subject
|
||||
matter hereof. If any provision of this License is held to be
|
||||
unenforceable, such provision shall be reformed only to the extent
|
||||
necessary to make it enforceable. Any law or regulation which provides
|
||||
that the language of a contract shall be construed against the drafter
|
||||
shall not be used to construe this License against a Contributor.
|
||||
|
||||
10. Versions of the License
|
||||
---------------------------
|
||||
|
||||
10.1. New Versions
|
||||
|
||||
Mozilla Foundation is the license steward. Except as provided in Section
|
||||
10.3, no one other than the license steward has the right to modify or
|
||||
publish new versions of this License. Each version will be given a
|
||||
distinguishing version number.
|
||||
|
||||
10.2. Effect of New Versions
|
||||
|
||||
You may distribute the Covered Software under the terms of the version
|
||||
of the License under which You originally received the Covered Software,
|
||||
or under the terms of any subsequent version published by the license
|
||||
steward.
|
||||
|
||||
10.3. Modified Versions
|
||||
|
||||
If you create software not governed by this License, and you want to
|
||||
create a new license for such software, you may create and use a
|
||||
modified version of this License if you rename the license and remove
|
||||
any references to the name of the license steward (except to note that
|
||||
such modified license differs from this License).
|
||||
|
||||
10.4. Distributing Source Code Form that is Incompatible With Secondary
|
||||
Licenses
|
||||
|
||||
If You choose to distribute Source Code Form that is Incompatible With
|
||||
Secondary Licenses under the terms of this version of the License, the
|
||||
notice described in Exhibit B of this License must be attached.
|
||||
|
||||
Exhibit A - Source Code Form License Notice
|
||||
-------------------------------------------
|
||||
|
||||
This Source Code Form is subject to the terms of the Mozilla Public
|
||||
License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
|
||||
If it is not possible or desirable to put the notice in a particular
|
||||
file, then You may include the notice in a location (such as a LICENSE
|
||||
file in a relevant directory) where a recipient would be likely to look
|
||||
for such a notice.
|
||||
|
||||
You may add additional accurate notices of copyright ownership.
|
||||
|
||||
Exhibit B - "Incompatible With Secondary Licenses" Notice
|
||||
---------------------------------------------------------
|
||||
|
||||
This Source Code Form is "Incompatible With Secondary Licenses", as
|
||||
defined by the Mozilla Public License, v. 2.0.
|
||||
154 contrib/refscan/src/lib.rs Normal file
@@ -0,0 +1,154 @@
// SPDX-FileCopyrightText: edef <edef@edef.eu>
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
use self::simd::u8x32;
|
||||
|
||||
fn prefilter(haystack: u8x32) -> u32 {
|
||||
let alp = haystack.gt(u8x32::splat(b'a' - 1)) & haystack.lt(u8x32::splat(b'z' + 1));
|
||||
let num = haystack.gt(u8x32::splat(b'0' - 1)) & haystack.lt(u8x32::splat(b'9' + 1));
|
||||
alp | num
|
||||
}
|
||||
|
||||
/// scan_clean returns `Err(&buffer[..n])` of known pointer-free data,
|
||||
/// or `Ok(buffer)` if the entire buffer is pointer-free.
|
||||
pub fn scan_clean(buffer: &[u8]) -> Result<&[u8], &[u8]> {
|
||||
let buffer = {
|
||||
let n = buffer.len() & !31;
|
||||
&buffer[..n]
|
||||
};
|
||||
|
||||
let mut masks = buffer
|
||||
.chunks_exact(32)
|
||||
.map(|chunk| prefilter(u8x32::from_slice_unaligned(chunk)))
|
||||
.enumerate()
|
||||
.map(|e| (e.0 * 32, e.1))
|
||||
.peekable();
|
||||
|
||||
while let Some((offset, mask)) = masks.next() {
|
||||
let peek = masks.peek().map(|x| x.1).unwrap_or(!0 >> 1);
|
||||
let n = (!mask).leading_zeros() + (!peek).trailing_zeros();
|
||||
if n >= 32 {
|
||||
let offset = offset + mask.trailing_zeros() as usize;
|
||||
return Err(&buffer[..offset]);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(buffer)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
#[test]
|
||||
fn scan_tail() {
|
||||
let buffer = b"_xfbmj7sl2ikicym9x3yq7cms5qx1w39k";
|
||||
assert_eq!(crate::scan_clean(buffer), Err(&buffer[..1]));
|
||||
}
|
||||
#[test]
|
||||
fn scan_straddle() {
|
||||
let buffer = b"________________xfbmj7sl2ikicym9x3yq7cms5qx1w39k________________";
|
||||
assert_eq!(crate::scan_clean(buffer), Err(&buffer[..16]));
|
||||
}
|
||||
#[test]
|
||||
fn scan_clean() {
|
||||
let buffer = b"x_______________xfbmj7sl2ikicym9x3yq-cms5qx1w3-k________________";
|
||||
assert_eq!(crate::scan_clean(buffer), Ok(&buffer[..]));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
mod simd {
|
||||
#[cfg(target_arch = "x86")]
|
||||
use std::arch::x86 as arch;
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
use std::arch::x86_64 as arch;
|
||||
use {
|
||||
arch::{__m256i, _mm256_cmpgt_epi8, _mm256_movemask_epi8, _mm256_set1_epi8},
|
||||
std::ptr,
|
||||
};
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct u8x32(__m256i);
|
||||
|
||||
impl u8x32 {
|
||||
#[inline(always)]
|
||||
pub fn from_slice_unaligned(slice: &[u8]) -> Self {
|
||||
assert_eq!(slice.len(), 32);
|
||||
u8x32(unsafe { ptr::read_unaligned(slice.as_ptr().cast()) })
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn splat(x: u8) -> Self {
|
||||
u8x32(unsafe { _mm256_set1_epi8(x as i8) })
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gt(self, b: Self) -> u32 {
|
||||
unsafe { _mm256_movemask_epi8(_mm256_cmpgt_epi8(self.0, b.0)) as u32 }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn lt(self, b: Self) -> u32 {
|
||||
b.gt(self)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
mod simd {
|
||||
use std::{
|
||||
arch::aarch64::{
|
||||
uint8x16_t as u8x16, vaddv_u8, vandq_u8, vcgtq_u8, vdupq_n_u8, vget_high_u8,
|
||||
vget_low_u8, vshlq_u8,
|
||||
},
|
||||
mem, ptr,
|
||||
};
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
#[derive(Copy, Clone)]
|
||||
#[repr(transparent)]
|
||||
pub struct u8x32([u8x16; 2]);
|
||||
|
||||
impl u8x32 {
|
||||
#[cfg(target_endian = "little")]
|
||||
#[inline(always)]
|
||||
pub fn from_slice_unaligned(slice: &[u8]) -> Self {
|
||||
assert_eq!(slice.len(), 32);
|
||||
u8x32(unsafe { ptr::read_unaligned(slice.as_ptr().cast()) })
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn splat(x: u8) -> Self {
|
||||
u8x32(unsafe {
|
||||
let x = vdupq_n_u8(x);
|
||||
[x, x]
|
||||
})
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gt(&self, b: Self) -> u32 {
|
||||
let u8x32([al, ah]) = *self;
|
||||
let u8x32([bl, bh]) = b;
|
||||
|
||||
fn f(a: u8x16, b: u8x16) -> u32 {
|
||||
unsafe {
|
||||
let c = vshlq_u8(
|
||||
vandq_u8(vdupq_n_u8(0x80), vcgtq_u8(a, b)),
|
||||
mem::transmute([
|
||||
-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0i8,
|
||||
]),
|
||||
);
|
||||
|
||||
(vaddv_u8(vget_low_u8(c)) as u32) << 0 | (vaddv_u8(vget_high_u8(c)) as u32) << 8
|
||||
}
|
||||
}
|
||||
|
||||
f(al, bl) << 0 | f(ah, bh) << 16
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn lt(self, b: Self) -> u32 {
|
||||
b.gt(self)
|
||||
}
|
||||
}
|
||||
}
|
||||
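For readers on other architectures, here is a minimal, hypothetical scalar sketch (not part of this commit) of the same `simd` interface consumed by `prefilter` above; it mirrors the unsigned-comparison semantics of the aarch64 path rather than the signed `_mm256_cmpgt_epi8` used on x86.

#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
mod simd {
    #[allow(non_camel_case_types)]
    #[derive(Copy, Clone)]
    pub struct u8x32([u8; 32]);

    impl u8x32 {
        pub fn from_slice_unaligned(slice: &[u8]) -> Self {
            assert_eq!(slice.len(), 32);
            let mut buf = [0u8; 32];
            buf.copy_from_slice(slice);
            u8x32(buf)
        }

        pub fn splat(x: u8) -> Self {
            u8x32([x; 32])
        }

        /// One mask bit per lane, set iff `self[i] > b[i]` (unsigned),
        /// matching the movemask convention of the vector paths above.
        pub fn gt(self, b: Self) -> u32 {
            self.0
                .iter()
                .zip(b.0.iter())
                .enumerate()
                .fold(0u32, |mask, (i, (a, b))| mask | (u32::from(a > b) << i))
        }

        pub fn lt(self, b: Self) -> u32 {
            b.gt(self)
        }
    }
}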
58 contrib/refscan/src/main.rs Normal file
@@ -0,0 +1,58 @@
// SPDX-FileCopyrightText: edef <edef@edef.eu>
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
use std::{
|
||||
collections::BTreeSet as Set,
|
||||
convert::TryInto,
|
||||
io::{self, Read},
|
||||
str,
|
||||
};
|
||||
|
||||
fn main() {
|
||||
let max_refs: Set<[u8; 32]> = include_str!("../testdata/maxrefs")
|
||||
.lines()
|
||||
.map(|l| l.as_bytes().try_into().unwrap())
|
||||
.collect();
|
||||
|
||||
let input = {
|
||||
let stdin = io::stdin();
|
||||
let mut buffer = Vec::new();
|
||||
stdin.lock().read_to_end(&mut buffer).unwrap();
|
||||
buffer
|
||||
};
|
||||
|
||||
let base = input.as_ptr() as usize;
|
||||
let mut input: &[u8] = &input;
|
||||
while input.len() >= 32 {
|
||||
match refscan::scan_clean(&input) {
|
||||
Ok(buffer) | Err(buffer) => {
|
||||
let n = buffer.len();
|
||||
input = &input[n..];
|
||||
}
|
||||
}
|
||||
|
||||
let buffer = {
|
||||
let idx = input.iter().position(|x| match x {
|
||||
b'a'..=b'z' | b'0'..=b'9' => false,
|
||||
_ => true,
|
||||
});
|
||||
idx.map(|idx| &input[..idx]).unwrap_or(input)
|
||||
};
|
||||
|
||||
for chunk in buffer.windows(32) {
|
||||
let offset = (chunk.as_ptr() as usize) - base;
|
||||
let chunk = {
|
||||
let mut fixed = [0u8; 32];
|
||||
fixed.copy_from_slice(chunk);
|
||||
fixed
|
||||
};
|
||||
if max_refs.contains(&chunk) {
|
||||
let seen = unsafe { str::from_utf8_unchecked(&chunk) };
|
||||
println!("{} {}", seen, offset);
|
||||
}
|
||||
}
|
||||
|
||||
let n = buffer.len();
|
||||
input = &input[n..];
|
||||
}
|
||||
}
|
||||
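A hypothetical illustration (not part of this commit) of the window matching done in the loop above: every 32-byte window of a candidate run is compared against the `maxrefs` set, so a hit can start at any offset inside the run.

#[test]
fn window_match() {
    let hit: [u8; 32] = *b"xfbmj7sl2ikicym9x3yq7cms5qx1w39k";
    let haystack = b"__xfbmj7sl2ikicym9x3yq7cms5qx1w39k__";
    // The hash starts two bytes into the candidate run.
    let found = haystack.windows(32).position(|w| w == hit.as_slice());
    assert_eq!(found, Some(2));
}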
6 contrib/refscan/testdata/.gitignore vendored Normal file
@@ -0,0 +1,6 @@
# SPDX-FileCopyrightText: edef <edef@edef.eu>
# SPDX-License-Identifier: CC0-1.0

/maxrefs
/nar
/result
8 contrib/refscan/testdata/generate.sh vendored Executable file
@@ -0,0 +1,8 @@
#! /usr/bin/env bash
# SPDX-FileCopyrightText: edef <edef@edef.eu>
# SPDX-License-Identifier: CC0-1.0
set -euo pipefail

drv=$(nix-instantiate '<nixpkgs>' -A ghc)
nix --extra-experimental-features nix-command show-derivation -r "$drv" | jq -r '.[] | .outputs[].path, .inputSrcs[]' | sort -u | cut -d/ -f4 | cut -d- -f1 > maxrefs
nix-store --dump "$(nix-build "$drv")" > nar
1779 contrib/turbofetch/Cargo.lock generated Normal file
File diff suppressed because it is too large
7145 contrib/turbofetch/Cargo.nix Normal file
File diff suppressed because it is too large
28 contrib/turbofetch/Cargo.toml Normal file
@@ -0,0 +1,28 @@
[package]
name = "turbofetch"
version = "0.1.0"
edition = "2021"

# We don't join the //snix workspace, as this is fairly cache.nixos.org-specific.
[workspace]
members = ["."]

[dependencies]
aws_lambda_events = { version = "0.11.1", default-features = false, features = ["lambda_function_urls"] }
bytes = "1.5.0"
data-encoding = "2.6.0"
futures = { version = "0.3.30", default-features = false, features = ["std"] }
httparse = "1.8.0"
hyper = { version = "0.14.27", default-features = false }
lambda_runtime = "0.8.2"
magic-buffer = "0.1.1"
rusoto_core = { version = "0.48.0", features = ["rustls"], default-features = false }
rusoto_s3 = { version = "0.48.0", features = ["rustls"], default-features = false }
serde_json = "1.0.108"
serde = { version = "1.0.190", features = ["derive"] }
tokio = { version = "1.33.0", features = ["full"] }
tower = "0.4.13"
# TODO(edef): zstd = "0.13.0"
zstd = "0.9.0"
tracing-subscriber = { version = "0.3.17", features = ["json"] }
tracing = "0.1.40"
1 contrib/turbofetch/OWNERS Normal file
@@ -0,0 +1 @@
edef
11 contrib/turbofetch/default.nix Normal file
@@ -0,0 +1,11 @@
{ pkgs, depot, ... }:

(pkgs.callPackage ./Cargo.nix {
  defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
    turbofetch = prev: {
      src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
    };
  };
}).rootCrate.build.overrideAttrs {
  meta.ci.extraSteps.crate2nix-check = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
}
5 contrib/turbofetch/deploy.sh Executable file
@@ -0,0 +1,5 @@
#! /usr/bin/env nix-shell
#! nix-shell -i "bash -e"
#! nix-shell -p cargo-lambda
cargo lambda build --release
cargo lambda deploy
83 contrib/turbofetch/src/buffer.rs Normal file
@@ -0,0 +1,83 @@
use magic_buffer::MagicBuffer;
|
||||
use std::cell::Cell;
|
||||
|
||||
/// Buffer is a FIFO queue for bytes, built on a ring buffer.
|
||||
/// It always provides contiguous slices for both the readable and writable parts,
|
||||
/// using an underlying buffer that is "mirrored" in virtual memory.
|
||||
pub struct Buffer {
|
||||
buffer: MagicBuffer,
|
||||
/// first readable byte
|
||||
head: Cell<usize>,
|
||||
/// first writable byte
|
||||
tail: usize,
|
||||
}
|
||||
|
||||
impl Buffer {
|
||||
/// Allocate a fresh buffer, with the specified capacity.
|
||||
/// The buffer can contain at most `capacity - 1` bytes.
|
||||
/// The capacity must be a power of two, and at least [Buffer::min_len].
|
||||
pub fn new(capacity: usize) -> Buffer {
|
||||
Buffer {
|
||||
// MagicBuffer::new verifies that `capacity` is a power of two,
|
||||
// and at least MagicBuffer::min_len().
|
||||
buffer: MagicBuffer::new(capacity).unwrap(),
|
||||
// `head == tail` means the buffer is empty.
|
||||
// In order to ensure that this remains unambiguous,
|
||||
// the buffer can only be filled with capacity-1 bytes.
|
||||
head: Cell::new(0),
|
||||
tail: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the minimum buffer capacity.
|
||||
/// This depends on the operating system and architecture.
|
||||
pub fn min_capacity() -> usize {
|
||||
MagicBuffer::min_len()
|
||||
}
|
||||
|
||||
/// Return the capacity of the buffer.
|
||||
/// This is equal to `self.data().len() + self.space().len() + 1`.
|
||||
pub fn capacity(&self) -> usize {
|
||||
self.buffer.len()
|
||||
}
|
||||
|
||||
/// Return the valid, readable data in the buffer.
|
||||
pub fn data(&self) -> &[u8] {
|
||||
let len = self.buffer.len();
|
||||
let head = self.head.get();
|
||||
|
||||
if head <= self.tail {
|
||||
&self.buffer[head..self.tail]
|
||||
} else {
|
||||
&self.buffer[head..self.tail + len]
|
||||
}
|
||||
}
|
||||
|
||||
/// Mark `read_len` bytes of the readable data as consumed, freeing the space.
|
||||
pub fn consume(&self, read_len: usize) {
|
||||
debug_assert!(read_len <= self.data().len());
|
||||
let mut head = self.head.get();
|
||||
head += read_len;
|
||||
head &= self.buffer.len() - 1;
|
||||
self.head.set(head);
|
||||
}
|
||||
|
||||
/// Return the empty, writable space in the buffer.
|
||||
pub fn space(&mut self) -> &mut [u8] {
|
||||
let len = self.buffer.len();
|
||||
let head = self.head.get();
|
||||
|
||||
if head <= self.tail {
|
||||
&mut self.buffer[self.tail..head + len - 1]
|
||||
} else {
|
||||
&mut self.buffer[self.tail..head - 1]
|
||||
}
|
||||
}
|
||||
|
||||
/// Mark `written_len` bytes of the writable space as valid, readable data.
|
||||
pub fn commit(&mut self, written_len: usize) {
|
||||
debug_assert!(written_len <= self.space().len());
|
||||
self.tail += written_len;
|
||||
self.tail &= self.buffer.len() - 1;
|
||||
}
|
||||
}
|
||||
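A minimal usage sketch of the Buffer API described above (hypothetical, not part of this commit): bytes are written into `space()`, published with `commit()`, observed via `data()`, and released with `consume()`.

use turbofetch::Buffer;

fn buffer_round_trip() {
    // Assumes min_capacity() is itself an acceptable (power-of-two) capacity.
    let mut buffer = Buffer::new(Buffer::min_capacity());

    let payload = b"hello";
    buffer.space()[..payload.len()].copy_from_slice(payload);
    buffer.commit(payload.len());

    assert_eq!(buffer.data(), &payload[..]);
    buffer.consume(payload.len());
    assert!(buffer.data().is_empty());
}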
103 contrib/turbofetch/src/lib.rs Normal file
@@ -0,0 +1,103 @@
use std::{mem::MaybeUninit, str};
|
||||
use tokio::io::{self, AsyncRead, AsyncReadExt};
|
||||
|
||||
pub use buffer::Buffer;
|
||||
mod buffer;
|
||||
|
||||
/// Read as much data into `buffer` as possible.
|
||||
/// Returns [io::ErrorKind::OutOfMemory] if the buffer is already full.
|
||||
async fn slurp(buffer: &mut Buffer, sock: &mut (impl AsyncRead + Unpin)) -> io::Result<()> {
|
||||
match buffer.space() {
|
||||
[] => Err(io::Error::new(io::ErrorKind::OutOfMemory, "buffer filled")),
|
||||
buf => {
|
||||
let n = sock.read(buf).await?;
|
||||
if n == 0 {
|
||||
return Err(io::ErrorKind::UnexpectedEof.into());
|
||||
}
|
||||
buffer.commit(n);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_content_length(headers: &[httparse::Header]) -> io::Result<u64> {
|
||||
for header in headers {
|
||||
if header.name == "Transfer-Encoding" {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Transfer-Encoding is unsupported",
|
||||
));
|
||||
}
|
||||
|
||||
if header.name == "Content-Length" {
|
||||
return str::from_utf8(header.value)
|
||||
.ok()
|
||||
.and_then(|v| v.parse().ok())
|
||||
.ok_or_else(|| {
|
||||
io::Error::new(io::ErrorKind::InvalidData, "invalid Content-Length")
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Content-Length missing",
|
||||
))
|
||||
}
|
||||
|
||||
/// Read an HTTP response from `sock` using `buffer`, returning the response body.
|
||||
/// Returns an error if anything but 200 OK is received.
|
||||
///
|
||||
/// The buffer must have enough space to contain the entire response body.
|
||||
/// If there is not enough space, [io::ErrorKind::OutOfMemory] is returned.
|
||||
///
|
||||
/// The HTTP response must use `Content-Length`, without `Transfer-Encoding`.
|
||||
pub async fn parse_response<'a>(
|
||||
sock: &mut (impl AsyncRead + Unpin),
|
||||
buffer: &'a mut Buffer,
|
||||
) -> io::Result<&'a [u8]> {
|
||||
let body_len = loop {
|
||||
let mut headers = [MaybeUninit::uninit(); 16];
|
||||
let mut response = httparse::Response::new(&mut []);
|
||||
let status = httparse::ParserConfig::default()
|
||||
.parse_response_with_uninit_headers(&mut response, buffer.data(), &mut headers)
|
||||
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
|
||||
|
||||
if let httparse::Status::Complete(n) = status {
|
||||
buffer.consume(n);
|
||||
|
||||
let code = response.code.unwrap();
|
||||
if code != 200 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("HTTP response {code}"),
|
||||
));
|
||||
}
|
||||
|
||||
break get_content_length(response.headers)?;
|
||||
}
|
||||
|
||||
slurp(buffer, sock).await?;
|
||||
};
|
||||
|
||||
let buf_len = buffer.space().len() + buffer.data().len();
|
||||
|
||||
if body_len > buf_len as u64 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::OutOfMemory,
|
||||
"HTTP response body does not fit in buffer",
|
||||
));
|
||||
}
|
||||
|
||||
let body_len = body_len as usize;
|
||||
|
||||
while buffer.data().len() < body_len {
|
||||
slurp(buffer, sock).await?;
|
||||
}
|
||||
|
||||
let data = buffer.data();
|
||||
buffer.consume(body_len);
|
||||
|
||||
Ok(&data[..body_len])
|
||||
}
|
||||
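As a hypothetical illustration of `parse_response` (not part of this commit): tokio implements `AsyncRead` for `&[u8]`, so the parser can be exercised against a canned response entirely in memory.

use turbofetch::{parse_response, Buffer};

#[tokio::main(flavor = "current_thread")]
async fn main() -> std::io::Result<()> {
    let raw: &[u8] = b"HTTP/1.1 200 OK\r\nContent-Length: 5\r\n\r\nhello";
    let mut sock = raw; // &[u8] implements AsyncRead + Unpin

    // Assumes min_capacity() is itself an acceptable capacity.
    let mut buffer = Buffer::new(Buffer::min_capacity());
    let body = parse_response(&mut sock, &mut buffer).await?;
    assert_eq!(body, &b"hello"[..]);
    Ok(())
}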
220 contrib/turbofetch/src/main.rs Normal file
@@ -0,0 +1,220 @@
//! turbofetch is a high-performance bulk S3 object aggregator.
|
||||
//!
|
||||
//! It operates on two S3 buckets: a source bucket (nix-cache), and a
|
||||
//! work bucket defined at runtime. The work bucket contains a job file
|
||||
//! consisting of concatenated 32-character keys, representing narinfo
|
||||
//! files in the source bucket, without the `.narinfo` suffix or any
|
||||
//! other separators.
|
||||
//!
|
||||
//! Each run of turbofetch processes a half-open range of indices from the
|
||||
//! job file, and outputs a zstd stream of concatenated objects, without
|
||||
//! additional separators and in no particular order. These segment files
|
||||
//! are written into the work bucket, named for the range of indices they
|
||||
//! cover. `/narinfo.zst/000000000c380d40-000000000c385b60` covers the 20k
|
||||
//! objects `[0xc380d40, 0xc385b60) = [205000000, 205020000)`. Empirically,
|
||||
//! segment files of 20k objects achieve a compression ratio of 4.7x.
|
||||
//!
|
||||
//! Reassembly is left to narinfo2parquet, which interprets StorePath lines.
|
||||
//!
|
||||
//! TODO(edef): any retries/error handling whatsoever
|
||||
//! Currently, it fails an entire range if anything goes wrong, and doesn't
|
||||
//! write any output.
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures::{stream::FuturesUnordered, Stream, TryStreamExt};
|
||||
use rusoto_core::ByteStream;
|
||||
use rusoto_s3::{GetObjectRequest, PutObjectRequest, S3Client, S3};
|
||||
use serde::Deserialize;
|
||||
use std::{io::Write, mem, ops::Range, ptr};
|
||||
use tokio::{
|
||||
io::{self, AsyncReadExt, AsyncWriteExt},
|
||||
net::TcpStream,
|
||||
};
|
||||
|
||||
/// Fetch a group of keys, streaming concatenated chunks as they arrive from S3.
|
||||
/// `keys` must be a slice from the job file. Any network error at all fails the
|
||||
/// entire batch, and there is no rate limiting.
|
||||
fn fetch(keys: &[[u8; 32]]) -> impl Stream<Item = io::Result<Bytes>> {
|
||||
// S3 supports only HTTP/1.1, but we can ease the pain somewhat by using
|
||||
// HTTP pipelining. It terminates the TCP connection after receiving 100
|
||||
// requests, so we chunk the keys up accordingly, and make one connection
|
||||
// for each chunk.
|
||||
keys.chunks(100)
|
||||
.map(|chunk| {
|
||||
const PREFIX: &[u8] = b"GET /nix-cache/";
|
||||
const SUFFIX: &[u8] = b".narinfo HTTP/1.1\nHost: s3.amazonaws.com\n\n";
|
||||
const LENGTH: usize = PREFIX.len() + 32 + SUFFIX.len();
|
||||
|
||||
let mut request = Vec::with_capacity(LENGTH * 100);
|
||||
for key in chunk {
|
||||
request.extend_from_slice(PREFIX);
|
||||
request.extend_from_slice(key);
|
||||
request.extend_from_slice(SUFFIX);
|
||||
}
|
||||
|
||||
(request, chunk.len())
|
||||
})
|
||||
.map(|(request, n)| async move {
|
||||
let (mut read, mut write) = TcpStream::connect("s3.amazonaws.com:80")
|
||||
.await?
|
||||
.into_split();
|
||||
|
||||
let _handle = tokio::spawn(async move {
|
||||
let request = request;
|
||||
write.write_all(&request).await
|
||||
});
|
||||
|
||||
let mut buffer = turbofetch::Buffer::new(512 * 1024);
|
||||
let mut bodies = vec![];
|
||||
|
||||
for _ in 0..n {
|
||||
let body = turbofetch::parse_response(&mut read, &mut buffer).await?;
|
||||
bodies.extend_from_slice(body);
|
||||
}
|
||||
|
||||
Ok::<_, io::Error>(Bytes::from(bodies))
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>()
|
||||
}
|
||||
|
||||
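// Hypothetical shape check (not part of this commit) of the pipelined request
// built in fetch() above: each key contributes PREFIX + 32 bytes + SUFFIX.
#[cfg(test)]
mod request_shape {
    #[test]
    fn one_key() {
        let key = [b'x'; 32];
        let mut request = Vec::new();
        request.extend_from_slice(b"GET /nix-cache/");
        request.extend_from_slice(&key);
        request.extend_from_slice(b".narinfo HTTP/1.1\nHost: s3.amazonaws.com\n\n");
        // 15 + 32 + 42 bytes, matching LENGTH in fetch().
        assert_eq!(request.len(), 89);
    }
}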
/// Retrieve a range of keys from the job file.
|
||||
async fn get_range(
|
||||
s3: &'static S3Client,
|
||||
bucket: String,
|
||||
key: String,
|
||||
range: Range<u64>,
|
||||
) -> io::Result<Box<[[u8; 32]]>> {
|
||||
let resp = s3
|
||||
.get_object(GetObjectRequest {
|
||||
bucket,
|
||||
key,
|
||||
range: Some(format!("bytes={}-{}", range.start * 32, range.end * 32 - 1)),
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
|
||||
|
||||
let mut body = vec![];
|
||||
resp.body
|
||||
.ok_or(io::ErrorKind::InvalidData)?
|
||||
.into_async_read()
|
||||
.read_to_end(&mut body)
|
||||
.await?;
|
||||
|
||||
let body = exact_chunks(body.into_boxed_slice()).ok_or(io::ErrorKind::InvalidData)?;
|
||||
|
||||
Ok(body)
|
||||
}
|
||||
|
||||
fn exact_chunks(mut buf: Box<[u8]>) -> Option<Box<[[u8; 32]]>> {
|
||||
// SAFETY: We ensure that `buf.len()` is a multiple of 32, and there are no alignment requirements.
|
||||
unsafe {
|
||||
let ptr = buf.as_mut_ptr();
|
||||
let len = buf.len();
|
||||
|
||||
if len % 32 != 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let ptr = ptr as *mut [u8; 32];
|
||||
let len = len / 32;
|
||||
mem::forget(buf);
|
||||
|
||||
Some(Box::from_raw(ptr::slice_from_raw_parts_mut(ptr, len)))
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(edef): factor this out into a separate entry point
|
||||
#[tokio::main(flavor = "current_thread")]
|
||||
async fn main() -> Result<(), lambda_runtime::Error> {
|
||||
let s3 = S3Client::new(rusoto_core::Region::UsEast1);
|
||||
let s3 = &*Box::leak(Box::new(s3));
|
||||
|
||||
tracing_subscriber::fmt()
|
||||
.json()
|
||||
.with_max_level(tracing::Level::INFO)
|
||||
// this needs to be set to remove duplicated information in the log.
|
||||
.with_current_span(false)
|
||||
// this needs to be set to false, otherwise ANSI color codes will
|
||||
// show up in a confusing manner in CloudWatch logs.
|
||||
.with_ansi(false)
|
||||
// disabling time is handy because CloudWatch will add the ingestion time.
|
||||
.without_time()
|
||||
// remove the name of the function from every log entry
|
||||
.with_target(false)
|
||||
.init();
|
||||
|
||||
lambda_runtime::run(lambda_runtime::service_fn(|event| func(s3, event))).await
|
||||
}
|
||||
|
||||
/// Lambda request body
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct Params {
|
||||
work_bucket: String,
|
||||
job_file: String,
|
||||
start: u64,
|
||||
end: u64,
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip(s3, event), fields(req_id = %event.context.request_id))]
|
||||
async fn func(
|
||||
s3: &'static S3Client,
|
||||
event: lambda_runtime::LambdaEvent<
|
||||
aws_lambda_events::lambda_function_urls::LambdaFunctionUrlRequest,
|
||||
>,
|
||||
) -> Result<&'static str, lambda_runtime::Error> {
|
||||
let mut params = event.payload.body.ok_or("no body")?;
|
||||
|
||||
if event.payload.is_base64_encoded {
|
||||
params = String::from_utf8(data_encoding::BASE64.decode(params.as_bytes())?)?;
|
||||
}
|
||||
|
||||
let params: Params = serde_json::from_str(¶ms)?;
|
||||
|
||||
if params.start >= params.end {
|
||||
return Err("nope".into());
|
||||
}
|
||||
|
||||
let keys = get_range(
|
||||
s3,
|
||||
params.work_bucket.clone(),
|
||||
params.job_file.to_owned(),
|
||||
params.start..params.end,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let zchunks = fetch(&keys)
|
||||
.try_fold(
|
||||
Box::new(zstd::Encoder::new(vec![], zstd::DEFAULT_COMPRESSION_LEVEL).unwrap()),
|
||||
|mut w, buf| {
|
||||
w.write_all(&buf).unwrap();
|
||||
async { Ok(w) }
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let zchunks = to_byte_stream(zchunks.finish().unwrap());
|
||||
|
||||
tracing::info!("we got to put_object");
|
||||
|
||||
s3.put_object(PutObjectRequest {
|
||||
bucket: params.work_bucket,
|
||||
key: format!("narinfo.zst/{:016x}-{:016x}", params.start, params.end),
|
||||
body: Some(zchunks),
|
||||
..Default::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
|
||||
|
||||
tracing::info!("… and it worked!");
|
||||
|
||||
Ok("OK")
|
||||
}
|
||||
|
||||
fn to_byte_stream(buffer: Vec<u8>) -> ByteStream {
|
||||
let size_hint = buffer.len();
|
||||
ByteStream::new_with_size(
|
||||
futures::stream::once(async { Ok(buffer.into()) }),
|
||||
size_hint,
|
||||
)
|
||||
}
|
||||
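The segment naming described in the module doc can be checked with a small, hypothetical helper (not part of this commit) that mirrors the `format!` call in `func` above:

fn segment_key(start: u64, end: u64) -> String {
    format!("narinfo.zst/{:016x}-{:016x}", start, end)
}

#[test]
fn example_segment_key() {
    // [0xc380d40, 0xc385b60) = [205_000_000, 205_020_000): 20k objects.
    assert_eq!(
        segment_key(205_000_000, 205_020_000),
        "narinfo.zst/000000000c380d40-000000000c385b60"
    );
}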
2511 contrib/weave/Cargo.lock generated Normal file
File diff suppressed because it is too large
9641 contrib/weave/Cargo.nix Normal file
File diff suppressed because it is too large
23 contrib/weave/Cargo.toml Normal file
@@ -0,0 +1,23 @@
[package]
name = "weave"
version = "0.1.0"
edition = "2021"

[workspace]
members = ["."]

# TODO(edef): cut down on required features, this is kind of a grab bag right now
[dependencies]
anyhow = { version = "1.0.79", features = ["backtrace"] }
hashbrown = "0.14.3"
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
safer_owning_ref = "0.5.0"
rayon = "1.8.1"
rustc-hash = "2.0.0"
snix-tracing = { version = "0.1.0", path = "../../snix/tracing" }
tracing = "0.1.40"
tracing-indicatif = "0.3.6"

[dependencies.polars]
version = "0.36.2"
features = ["parquet", "lazy", "streaming"]
1 contrib/weave/OWNERS Normal file
@@ -0,0 +1 @@
edef
11 contrib/weave/default.nix Normal file
@@ -0,0 +1,11 @@
{ pkgs, depot, ... }:

(pkgs.callPackage ./Cargo.nix {
  defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
    weave = prev: {
      src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
    };
  };
}).rootCrate.build.overrideAttrs {
  meta.ci.extraSteps.crate2nix-check = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
}
118 contrib/weave/src/bin/swizzle.rs Normal file
@@ -0,0 +1,118 @@
//! Swizzle reads a `narinfo.parquet` file, usually produced by `narinfo2parquet`.
|
||||
//!
|
||||
//! It swizzles the reference list, ie it converts the references from absolute,
|
||||
//! global identifiers (store path hashes) to indices into the `store_path_hash`
|
||||
//! column (ie, row numbers), so that we can later walk the reference graph
|
||||
//! efficiently.
|
||||
//!
|
||||
//! Path hashes are represented as non-null, 20-byte `Binary` values.
|
||||
//! The indices are represented as 32-bit unsigned integers, with in-band nulls
|
||||
//! represented by [INDEX_NULL] (the all-1 bit pattern), to permit swizzling
|
||||
//! partial datasets.
|
||||
//!
|
||||
//! In essence, it converts from names to pointers, so that `weave` can simply
|
||||
//! chase pointers to trace the live set. This replaces an `O(log(n))` lookup
|
||||
//! with `O(1)` indexing, and produces a much denser representation that actually
|
||||
//! fits in memory.
|
||||
//!
|
||||
//! The in-memory representation is at least 80% smaller, and the indices compress
|
||||
//! well in Parquet due to both temporal locality of reference and the power law
|
||||
//! distribution of reference "popularity".
|
||||
//!
|
||||
//! Only two columns are read from `narinfo.parquet`:
|
||||
//!
|
||||
//! * `store_path_hash :: PathHash`
|
||||
//! * `references :: List[PathHash]`
|
||||
//!
|
||||
//! Output is written to `narinfo-references.parquet` in the form of a single
|
||||
//! `List[u32]` column, `reference_idxs`.
|
||||
//!
|
||||
//! This file is inherently bound to the corresponding `narinfo.parquet`,
|
||||
//! since it essentially contains pointers into this file.
|
||||
|
||||
use anyhow::Result;
|
||||
use hashbrown::HashTable;
|
||||
use polars::{
|
||||
lazy::dsl::{col, SpecialEq},
|
||||
prelude::*,
|
||||
};
|
||||
use tracing::info_span;
|
||||
use tracing_indicatif::span_ext::IndicatifSpanExt as _;
|
||||
|
||||
use weave::{as_fixed_binary, hash64, leak, load_ph_array, INDEX_NULL};
|
||||
|
||||
#[tracing::instrument]
|
||||
fn main() -> Result<()> {
|
||||
let _tracing = snix_tracing::TracingBuilder::default()
|
||||
.enable_progressbar()
|
||||
.build()?;
|
||||
|
||||
let ph_array: &'static [[u8; 20]] = leak(load_ph_array()?);
|
||||
|
||||
// TODO(edef): re-parallelise this
|
||||
// We originally parallelised on chunks, but ph_array is only a single chunk, due to how Parquet loading works.
|
||||
// TODO(edef): outline the 64-bit hash prefix? it's an indirection, but it saves ~2G of memory
|
||||
let ph_map: &'static HashTable<(u64, u32)> = {
|
||||
let span = info_span!("ph_map", indicatif.pb_show = tracing::field::Empty).entered();
|
||||
span.pb_set_message("build index");
|
||||
span.pb_start();
|
||||
|
||||
let mut ph_map = HashTable::with_capacity(ph_array.len());
|
||||
|
||||
for (offset, item) in ph_array.iter().enumerate() {
|
||||
let offset = offset as u32;
|
||||
let hash = hash64(item);
|
||||
ph_map.insert_unique(hash, (hash, offset), |&(hash, _)| hash);
|
||||
}
|
||||
|
||||
&*Box::leak(Box::new(ph_map))
|
||||
};
|
||||
|
||||
let ph_to_idx = |key: &[u8; 20]| -> u32 {
|
||||
let hash = hash64(key);
|
||||
ph_map
|
||||
.find(hash, |&(candidate_hash, candidate_index)| {
|
||||
candidate_hash == hash && &ph_array[candidate_index as usize] == key
|
||||
})
|
||||
.map(|&(_, index)| index)
|
||||
.unwrap_or(INDEX_NULL)
|
||||
};
|
||||
|
||||
{
|
||||
let span = info_span!("swizzle_refs", indicatif.pb_show = tracing::field::Empty).entered();
|
||||
span.pb_set_message("swizzle references");
|
||||
span.pb_start();
|
||||
|
||||
LazyFrame::scan_parquet("narinfo.parquet", ScanArgsParquet::default())?
|
||||
.with_column(
|
||||
col("references")
|
||||
.map(
|
||||
move |series: Series| -> PolarsResult<Option<Series>> {
|
||||
Ok(Some(
|
||||
series
|
||||
.list()?
|
||||
.apply_to_inner(&|series: Series| -> PolarsResult<Series> {
|
||||
let series = series.binary()?;
|
||||
let mut out: Vec<u32> = Vec::with_capacity(series.len());
|
||||
out.extend(
|
||||
as_fixed_binary(series).flatten().map(ph_to_idx),
|
||||
);
|
||||
Ok(Series::from_vec("reference_idxs", out))
|
||||
})?
|
||||
.into_series(),
|
||||
))
|
||||
},
|
||||
SpecialEq::from_type(DataType::List(DataType::UInt32.into())),
|
||||
)
|
||||
.alias("reference_idxs"),
|
||||
)
|
||||
.select([col("reference_idxs")])
|
||||
.with_streaming(true)
|
||||
.sink_parquet(
|
||||
"narinfo-references.parquet".into(),
|
||||
ParquetWriteOptions::default(),
|
||||
)?;
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
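Conceptually, the swizzle above does the following, shown here as a hypothetical sketch over plain Vecs (no Parquet, no hashbrown, no parallelism; not part of this commit). INDEX_NULL is the same all-ones in-band null described in the module doc.

use std::collections::HashMap;

const INDEX_NULL: u32 = !0;

/// Map each reference (a 20-byte store path hash) to its row number in
/// `store_path_hashes`, or INDEX_NULL if it is not part of the dataset.
fn swizzle(store_path_hashes: &[[u8; 20]], references: &[Vec<[u8; 20]>]) -> Vec<Vec<u32>> {
    let index: HashMap<&[u8; 20], u32> = store_path_hashes
        .iter()
        .enumerate()
        .map(|(row, hash)| (hash, row as u32))
        .collect();

    references
        .iter()
        .map(|refs| {
            refs.iter()
                .map(|r| index.get(r).copied().unwrap_or(INDEX_NULL))
                .collect()
        })
        .collect()
}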
133 contrib/weave/src/lib.rs Normal file
@@ -0,0 +1,133 @@
use anyhow::Result;
|
||||
use owning_ref::{ArcRef, OwningRef};
|
||||
use rayon::prelude::*;
|
||||
use std::{
|
||||
fs::File,
|
||||
mem,
|
||||
ops::{Deref, Range},
|
||||
slice,
|
||||
sync::Arc,
|
||||
};
|
||||
use tracing_indicatif::span_ext::IndicatifSpanExt as _;
|
||||
|
||||
use polars::{
|
||||
datatypes::BinaryChunked,
|
||||
export::arrow::array::BinaryArray,
|
||||
prelude::{ParquetReader, SerReader},
|
||||
};
|
||||
|
||||
/// A shared `[[u8; N]]` backed by a Polars [Buffer].
|
||||
pub type FixedBytes<const N: usize> =
|
||||
ArcRef<'static, polars::export::arrow::buffer::Bytes<u8>, [[u8; N]]>;
|
||||
|
||||
pub const INDEX_NULL: u32 = !0;
|
||||
|
||||
/// A terrific hash function, turning 20 bytes of cryptographic hash
|
||||
/// into 8 bytes of cryptographic hash.
|
||||
pub fn hash64(h: &[u8; 20]) -> u64 {
|
||||
let mut buf = [0; 8];
|
||||
buf.copy_from_slice(&h[..8]);
|
||||
u64::from_ne_bytes(buf)
|
||||
}
|
||||
|
||||
pub fn leak<O, T: ?Sized>(r: OwningRef<Arc<O>, T>) -> &T {
|
||||
// SAFETY: Either `ptr` points into the `Arc`, which lives until `r` is dropped,
|
||||
// or it points at something else entirely which lives at least as long.
|
||||
unsafe {
|
||||
let ptr: *const T = r.deref();
|
||||
mem::forget(r);
|
||||
&*ptr
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a dense `store_path_hash` array from `narinfo.parquet`,
|
||||
/// returning it as an owned [FixedBytes].
|
||||
#[tracing::instrument(fields(indicatif.pb_show = tracing::field::Empty))]
|
||||
pub fn load_ph_array() -> Result<FixedBytes<20>> {
|
||||
let span = tracing::Span::current();
|
||||
|
||||
span.pb_set_message("load store_path_hash");
|
||||
span.pb_start();
|
||||
|
||||
// TODO(edef): this could use a further pushdown, since polars is more hindrance than help here
|
||||
// We know this has to fit in memory (we can't mmap it without further encoding constraints),
|
||||
// and we want a single `Vec<[u8; 20]>` of the data.
|
||||
let ph_array = into_fixed_binary_rechunk::<20>(
|
||||
ParquetReader::new(File::open("narinfo.parquet").unwrap())
|
||||
.with_columns(Some(vec!["store_path_hash".into()]))
|
||||
.set_rechunk(true)
|
||||
.finish()?
|
||||
.column("store_path_hash")?
|
||||
.binary()?,
|
||||
);
|
||||
|
||||
u32::try_from(ph_array.len()).expect("dataset exceeds 2^32");
|
||||
|
||||
Ok(ph_array)
|
||||
}
|
||||
|
||||
/// Iterator over `&[[u8; N]]` from a dense [BinaryChunked].
|
||||
pub fn as_fixed_binary<const N: usize>(
|
||||
chunked: &BinaryChunked,
|
||||
) -> impl DoubleEndedIterator<Item = &[[u8; N]]> {
|
||||
chunked.downcast_iter().map(|array| {
|
||||
let range = assert_fixed_dense::<N>(array);
|
||||
exact_chunks(&array.values()[range]).unwrap()
|
||||
})
|
||||
}
|
||||
|
||||
/// Convert a dense [BinaryChunked] into a single chunk as [FixedBytes],
|
||||
/// without taking a reference to the offsets array and validity bitmap.
|
||||
fn into_fixed_binary_rechunk<const N: usize>(chunked: &BinaryChunked) -> FixedBytes<N> {
|
||||
let chunked = chunked.rechunk();
|
||||
let mut iter = chunked.downcast_iter();
|
||||
let array = iter.next().unwrap();
|
||||
assert!(iter.next().is_none());
|
||||
|
||||
let (buf, off, len) = {
|
||||
let range = assert_fixed_dense::<N>(array);
|
||||
array.values().clone().sliced(range.start, range.len())
|
||||
}
|
||||
.into_inner();
|
||||
|
||||
ArcRef::new(buf).map(|bytes| exact_chunks(&bytes[off..off + len]).unwrap())
|
||||
}
|
||||
|
||||
/// Ensures that the supplied Arrow array consists of densely packed bytestrings of length `N`.
|
||||
/// In other words, ensure that it is free of nulls, and that the offsets have a fixed stride of `N`.
|
||||
#[must_use = "only the range returned is guaranteed to be conformant"]
|
||||
fn assert_fixed_dense<const N: usize>(array: &BinaryArray<i64>) -> Range<usize> {
|
||||
let null_count = array.validity().map_or(0, |bits| bits.unset_bits());
|
||||
if null_count > 0 {
|
||||
panic!("null values present");
|
||||
}
|
||||
|
||||
let offsets = array.offsets();
|
||||
let length_check = offsets
|
||||
.as_slice()
|
||||
.par_windows(2)
|
||||
.all(|w| (w[1] - w[0]) == N as i64);
|
||||
|
||||
if !length_check {
|
||||
panic!("lengths are inconsistent");
|
||||
}
|
||||
|
||||
(*offsets.first() as usize)..(*offsets.last() as usize)
|
||||
}
|
||||
|
||||
fn exact_chunks<const K: usize>(buf: &[u8]) -> Option<&[[u8; K]]> {
|
||||
// SAFETY: We ensure that `buf.len()` is a multiple of K, and there are no alignment requirements.
|
||||
unsafe {
|
||||
let ptr = buf.as_ptr();
|
||||
let len = buf.len();
|
||||
|
||||
if len % K != 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let ptr = ptr as *mut [u8; K];
|
||||
let len = len / K;
|
||||
|
||||
Some(slice::from_raw_parts(ptr, len))
|
||||
}
|
||||
}
|
||||
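A hypothetical test (not part of this commit) illustrating the two helpers above: `exact_chunks` reinterprets a flat byte buffer as fixed-width records, and `hash64` keys a record by its first 8 bytes.

#[cfg(test)]
mod sketch {
    use super::{exact_chunks, hash64};

    #[test]
    fn chunks_and_hash() {
        let buf = [1u8; 40];
        let records: &[[u8; 20]] = exact_chunks(&buf).unwrap();
        assert_eq!(records.len(), 2);
        assert_eq!(hash64(&records[0]), u64::from_ne_bytes([1u8; 8]));
        // 39 bytes is not a whole number of 20-byte records.
        assert!(exact_chunks::<20>(&buf[..39]).is_none());
    }
}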
262 contrib/weave/src/main.rs Normal file
@@ -0,0 +1,262 @@
//! Weave resolves a list of roots from `releases.parquet` against `narinfo.parquet`,
|
||||
//! and then uses the reference graph from the accompanying `narinfo-references.parquet`
|
||||
//! produced by `swizzle` to collect the closure of the roots.
|
||||
//!
|
||||
//! They are written to `live_idxs.parquet`, which only has one column, representing
|
||||
//! the row numbers in `narinfo.parquet` corresponding to live paths.
|
||||
|
||||
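// Hypothetical simplification (not part of this commit) of the mark phase
// implemented below: a plain worklist traversal over the swizzled index graph,
// assuming the reference lists fit in a Vec<Vec<u32>> and the roots resolved.
fn mark_sketch(roots: &[u32], reference_idxs: &[Vec<u32>]) -> std::collections::HashSet<u32> {
    let mut seen: std::collections::HashSet<u32> = roots.iter().copied().collect();
    let mut todo: Vec<u32> = roots.to_vec();
    while let Some(parent) = todo.pop() {
        for &child in &reference_idxs[parent as usize] {
            if child != weave::INDEX_NULL && seen.insert(child) {
                todo.push(child);
            }
        }
    }
    seen
}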
use anyhow::Result;
|
||||
use hashbrown::{hash_table, HashTable};
|
||||
use rayon::prelude::*;
|
||||
use rustc_hash::FxHashSet;
|
||||
use std::{
|
||||
collections::BTreeMap,
|
||||
fs::File,
|
||||
ops::Index,
|
||||
sync::atomic::{AtomicU32, Ordering},
|
||||
};
|
||||
use tracing::{info_span, warn};
|
||||
use tracing_indicatif::span_ext::IndicatifSpanExt;
|
||||
|
||||
use polars::{
|
||||
datatypes::StaticArray,
|
||||
export::arrow::{array::UInt32Array, offset::OffsetsBuffer},
|
||||
lazy::dsl::col,
|
||||
prelude::*,
|
||||
};
|
||||
|
||||
use weave::{as_fixed_binary, hash64, INDEX_NULL};
|
||||
|
||||

#[tracing::instrument]
fn main() -> Result<()> {
    let _tracing = snix_tracing::TracingBuilder::default()
        .enable_progressbar()
        .build()?;

    // Parse the set of root store path hashes out of releases.parquet.
    let roots: PathSet32 = {
        let span = info_span!("parse_roots", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("parse roots");
        span.pb_start();

        as_fixed_binary::<20>(
            LazyFrame::scan_parquet("releases.parquet", ScanArgsParquet::default())?
                .explode([col("store_path_hash")])
                .select([col("store_path_hash")])
                .collect()?
                .column("store_path_hash")?
                .binary()?,
        )
        .flatten()
        .collect()
    };

    // Resolve each root hash to its row index in narinfo.parquet, recording it in the root's slot.
    {
        let span = info_span!("resolve_roots", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("resolve roots");
        span.pb_start();

        weave::load_ph_array()?
            .into_par_iter()
            .enumerate()
            .for_each(|(idx, h)| {
                if let Some(idx_slot) = roots.find(h) {
                    assert_eq!(
                        idx_slot.swap(idx as u32, Ordering::Relaxed),
                        INDEX_NULL,
                        "duplicate entry"
                    );
                }
            });
    }

    // Seed the worklist with all resolved roots; unresolved ones are skipped with a warning.
    let mut todo = FxHashSet::default();
    todo.reserve(roots.len());
    {
        let mut unknown_roots = 0usize;
        for (_, idx) in roots.table {
            let idx = idx.into_inner();
            if idx == INDEX_NULL {
                unknown_roots += 1;
                continue;
            }
            todo.insert(idx);
        }

        if unknown_roots != 0 {
            warn!("skipping {unknown_roots} unknown roots");
        }
    }

    // The ListChunked must outlive the ChunkedList built from it, which only borrows its
    // offset buffers and value slices; hence the extra outer binding.
    let ri_array;
    let ri_array = {
        let span = info_span!(
            "load_reference_idxs",
            indicatif.pb_show = tracing::field::Empty
        )
        .entered();
        span.pb_set_message("load reference_idxs");
        span.pb_start();

        ri_array = ParquetReader::new(File::open("narinfo-references.parquet")?)
            .finish()?
            .column("reference_idxs")?
            .list()?
            .clone();

        ChunkedList::new(ri_array.downcast_iter().map(|chunk| {
            (
                chunk.offsets(),
                chunk
                    .values()
                    .as_any()
                    .downcast_ref::<UInt32Array>()
                    .unwrap()
                    .as_slice()
                    .unwrap(),
            )
        }))
    };

    // Mark phase: breadth-first traversal of the reference graph, expanding the frontier in parallel.
    let mut seen = todo.clone();
    {
        let span = info_span!("mark", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("marking");
        span.pb_set_style(&snix_tracing::PB_PROGRESS_STYLE);

        while !todo.is_empty() {
            span.pb_set_length(seen.len() as u64);
            span.pb_set_position(seen.len().saturating_sub(todo.len()) as u64);

            // Collect the not-yet-seen children of the current frontier.
            todo = todo
                .par_iter()
                .flat_map(|&parent| {
                    if parent == INDEX_NULL {
                        return FxHashSet::default();
                    }

                    ri_array[parent as usize]
                        .iter()
                        .cloned()
                        .filter(|child| !seen.contains(child))
                        .collect::<FxHashSet<u32>>()
                })
                .collect();

            for &index in &todo {
                seen.insert(index);
            }
        }

        span.pb_set_length(seen.len() as u64);
        span.pb_set_position(seen.len() as u64);

        if seen.remove(&INDEX_NULL) {
            warn!("WARNING: missing edges");
        }
    }

    let seen = {
        let span = info_span!("gather_live", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("gathering live set");

        let mut seen: Vec<u32> = seen.into_iter().collect();
        seen.par_sort();
        seen
    };

    {
        let span = info_span!("write_output", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("writing output");
        span.pb_start();

        ParquetWriter::new(File::create("live_idxs.parquet")?).finish(&mut df! {
            "live_idx" => seen,
        }?)?;
    }

    Ok(())
}

/// A hash set of 20-byte store path hashes, where each entry carries an atomically
/// updatable row index (initially `INDEX_NULL`).
struct PathSet32 {
    table: HashTable<([u8; 20], AtomicU32)>,
}

impl PathSet32 {
    fn with_capacity(capacity: usize) -> Self {
        Self {
            table: HashTable::with_capacity(capacity),
        }
    }

    /// Insert a hash, returning `false` if it was already present.
    fn insert(&mut self, value: &[u8; 20]) -> bool {
        let hash = hash64(value);

        match self
            .table
            .entry(hash, |(x, _)| x == value, |(x, _)| hash64(x))
        {
            hash_table::Entry::Occupied(_) => false,
            hash_table::Entry::Vacant(entry) => {
                entry.insert((*value, AtomicU32::new(INDEX_NULL)));
                true
            }
        }
    }

    /// Look up a hash, returning its index slot if present.
    fn find(&self, value: &[u8; 20]) -> Option<&AtomicU32> {
        let hash = hash64(value);
        self.table
            .find(hash, |(x, _)| x == value)
            .as_ref()
            .map(|(_, x)| x)
    }

    fn len(&self) -> usize {
        self.table.len()
    }
}

impl<'a> FromIterator<&'a [u8; 20]> for PathSet32 {
    fn from_iter<T: IntoIterator<Item = &'a [u8; 20]>>(iter: T) -> Self {
        let iter = iter.into_iter();
        let mut this = Self::with_capacity(iter.size_hint().0);

        for item in iter {
            this.insert(item);
        }

        this.table.shrink_to_fit(|(x, _)| hash64(x));
        this
    }
}

/// A flat, read-only view over the per-chunk list offsets and values of a `ListChunked`,
/// indexable by global row number.
struct ChunkedList<'a, T> {
    by_offset: BTreeMap<usize, (&'a OffsetsBuffer<i64>, &'a [T])>,
}

impl<'a, T> ChunkedList<'a, T> {
    fn new(chunks: impl IntoIterator<Item = (&'a OffsetsBuffer<i64>, &'a [T])>) -> Self {
        let mut next_offset = 0usize;
        ChunkedList {
            by_offset: chunks
                .into_iter()
                .map(|(offsets, values)| {
                    let offset = next_offset;
                    next_offset = next_offset.checked_add(offsets.len_proxy()).unwrap();

                    (offset, (offsets, values))
                })
                .collect(),
        }
    }
}

impl<'a, T> Index<usize> for ChunkedList<'a, T> {
    type Output = [T];

    fn index(&self, index: usize) -> &Self::Output {
        // Find the chunk whose starting row is the greatest one not exceeding `index`,
        // then slice its values using the chunk-local offsets.
        let (&base, &(offsets, values)) = self.by_offset.range(..=index).next_back().unwrap();
        let (start, end) = offsets.start_end(index - base);
        &values[start..end]
    }
}
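
// Illustrative sketch only (not part of this change): a minimal test exercising the
// PathSet32 insert/find contract relied on by main() above. The concrete key bytes and
// the swapped-in index value are made up for the example.
#[cfg(test)]
mod path_set32_sketch {
    use super::*;

    #[test]
    fn insert_then_find_resolves_slot() {
        let mut set = PathSet32::with_capacity(1);
        let key = [0x42u8; 20];

        // First insertion succeeds, duplicates are rejected.
        assert!(set.insert(&key));
        assert!(!set.insert(&key));

        // A freshly inserted entry is unresolved (INDEX_NULL) until a row index is swapped
        // in, mirroring what the resolve_roots phase does.
        let slot = set.find(&key).expect("key should be present");
        assert_eq!(slot.swap(7, Ordering::Relaxed), INDEX_NULL);

        // Unknown keys are simply absent.
        assert!(set.find(&[0u8; 20]).is_none());
    }
}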