feat(tvix/tools/crunch-v2): init

This is a tool for ingesting subsets of cache.nixos.org into its own
flattened castore format. Currently, produced chunks are not preserved,
and this purely serves as a way of measuring compression/deduplication
ratios for various chunking and compression parameters.

Change-Id: I3983af02a66f7837d76874ee0fc8b2fab62ac17e
Reviewed-on: https://cl.tvl.fyi/c/depot/+/10486
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
parent e0a1c03b24
commit 4f22203a3a

12 changed files with 15022 additions and 0 deletions
tvix/tools/crunch-v2/src/bin/extract.rs (new file, 139 lines)

@@ -0,0 +1,139 @@
//! This tool lossily converts a Sled database produced by crunch-v2 into a Parquet file for analysis.
//! The resulting `crunch.parquet` has columns `file_hash`, `nar_hash`, and `chunk`.
//! The first two are SHA-256 hashes of the compressed file and the NAR it decompresses to.
//! `chunk` is a struct array corresponding to [crunch_v2::proto::Chunk] messages.
//! They are concatenated without any additional structure, so nothing but the chunk list is preserved.
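//! Each `chunk` element is a struct with fields `hash`, `size`, and
//! `size_compressed`, matching the schema declared in `FrameBuilder::new` below.
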
use anyhow::Result;
use indicatif::{ProgressBar, ProgressStyle};
use std::fs::File;

use crunch_v2::{
    proto::{self, path::Node},
    FILES,
};
use prost::Message;

use polars::{
    chunked_array::builder::AnonymousOwnedListBuilder,
    prelude::{
        df, BinaryChunkedBuilder, ChunkedBuilder, DataFrame, DataType, Field, ListBuilderTrait,
        NamedFrom, ParquetWriter, PrimitiveChunkedBuilder, Series, UInt32Type,
    },
    series::IntoSeries,
};

fn main() -> Result<()> {
    let w = ParquetWriter::new(File::create("crunch.parquet")?);

    let progress = ProgressBar::new(FILES.len() as u64).with_style(ProgressStyle::with_template(
        "{elapsed_precise}/{duration_precise} {wide_bar} {pos}/{len}",
    )?);

    // Decode each (file_hash -> proto::Path) entry in the Sled tree into one output row.
    let mut frame = FrameBuilder::new();
    for entry in &*FILES {
        let (file_hash, pb) = entry?;
        frame.push(
            file_hash[..].try_into().unwrap(),
            proto::Path::decode(&pb[..])?,
        );
        progress.inc(1);
    }

    w.finish(&mut frame.finish())?;

    Ok(())
}

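/// Accumulates the output frame: one row per ingested file, holding the
/// compressed file's hash, its NAR hash, and the flattened list of its chunks.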
struct FrameBuilder {
    file_hash: BinaryChunkedBuilder,
    nar_hash: BinaryChunkedBuilder,
    chunk: AnonymousOwnedListBuilder,
}

impl FrameBuilder {
    fn new() -> Self {
        Self {
            file_hash: BinaryChunkedBuilder::new("file_hash", 0, 0),
            nar_hash: BinaryChunkedBuilder::new("nar_hash", 0, 0),
            chunk: AnonymousOwnedListBuilder::new(
                "chunk",
                0,
                Some(DataType::Struct(vec![
                    Field::new("hash", DataType::Binary),
                    Field::new("size", DataType::UInt32),
                    Field::new("size_compressed", DataType::UInt32),
                ])),
            ),
        }
    }

    fn push(&mut self, file_hash: [u8; 32], pb: proto::Path) {
        self.file_hash.append_value(&file_hash[..]);
        self.nar_hash.append_value(pb.nar_hash);
        self.chunk
            .append_series(&ChunkFrameBuilder::new(pb.node.unwrap()))
            .unwrap();
    }

    fn finish(mut self) -> DataFrame {
        df! {
            "file_hash" => self.file_hash.finish().into_series(),
            "nar_hash" => self.nar_hash.finish().into_series(),
            "chunk" => self.chunk.finish().into_series()
        }
        .unwrap()
    }
}

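/// Builds the `chunk` column entry for a single file tree: a struct series
/// with one element per chunk, in the order the nodes are traversed.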
struct ChunkFrameBuilder {
    hash: BinaryChunkedBuilder,
    size: PrimitiveChunkedBuilder<UInt32Type>,
    size_compressed: PrimitiveChunkedBuilder<UInt32Type>,
}

impl ChunkFrameBuilder {
    fn new(node: proto::path::Node) -> Series {
        let mut this = Self {
            hash: BinaryChunkedBuilder::new("hash", 0, 0),
            size: PrimitiveChunkedBuilder::new("size", 0),
            size_compressed: PrimitiveChunkedBuilder::new("size_compressed", 0),
        };

        this.push(node);
        this.finish()
    }

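    /// Recursively walks the node tree, appending one row per chunk of each
    /// file. Directories only recurse into their children; symlinks carry no
    /// chunks.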
    fn push(&mut self, node: Node) {
        match node {
            Node::Directory(node) => {
                for node in node.files {
                    self.push(Node::File(node));
                }

                for node in node.directories {
                    self.push(Node::Directory(node));
                }
            }
            Node::File(node) => {
                for chunk in node.chunks {
                    self.hash.append_value(&chunk.hash);
                    self.size.append_value(chunk.size);
                    self.size_compressed.append_value(chunk.size_compressed);
                }
            }
            // Symlinks have no contents, so they contribute no chunks.
            Node::Symlink(_) => {}
        }
    }

    fn finish(self) -> Series {
        df! {
            "hash" => self.hash.finish().into_series(),
            "size" => self.size.finish().into_series(),
            "size_compressed" => self.size_compressed.finish().into_series()
        }
        .unwrap()
        .into_struct("chunk")
        .into_series()
    }
}
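
Since the produced chunks themselves are discarded, the measurement the commit
message refers to happens entirely on crunch.parquet. As a sketch of what that
analysis can look like (not part of this commit; it assumes polars with the
lazy, parquet, and dtype-struct features, and the LazyFrame::unique signature
may differ between polars versions):

use polars::prelude::*;

fn main() -> PolarsResult<()> {
    // One row per chunk across all files: the chunk hash and its size.
    let chunks = LazyFrame::scan_parquet("crunch.parquet", Default::default())?
        .explode([col("chunk")])
        .select([
            col("chunk").struct_().field_by_name("hash").alias("hash"),
            col("chunk")
                .struct_()
                .field_by_name("size")
                .cast(DataType::UInt64)
                .alias("size"),
        ]);

    // Sum of all chunk sizes vs. the sum over distinct chunk hashes only.
    let total = chunks.clone().select([col("size").sum()]).collect()?;
    let deduped = chunks
        .unique(Some(vec!["hash".into()]), UniqueKeepStrategy::First)
        .select([col("size").sum()])
        .collect()?;

    println!("total: {total}");
    println!("deduplicated: {deduped}");
    Ok(())
}

Comparing the two sums across runs with different chunking and compression
parameters yields the deduplication ratios this tool is meant to measure.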