chore(users/edef): move to contrib

Change-Id: I1a6972fab8ada26917f29607fc401e376d634070
This commit is contained in:
Florian Klink 2025-03-17 12:41:31 +00:00
parent a7916624dc
commit 403d8fc897
55 changed files with 15 additions and 17 deletions

2511
contrib/weave/Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

9641
contrib/weave/Cargo.nix Normal file

File diff suppressed because it is too large Load diff

23
contrib/weave/Cargo.toml Normal file
View file

@ -0,0 +1,23 @@
[package]
name = "weave"
version = "0.1.0"
edition = "2021"
[workspace]
members = ["."]
# TODO(edef): cut down on required features, this is kind of a grab bag right now
[dependencies]
anyhow = { version = "1.0.79", features = ["backtrace"] }
hashbrown = "0.14.3"
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
safer_owning_ref = "0.5.0"
rayon = "1.8.1"
rustc-hash = "2.0.0"
snix-tracing = { version = "0.1.0", path = "../../snix/tracing" }
tracing = "0.1.40"
tracing-indicatif = "0.3.6"
[dependencies.polars]
version = "0.36.2"
features = ["parquet", "lazy", "streaming"]

1
contrib/weave/OWNERS Normal file
View file

@ -0,0 +1 @@
edef

11
contrib/weave/default.nix Normal file
View file

@ -0,0 +1,11 @@
# Builds the `weave` crate by calling `./Cargo.nix` and taking its root crate build.
{ pkgs, depot, ... }:
(pkgs.callPackage ./Cargo.nix {
# Start from the shared snix crate overrides for `pkgs` and add one for `weave`.
defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
weave = prev: {
# Replace the crate source with `filterRustCrateSrc` applied to the original source root.
src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
};
};
}).rootCrate.build.overrideAttrs {
# Registers an extra CI step derived from `./Cargo.nix`.
# NOTE(review): presumably this checks that Cargo.nix is in sync with the Cargo
# manifests — confirm against `mkCrate2nixCheck`.
meta.ci.extraSteps.crate2nix-check = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
}

View file

@ -0,0 +1,118 @@
//! Swizzle reads a `narinfo.parquet` file, usually produced by `narinfo2parquet`.
//!
//! It swizzles the reference list, i.e. it converts the references from absolute,
//! global identifiers (store path hashes) to indices into the `store_path_hash`
//! column (i.e. row numbers), so that we can later walk the reference graph
//! efficiently.
//!
//! Path hashes are represented as non-null, 20-byte `Binary` values.
//! The indices are represented as 32-bit unsigned integers, with in-band nulls
//! represented by [INDEX_NULL] (the all-1 bit pattern), to permit swizzling
//! partial datasets.
//!
//! In essence, it converts from names to pointers, so that `weave` can simply
//! chase pointers to trace the live set. This replaces an `O(log(n))` lookup
//! with `O(1)` indexing, and produces a much denser representation that actually
//! fits in memory.
//!
//! The in-memory representation is at least 80% smaller, and the indices compress
//! well in Parquet due to both temporal locality of reference and the power law
//! distribution of reference "popularity".
//!
//! Only two columns are read from `narinfo.parquet`:
//!
//! * `store_path_hash :: PathHash`
//! * `references :: List[PathHash]`
//!
//! Output is written to `narinfo-references.parquet` in the form of a single
//! `List[u32]` column, `reference_idxs`.
//!
//! This file is inherently bound to the corresponding `narinfo.parquet`,
//! since it essentially contains pointers into this file.
use anyhow::Result;
use hashbrown::HashTable;
use polars::{
lazy::dsl::{col, SpecialEq},
prelude::*,
};
use tracing::info_span;
use tracing_indicatif::span_ext::IndicatifSpanExt as _;
use weave::{as_fixed_binary, hash64, leak, load_ph_array, INDEX_NULL};
/// Entry point of the swizzle tool.
///
/// Loads the `store_path_hash` column, builds a lookup table from path hash to
/// row index, rewrites every entry of the `references` column of
/// `narinfo.parquet` into a row index, and writes the result to
/// `narinfo-references.parquet` as the `reference_idxs` column.
///
/// Returns an error if tracing setup, loading, or the polars pipeline fails.
#[tracing::instrument]
fn main() -> Result<()> {
// Keep the tracing handle bound for the whole run.
let _tracing = snix_tracing::TracingBuilder::default()
.enable_progressbar()
.build()?;
// Leak the loaded array to obtain a `'static` slice that the closures below can capture.
let ph_array: &'static [[u8; 20]] = leak(load_ph_array()?);
// TODO(edef): re-parallelise this
// We originally parallelised on chunks, but ph_array is only a single chunk, due to how Parquet loading works.
// TODO(edef): outline the 64-bit hash prefix? it's an indirection, but it saves ~2G of memory
// Table entries are `(hash64 of the path hash, row index into ph_array)`.
let ph_map: &'static HashTable<(u64, u32)> = {
let span = info_span!("ph_map", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("build index");
span.pb_start();
let mut ph_map = HashTable::with_capacity(ph_array.len());
for (offset, item) in ph_array.iter().enumerate() {
// `load_ph_array` has already checked that the array length fits in a `u32`.
let offset = offset as u32;
let hash = hash64(item);
// NOTE(review): `insert_unique` is documented as not checking for an existing
// equal entry, so this relies on `store_path_hash` being free of duplicates — confirm.
ph_map.insert_unique(hash, (hash, offset), |&(hash, _)| hash);
}
// Leaked as well, so that the table reference is `'static` like `ph_array`.
&*Box::leak(Box::new(ph_map))
};
// Resolves a path hash to its row index, or `INDEX_NULL` when it is not in `ph_array`.
let ph_to_idx = |key: &[u8; 20]| -> u32 {
let hash = hash64(key);
ph_map
.find(hash, |&(candidate_hash, candidate_index)| {
// Compare the stored 64-bit hash first, then the full 20 bytes in `ph_array`.
candidate_hash == hash && &ph_array[candidate_index as usize] == key
})
.map(|&(_, index)| index)
.unwrap_or(INDEX_NULL)
};
{
let span = info_span!("swizzle_refs", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("swizzle references");
span.pb_start();
LazyFrame::scan_parquet("narinfo.parquet", ScanArgsParquet::default())?
.with_column(
col("references")
.map(
move |series: Series| -> PolarsResult<Option<Series>> {
Ok(Some(
series
.list()?
// Rewrite only the inner values of each list, converting every
// 20-byte path hash into its `u32` row index.
.apply_to_inner(&|series: Series| -> PolarsResult<Series> {
let series = series.binary()?;
let mut out: Vec<u32> = Vec::with_capacity(series.len());
out.extend(
as_fixed_binary(series).flatten().map(ph_to_idx),
);
Ok(Series::from_vec("reference_idxs", out))
})?
.into_series(),
))
},
// Declared output type of the mapped column: a list of `u32`.
SpecialEq::from_type(DataType::List(DataType::UInt32.into())),
)
.alias("reference_idxs"),
)
// Only the swizzled column is written out.
.select([col("reference_idxs")])
.with_streaming(true)
.sink_parquet(
"narinfo-references.parquet".into(),
ParquetWriteOptions::default(),
)?;
};
Ok(())
}

133
contrib/weave/src/lib.rs Normal file
View file

@ -0,0 +1,133 @@
use anyhow::Result;
use owning_ref::{ArcRef, OwningRef};
use rayon::prelude::*;
use std::{
fs::File,
mem,
ops::{Deref, Range},
slice,
sync::Arc,
};
use tracing_indicatif::span_ext::IndicatifSpanExt as _;
use polars::{
datatypes::BinaryChunked,
export::arrow::array::BinaryArray,
prelude::{ParquetReader, SerReader},
};
/// A shared `[[u8; N]]` backed by a Polars [Buffer].
pub type FixedBytes<const N: usize> =
ArcRef<'static, polars::export::arrow::buffer::Bytes<u8>, [[u8; N]]>;
/// In-band null marker for `u32` row indices: the all-ones bit pattern (`!0`).
pub const INDEX_NULL: u32 = !0;
/// Derives a 64-bit table hash from a 20-byte hash by reinterpreting its
/// first eight bytes as a native-endian `u64`.
///
/// The remaining twelve bytes are ignored.
pub fn hash64(h: &[u8; 20]) -> u64 {
    let [b0, b1, b2, b3, b4, b5, b6, b7, ..] = *h;
    u64::from_ne_bytes([b0, b1, b2, b3, b4, b5, b6, b7])
}
/// Converts an owning reference into a plain borrow by never releasing the owner.
///
/// The owner `r` is passed to [`mem::forget`], so its destructor never runs and
/// whatever it keeps alive is intentionally leaked.
pub fn leak<O, T: ?Sized>(r: OwningRef<Arc<O>, T>) -> &T {
    let target: *const T = &*r;
    // Skip `r`'s destructor, so the `Arc` it holds is never released.
    mem::forget(r);
    // SAFETY: Either `target` points into the `Arc`, which would live until `r` is
    // dropped (and `r` was just forgotten rather than dropped), or it points at
    // something else entirely which lives at least as long.
    unsafe { &*target }
}
/// Read a dense `store_path_hash` array from `narinfo.parquet`,
/// returning it as an owned [FixedBytes].
///
/// # Errors
///
/// Returns an error if `narinfo.parquet` cannot be opened, if reading it fails,
/// or if the `store_path_hash` column is missing or is not a binary column.
///
/// # Panics
///
/// Panics if the column contains nulls or values that are not exactly 20 bytes
/// long, or if the number of rows does not fit in a `u32`.
#[tracing::instrument(fields(indicatif.pb_show = tracing::field::Empty))]
pub fn load_ph_array() -> Result<FixedBytes<20>> {
    let span = tracing::Span::current();
    span.pb_set_message("load store_path_hash");
    span.pb_start();

    // TODO(edef): this could use a further pushdown, since polars is more hindrance than help here
    // We know this has to fit in memory (we can't mmap it without further encoding constraints),
    // and we want a single `Vec<[u8; 20]>` of the data.
    let ph_array = into_fixed_binary_rechunk::<20>(
        // A missing or unreadable input file is an ordinary runtime failure, so it is
        // propagated to the caller instead of panicking.
        ParquetReader::new(File::open("narinfo.parquet")?)
            .with_columns(Some(vec!["store_path_hash".into()]))
            .set_rechunk(true)
            .finish()?
            .column("store_path_hash")?
            .binary()?,
    );

    // Row numbers are later stored as `u32`, so the row count has to fit in one.
    u32::try_from(ph_array.len()).expect("dataset exceeds 2^32");

    Ok(ph_array)
}
/// Yields one `&[[u8; N]]` per chunk of a dense [BinaryChunked].
///
/// Each chunk is validated with `assert_fixed_dense` when it is visited, so the
/// iterator panics on a chunk that contains nulls or has values of a length other than `N`.
pub fn as_fixed_binary<const N: usize>(
    chunked: &BinaryChunked,
) -> impl DoubleEndedIterator<Item = &[[u8; N]]> {
    chunked.downcast_iter().map(|array| {
        let dense = assert_fixed_dense::<N>(array);
        let bytes = &array.values()[dense];
        exact_chunks(bytes).unwrap()
    })
}
/// Rechunks a dense [BinaryChunked] into one chunk and returns its value bytes as
/// [FixedBytes], holding on to the values buffer only (not the offsets array or
/// the validity bitmap).
///
/// Panics if the rechunked array does not consist of exactly one chunk, or if that
/// chunk fails the `assert_fixed_dense` check.
fn into_fixed_binary_rechunk<const N: usize>(chunked: &BinaryChunked) -> FixedBytes<N> {
    let rechunked = chunked.rechunk();

    let mut arrays = rechunked.downcast_iter();
    let array = arrays.next().unwrap();
    assert!(arrays.next().is_none());

    // Restrict the values buffer to the validated byte range before taking it apart.
    let dense = assert_fixed_dense::<N>(array);
    let values = array.values().clone().sliced(dense.start, dense.len());
    let (buf, off, len) = values.into_inner();

    ArcRef::new(buf).map(|bytes| exact_chunks(&bytes[off..off + len]).unwrap())
}
/// Checks that an Arrow binary array is a dense sequence of `N`-byte strings and
/// returns the byte range of the values buffer that those strings occupy.
///
/// Panics with "null values present" if the validity bitmap has any unset bit, and
/// with "lengths are inconsistent" if any two adjacent offsets differ by something
/// other than `N`.
#[must_use = "only the range returned is guaranteed to be conformant"]
fn assert_fixed_dense<const N: usize>(array: &BinaryArray<i64>) -> Range<usize> {
    let null_count = match array.validity() {
        Some(bits) => bits.unset_bits(),
        None => 0,
    };
    assert!(null_count == 0, "null values present");

    let offsets = array.offsets();
    let stride = N as i64;
    let uniform = offsets
        .as_slice()
        .par_windows(2)
        .all(|pair| (pair[1] - pair[0]) == stride);
    assert!(uniform, "lengths are inconsistent");

    let start = *offsets.first() as usize;
    let end = *offsets.last() as usize;
    start..end
}
/// Reinterprets `buf` as a slice of `K`-byte arrays without copying.
///
/// Returns `None` when `buf.len()` is not a multiple of `K`, or when `K` is zero
/// (a zero-sized element gives no way to derive an element count from the byte length).
fn exact_chunks<const K: usize>(buf: &[u8]) -> Option<&[[u8; K]]> {
    // Check `K == 0` first so the remainder below can never divide by zero.
    if K == 0 || buf.len() % K != 0 {
        return None;
    }

    let count = buf.len() / K;
    let ptr = buf.as_ptr().cast::<[u8; K]>();

    // SAFETY: `ptr` comes from a live `&[u8]` of exactly `count * K` bytes, so it is
    // valid for reads of `count` elements of `[u8; K]`; `[u8; K]` has alignment 1,
    // so there are no alignment requirements; and the returned slice borrows `buf`,
    // so it cannot outlive the underlying bytes.
    Some(unsafe { slice::from_raw_parts(ptr, count) })
}

262
contrib/weave/src/main.rs Normal file
View file

@ -0,0 +1,262 @@
//! Weave resolves a list of roots from `releases.parquet` against `narinfo.parquet`,
//! and then uses the reference graph from the accompanying `narinfo-references.parquet`
//! produced by `swizzle` to collect the closure of the roots.
//!
//! The resulting live set is written to `live_idxs.parquet`, which has a single
//! column holding the row numbers in `narinfo.parquet` that correspond to live paths.
use anyhow::Result;
use hashbrown::{hash_table, HashTable};
use rayon::prelude::*;
use rustc_hash::FxHashSet;
use std::{
collections::BTreeMap,
fs::File,
ops::Index,
sync::atomic::{AtomicU32, Ordering},
};
use tracing::{info_span, warn};
use tracing_indicatif::span_ext::IndicatifSpanExt;
use polars::{
datatypes::StaticArray,
export::arrow::{array::UInt32Array, offset::OffsetsBuffer},
lazy::dsl::col,
prelude::*,
};
use weave::{as_fixed_binary, hash64, INDEX_NULL};
/// Entry point of the weave tool.
///
/// Reads the root path hashes from `releases.parquet`, resolves them to row
/// numbers in `narinfo.parquet`, follows the edges stored in
/// `narinfo-references.parquet` until no new rows are reached, and writes the
/// sorted set of reached row numbers to `live_idxs.parquet`.
///
/// Returns an error if tracing setup or any of the file or polars operations fail.
#[tracing::instrument]
fn main() -> Result<()> {
// Keep the tracing handle bound for the whole run.
let _tracing = snix_tracing::TracingBuilder::default()
.enable_progressbar()
.build()?;
// Collect every root path hash into a set; each entry's index slot starts as `INDEX_NULL`.
let roots: PathSet32 = {
let span = info_span!("parse_roots", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("parse roots");
span.pb_start();
as_fixed_binary::<20>(
LazyFrame::scan_parquet("releases.parquet", ScanArgsParquet::default())?
.explode([col("store_path_hash")])
.select([col("store_path_hash")])
.collect()?
.column("store_path_hash")?
.binary()?,
)
.flatten()
.collect()
};
{
let span = info_span!("resolve_roots", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("resolve roots");
span.pb_start();
// Scan all path hashes in parallel; when one is a root, record its row number in the
// root's slot.
weave::load_ph_array()?
.into_par_iter()
.enumerate()
.for_each(|(idx, h)| {
if let Some(idx_slot) = roots.find(h) {
// The previous slot value must still be `INDEX_NULL`; anything else means the
// same path hash occurred in more than one row.
assert_eq!(
idx_slot.swap(idx as u32, Ordering::Relaxed),
INDEX_NULL,
"duplicate entry"
);
}
});
}
// Initial work set: the row numbers of all roots that were resolved.
let mut todo = FxHashSet::default();
todo.reserve(roots.len());
{
let mut unknown_roots = 0usize;
for (_, idx) in roots.table {
let idx = idx.into_inner();
// A slot still holding `INDEX_NULL` was never matched during the scan above.
if idx == INDEX_NULL {
unknown_roots += 1;
continue;
}
todo.insert(idx);
}
if unknown_roots != 0 {
warn!("skipping {unknown_roots} unknown roots");
}
}
// Declared before the block so the loaded column outlives it: the `ChunkedList`
// built inside the block borrows from this value.
let ri_array;
let ri_array = {
let span = info_span!(
"load_reference_idxs",
indicatif.pb_show = tracing::field::Empty
)
.entered();
span.pb_set_message("load reference_idxs");
span.pb_start();
ri_array = ParquetReader::new(File::open("narinfo-references.parquet")?)
.finish()?
.column("reference_idxs")?
.list()?
.clone();
// For every chunk, pair its list offsets with its flat `u32` values.
ChunkedList::new(ri_array.downcast_iter().map(|chunk| {
(
chunk.offsets(),
chunk
.values()
.as_any()
.downcast_ref::<UInt32Array>()
.unwrap()
.as_slice()
.unwrap(),
)
}))
};
// `seen` accumulates every row number reached so far, starting with the roots.
let mut seen = todo.clone();
{
let span = info_span!("mark", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("marking");
span.pb_set_style(&snix_tracing::PB_PROGRESS_STYLE);
// Each round replaces `todo` with the not-yet-seen children of the current `todo`.
while !todo.is_empty() {
span.pb_set_length(seen.len() as u64);
span.pb_set_position(seen.len().saturating_sub(todo.len()) as u64);
todo = todo
.par_iter()
.flat_map(|&parent| {
// `INDEX_NULL` can enter `todo` as a child value; it has no row to look up.
if parent == INDEX_NULL {
return FxHashSet::default();
}
ri_array[parent as usize]
.iter()
.cloned()
.filter(|child| !seen.contains(child))
.collect::<FxHashSet<u32>>()
})
.collect();
for &index in &todo {
seen.insert(index);
}
}
span.pb_set_length(seen.len() as u64);
span.pb_set_position(seen.len() as u64);
// If `INDEX_NULL` was reached, some reference had no resolved row; drop it and warn.
if seen.remove(&INDEX_NULL) {
warn!("WARNING: missing edges");
}
}
// Turn the set into a sorted vector of row numbers.
let seen = {
let span = info_span!("gather_live", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("gathering live set");
let mut seen: Vec<u32> = seen.into_iter().collect();
seen.par_sort();
seen
};
{
let span = info_span!("write_output", indicatif.pb_show = tracing::field::Empty).entered();
span.pb_set_message("writing output");
span.pb_start();
// The output file has a single column, `live_idx`.
ParquetWriter::new(File::create("live_idxs.parquet")?).finish(&mut df! {
"live_idx" => seen,
}?)?;
}
Ok(())
}
/// A set of 20-byte path hashes, each paired with an atomically updatable `u32` slot.
///
/// Entries are hashed with [`hash64`]; a freshly inserted entry's slot holds
/// [`INDEX_NULL`].
struct PathSet32 {
    table: HashTable<([u8; 20], AtomicU32)>,
}

impl PathSet32 {
    /// Creates an empty set whose table is pre-sized for `capacity` entries.
    fn with_capacity(capacity: usize) -> Self {
        let table = HashTable::with_capacity(capacity);
        Self { table }
    }

    /// Adds `value` with a slot of [`INDEX_NULL`] unless it is already present.
    ///
    /// Returns `true` when the value was newly inserted and `false` when it already existed
    /// (in which case the existing slot is left untouched).
    fn insert(&mut self, value: &[u8; 20]) -> bool {
        let entry = self.table.entry(
            hash64(value),
            |(existing, _)| existing == value,
            |(existing, _)| hash64(existing),
        );

        match entry {
            hash_table::Entry::Vacant(vacant) => {
                vacant.insert((*value, AtomicU32::new(INDEX_NULL)));
                true
            }
            hash_table::Entry::Occupied(_) => false,
        }
    }

    /// Returns the slot stored alongside `value`, or `None` when `value` is not in the set.
    fn find(&self, value: &[u8; 20]) -> Option<&AtomicU32> {
        let (_, slot) = self
            .table
            .find(hash64(value), |(candidate, _)| candidate == value)?;
        Some(slot)
    }

    /// Number of entries in the set.
    fn len(&self) -> usize {
        self.table.len()
    }
}
/// Builds a [`PathSet32`] from borrowed path hashes, ignoring repeated values.
impl<'a> FromIterator<&'a [u8; 20]> for PathSet32 {
    fn from_iter<T: IntoIterator<Item = &'a [u8; 20]>>(iter: T) -> Self {
        let items = iter.into_iter();

        // Pre-size from the iterator's lower size bound, then trim once everything is in.
        let (lower_bound, _) = items.size_hint();
        let mut set = Self::with_capacity(lower_bound);
        items.for_each(|path_hash| {
            set.insert(path_hash);
        });
        set.table.shrink_to_fit(|(path_hash, _)| hash64(path_hash));

        set
    }
}
/// A read-only view over several list chunks that can be indexed by a global row number.
///
/// Each chunk is an offsets buffer plus its flat values; chunks are keyed by the
/// global row number of their first list.
struct ChunkedList<'a, T> {
    by_offset: BTreeMap<usize, (&'a OffsetsBuffer<i64>, &'a [T])>,
}

impl<'a, T> ChunkedList<'a, T> {
    /// Builds the view from `(offsets, values)` pairs given in row order.
    ///
    /// Panics if the running total of rows overflows `usize`.
    fn new(chunks: impl IntoIterator<Item = (&'a OffsetsBuffer<i64>, &'a [T])>) -> Self {
        let mut by_offset = BTreeMap::new();
        let mut chunk_base = 0usize;

        for (offsets, values) in chunks {
            let rows = offsets.len_proxy();
            by_offset.insert(chunk_base, (offsets, values));
            chunk_base = chunk_base.checked_add(rows).unwrap();
        }

        ChunkedList { by_offset }
    }
}

impl<'a, T> Index<usize> for ChunkedList<'a, T> {
    type Output = [T];

    /// Returns the list stored at global row `index`.
    ///
    /// Panics if there is no chunk starting at or before `index`.
    fn index(&self, index: usize) -> &Self::Output {
        // The chunk holding `index` is the one with the greatest starting row <= `index`.
        let (&chunk_base, &(offsets, values)) =
            self.by_offset.range(..=index).next_back().unwrap();

        let row_in_chunk = index - chunk_base;
        let (start, end) = offsets.start_end(row_in_chunk);
        &values[start..end]
    }
}