chore(users/edef): move to contrib
Change-Id: I1a6972fab8ada26917f29607fc401e376d634070
parent a7916624dc
commit 403d8fc897
55 changed files with 15 additions and 17 deletions

contrib/weave/Cargo.lock | 2511 (generated, new file)
File diff suppressed because it is too large

contrib/weave/Cargo.nix | 9641 (new file)
File diff suppressed because it is too large

contrib/weave/Cargo.toml | 23 (new file)
@@ -0,0 +1,23 @@
[package]
name = "weave"
version = "0.1.0"
edition = "2021"

[workspace]
members = ["."]

# TODO(edef): cut down on required features, this is kind of a grab bag right now
[dependencies]
anyhow = { version = "1.0.79", features = ["backtrace"] }
hashbrown = "0.14.3"
nix-compat = { version = "0.1.0", path = "../../snix/nix-compat" }
safer_owning_ref = "0.5.0"
rayon = "1.8.1"
rustc-hash = "2.0.0"
snix-tracing = { version = "0.1.0", path = "../../snix/tracing" }
tracing = "0.1.40"
tracing-indicatif = "0.3.6"

[dependencies.polars]
version = "0.36.2"
features = ["parquet", "lazy", "streaming"]
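
Note: the polars feature selection above maps onto the APIs the new sources use: "parquet" and "lazy" for LazyFrame::scan_parquet, and "streaming" for the out-of-core sink. As a minimal sketch (not part of the commit; the output path copy.parquet is made up), the same feature set suffices for a streaming Parquet-to-Parquet pass:

use polars::prelude::*;

fn copy_parquet() -> PolarsResult<()> {
    // scan_parquet needs the "parquet" and "lazy" features from the manifest above.
    LazyFrame::scan_parquet("narinfo.parquet", ScanArgsParquet::default())?
        // Out-of-core execution, enabled by the "streaming" feature.
        .with_streaming(true)
        .sink_parquet("copy.parquet".into(), ParquetWriteOptions::default())
}

fn main() {
    copy_parquet().unwrap();
}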

contrib/weave/OWNERS | 1 (new file)
@@ -0,0 +1 @@
edef

contrib/weave/default.nix | 11 (new file)
@@ -0,0 +1,11 @@
{ pkgs, depot, ... }:

(pkgs.callPackage ./Cargo.nix {
  defaultCrateOverrides = (depot.snix.utils.defaultCrateOverridesForPkgs pkgs) // {
    weave = prev: {
      src = depot.snix.utils.filterRustCrateSrc { root = prev.src.origSrc; };
    };
  };
}).rootCrate.build.overrideAttrs {
  meta.ci.extraSteps.crate2nix-check = depot.snix.utils.mkCrate2nixCheck ./Cargo.nix;
}

contrib/weave/src/bin/swizzle.rs | 118 (new file)
@@ -0,0 +1,118 @@
//! Swizzle reads a `narinfo.parquet` file, usually produced by `narinfo2parquet`.
//!
//! It swizzles the reference list, i.e. it converts the references from absolute,
//! global identifiers (store path hashes) to indices into the `store_path_hash`
//! column (i.e., row numbers), so that we can later walk the reference graph
//! efficiently.
//!
//! Path hashes are represented as non-null, 20-byte `Binary` values.
//! The indices are represented as 32-bit unsigned integers, with in-band nulls
//! represented by [INDEX_NULL] (the all-1 bit pattern), to permit swizzling
//! partial datasets.
//!
//! In essence, it converts from names to pointers, so that `weave` can simply
//! chase pointers to trace the live set. This replaces an `O(log(n))` lookup
//! with `O(1)` indexing, and produces a much denser representation that actually
//! fits in memory.
//!
//! The in-memory representation is at least 80% smaller, and the indices compress
//! well in Parquet due to both temporal locality of reference and the power law
//! distribution of reference "popularity".
//!
//! Only two columns are read from `narinfo.parquet`:
//!
//! * `store_path_hash :: PathHash`
//! * `references :: List[PathHash]`
//!
//! Output is written to `narinfo-references.parquet` in the form of a single
//! `List[u32]` column, `reference_idxs`.
//!
//! This file is inherently bound to the corresponding `narinfo.parquet`,
//! since it essentially contains pointers into that file.

use anyhow::Result;
use hashbrown::HashTable;
use polars::{
    lazy::dsl::{col, SpecialEq},
    prelude::*,
};
use tracing::info_span;
use tracing_indicatif::span_ext::IndicatifSpanExt as _;

use weave::{as_fixed_binary, hash64, leak, load_ph_array, INDEX_NULL};

#[tracing::instrument]
fn main() -> Result<()> {
    let _tracing = snix_tracing::TracingBuilder::default()
        .enable_progressbar()
        .build()?;

    let ph_array: &'static [[u8; 20]] = leak(load_ph_array()?);

    // TODO(edef): re-parallelise this
    // We originally parallelised on chunks, but ph_array is only a single chunk, due to how Parquet loading works.
    // TODO(edef): outline the 64-bit hash prefix? it's an indirection, but it saves ~2G of memory
    let ph_map: &'static HashTable<(u64, u32)> = {
        let span = info_span!("ph_map", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("build index");
        span.pb_start();

        let mut ph_map = HashTable::with_capacity(ph_array.len());

        for (offset, item) in ph_array.iter().enumerate() {
            let offset = offset as u32;
            let hash = hash64(item);
            ph_map.insert_unique(hash, (hash, offset), |&(hash, _)| hash);
        }

        &*Box::leak(Box::new(ph_map))
    };

    let ph_to_idx = |key: &[u8; 20]| -> u32 {
        let hash = hash64(key);
        ph_map
            .find(hash, |&(candidate_hash, candidate_index)| {
                candidate_hash == hash && &ph_array[candidate_index as usize] == key
            })
            .map(|&(_, index)| index)
            .unwrap_or(INDEX_NULL)
    };

    {
        let span = info_span!("swizzle_refs", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("swizzle references");
        span.pb_start();

        LazyFrame::scan_parquet("narinfo.parquet", ScanArgsParquet::default())?
            .with_column(
                col("references")
                    .map(
                        move |series: Series| -> PolarsResult<Option<Series>> {
                            Ok(Some(
                                series
                                    .list()?
                                    .apply_to_inner(&|series: Series| -> PolarsResult<Series> {
                                        let series = series.binary()?;
                                        let mut out: Vec<u32> = Vec::with_capacity(series.len());
                                        out.extend(
                                            as_fixed_binary(series).flatten().map(ph_to_idx),
                                        );
                                        Ok(Series::from_vec("reference_idxs", out))
                                    })?
                                    .into_series(),
                            ))
                        },
                        SpecialEq::from_type(DataType::List(DataType::UInt32.into())),
                    )
                    .alias("reference_idxs"),
            )
            .select([col("reference_idxs")])
            .with_streaming(true)
            .sink_parquet(
                "narinfo-references.parquet".into(),
                ParquetWriteOptions::default(),
            )?;
    };

    Ok(())
}
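
Note: stripped of the Parquet plumbing, the ph_map / ph_to_idx pair above is a small standalone pattern. A minimal sketch (not part of the commit; the three-entry ph_array is invented for illustration) using the same hashbrown::HashTable layout, keyed on a 64-bit prefix with the full 20-byte comparison resolving prefix collisions:

use hashbrown::HashTable;

const INDEX_NULL: u32 = !0;

fn hash64(h: &[u8; 20]) -> u64 {
    u64::from_ne_bytes(h[..8].try_into().unwrap())
}

fn main() {
    // Invented stand-in for the store_path_hash column.
    let ph_array: Vec<[u8; 20]> = vec![[1; 20], [2; 20], [3; 20]];

    // Build the index: 64-bit prefix hash -> row number.
    let mut ph_map: HashTable<(u64, u32)> = HashTable::with_capacity(ph_array.len());
    for (offset, item) in ph_array.iter().enumerate() {
        let hash = hash64(item);
        ph_map.insert_unique(hash, (hash, offset as u32), |&(h, _)| h);
    }

    // Swizzle one reference: name -> pointer, with INDEX_NULL for misses.
    let lookup = |key: &[u8; 20]| -> u32 {
        let hash = hash64(key);
        ph_map
            .find(hash, |&(h, i)| h == hash && &ph_array[i as usize] == key)
            .map(|&(_, i)| i)
            .unwrap_or(INDEX_NULL)
    };

    assert_eq!(lookup(&[2; 20]), 1);
    assert_eq!(lookup(&[9; 20]), INDEX_NULL);
}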

contrib/weave/src/lib.rs | 133 (new file)
@@ -0,0 +1,133 @@
use anyhow::Result;
use owning_ref::{ArcRef, OwningRef};
use rayon::prelude::*;
use std::{
    fs::File,
    mem,
    ops::{Deref, Range},
    slice,
    sync::Arc,
};
use tracing_indicatif::span_ext::IndicatifSpanExt as _;

use polars::{
    datatypes::BinaryChunked,
    export::arrow::array::BinaryArray,
    prelude::{ParquetReader, SerReader},
};

/// A shared `[[u8; N]]` backed by a Polars [Buffer].
pub type FixedBytes<const N: usize> =
    ArcRef<'static, polars::export::arrow::buffer::Bytes<u8>, [[u8; N]]>;

pub const INDEX_NULL: u32 = !0;

/// A terrific hash function, turning 20 bytes of cryptographic hash
/// into 8 bytes of cryptographic hash.
pub fn hash64(h: &[u8; 20]) -> u64 {
    let mut buf = [0; 8];
    buf.copy_from_slice(&h[..8]);
    u64::from_ne_bytes(buf)
}

pub fn leak<O, T: ?Sized>(r: OwningRef<Arc<O>, T>) -> &T {
    // SAFETY: Either `ptr` points into the `Arc`, which lives until `r` is dropped,
    // or it points at something else entirely which lives at least as long.
    unsafe {
        let ptr: *const T = r.deref();
        mem::forget(r);
        &*ptr
    }
}

/// Read a dense `store_path_hash` array from `narinfo.parquet`,
/// returning it as an owned [FixedBytes].
#[tracing::instrument(fields(indicatif.pb_show = tracing::field::Empty))]
pub fn load_ph_array() -> Result<FixedBytes<20>> {
    let span = tracing::Span::current();

    span.pb_set_message("load store_path_hash");
    span.pb_start();

    // TODO(edef): this could use a further pushdown, since polars is more hindrance than help here
    // We know this has to fit in memory (we can't mmap it without further encoding constraints),
    // and we want a single `Vec<[u8; 20]>` of the data.
    let ph_array = into_fixed_binary_rechunk::<20>(
        ParquetReader::new(File::open("narinfo.parquet").unwrap())
            .with_columns(Some(vec!["store_path_hash".into()]))
            .set_rechunk(true)
            .finish()?
            .column("store_path_hash")?
            .binary()?,
    );

    u32::try_from(ph_array.len()).expect("dataset exceeds 2^32");

    Ok(ph_array)
}

/// Iterator over `&[[u8; N]]` from a dense [BinaryChunked].
pub fn as_fixed_binary<const N: usize>(
    chunked: &BinaryChunked,
) -> impl DoubleEndedIterator<Item = &[[u8; N]]> {
    chunked.downcast_iter().map(|array| {
        let range = assert_fixed_dense::<N>(array);
        exact_chunks(&array.values()[range]).unwrap()
    })
}

/// Convert a dense [BinaryChunked] into a single chunk as [FixedBytes],
/// without taking a reference to the offsets array and validity bitmap.
fn into_fixed_binary_rechunk<const N: usize>(chunked: &BinaryChunked) -> FixedBytes<N> {
    let chunked = chunked.rechunk();
    let mut iter = chunked.downcast_iter();
    let array = iter.next().unwrap();
    assert!(iter.next().is_none());

    let (buf, off, len) = {
        let range = assert_fixed_dense::<N>(array);
        array.values().clone().sliced(range.start, range.len())
    }
    .into_inner();

    ArcRef::new(buf).map(|bytes| exact_chunks(&bytes[off..off + len]).unwrap())
}

/// Ensures that the supplied Arrow array consists of densely packed bytestrings of length `N`.
/// In other words, ensure that it is free of nulls, and that the offsets have a fixed stride of `N`.
#[must_use = "only the range returned is guaranteed to be conformant"]
fn assert_fixed_dense<const N: usize>(array: &BinaryArray<i64>) -> Range<usize> {
    let null_count = array.validity().map_or(0, |bits| bits.unset_bits());
    if null_count > 0 {
        panic!("null values present");
    }

    let offsets = array.offsets();
    let length_check = offsets
        .as_slice()
        .par_windows(2)
        .all(|w| (w[1] - w[0]) == N as i64);

    if !length_check {
        panic!("lengths are inconsistent");
    }

    (*offsets.first() as usize)..(*offsets.last() as usize)
}

fn exact_chunks<const K: usize>(buf: &[u8]) -> Option<&[[u8; K]]> {
    // SAFETY: We ensure that `buf.len()` is a multiple of K, and there are no alignment requirements.
    unsafe {
        let ptr = buf.as_ptr();
        let len = buf.len();

        if len % K != 0 {
            return None;
        }

        let ptr = ptr as *mut [u8; K];
        let len = len / K;

        Some(slice::from_raw_parts(ptr, len))
    }
}
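
Note: exact_chunks above is the one place lib.rs hands out a reinterpreted slice via raw pointers. A safe but copying equivalent, useful as a mental model for what the unsafe block guarantees (a standalone sketch, not part of the commit):

/// Split a dense byte buffer into fixed-width records, rejecting ragged input.
/// Unlike the zero-copy version above, this allocates and copies.
fn exact_chunks_copying<const K: usize>(buf: &[u8]) -> Option<Vec<[u8; K]>> {
    if buf.len() % K != 0 {
        return None;
    }
    Some(buf.chunks_exact(K).map(|c| c.try_into().unwrap()).collect())
}

fn main() {
    let buf = [0u8; 40];
    assert_eq!(exact_chunks_copying::<20>(&buf).unwrap().len(), 2); // two 20-byte records
    assert!(exact_chunks_copying::<16>(&buf[..30]).is_none()); // 30 is not a multiple of 16
}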

contrib/weave/src/main.rs | 262 (new file)
@@ -0,0 +1,262 @@
//! Weave resolves a list of roots from `releases.parquet` against `narinfo.parquet`,
//! and then uses the reference graph from the accompanying `narinfo-references.parquet`
//! produced by `swizzle` to collect the closure of the roots.
//!
//! The resulting closure is written to `live_idxs.parquet`, which has a single
//! column, representing the row numbers in `narinfo.parquet` corresponding to
//! live paths.

use anyhow::Result;
use hashbrown::{hash_table, HashTable};
use rayon::prelude::*;
use rustc_hash::FxHashSet;
use std::{
    collections::BTreeMap,
    fs::File,
    ops::Index,
    sync::atomic::{AtomicU32, Ordering},
};
use tracing::{info_span, warn};
use tracing_indicatif::span_ext::IndicatifSpanExt;

use polars::{
    datatypes::StaticArray,
    export::arrow::{array::UInt32Array, offset::OffsetsBuffer},
    lazy::dsl::col,
    prelude::*,
};

use weave::{as_fixed_binary, hash64, INDEX_NULL};

#[tracing::instrument]
fn main() -> Result<()> {
    let _tracing = snix_tracing::TracingBuilder::default()
        .enable_progressbar()
        .build()?;

    let roots: PathSet32 = {
        let span = info_span!("parse_roots", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("parse roots");
        span.pb_start();

        as_fixed_binary::<20>(
            LazyFrame::scan_parquet("releases.parquet", ScanArgsParquet::default())?
                .explode([col("store_path_hash")])
                .select([col("store_path_hash")])
                .collect()?
                .column("store_path_hash")?
                .binary()?,
        )
        .flatten()
        .collect()
    };

    {
        let span = info_span!("resolve_roots", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("resolve roots");
        span.pb_start();

        weave::load_ph_array()?
            .into_par_iter()
            .enumerate()
            .for_each(|(idx, h)| {
                if let Some(idx_slot) = roots.find(h) {
                    assert_eq!(
                        idx_slot.swap(idx as u32, Ordering::Relaxed),
                        INDEX_NULL,
                        "duplicate entry"
                    );
                }
            });
    }

    let mut todo = FxHashSet::default();
    todo.reserve(roots.len());
    {
        let mut unknown_roots = 0usize;
        for (_, idx) in roots.table {
            let idx = idx.into_inner();
            if idx == INDEX_NULL {
                unknown_roots += 1;
                continue;
            }
            todo.insert(idx);
        }

        if unknown_roots != 0 {
            warn!("skipping {unknown_roots} unknown roots");
        }
    }

    let ri_array;
    let ri_array = {
        let span = info_span!(
            "load_reference_idxs",
            indicatif.pb_show = tracing::field::Empty
        )
        .entered();
        span.pb_set_message("load reference_idxs");
        span.pb_start();

        ri_array = ParquetReader::new(File::open("narinfo-references.parquet")?)
            .finish()?
            .column("reference_idxs")?
            .list()?
            .clone();

        ChunkedList::new(ri_array.downcast_iter().map(|chunk| {
            (
                chunk.offsets(),
                chunk
                    .values()
                    .as_any()
                    .downcast_ref::<UInt32Array>()
                    .unwrap()
                    .as_slice()
                    .unwrap(),
            )
        }))
    };

    let mut seen = todo.clone();
    {
        let span = info_span!("mark", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("marking");
        span.pb_set_style(&snix_tracing::PB_PROGRESS_STYLE);

        while !todo.is_empty() {
            span.pb_set_length(seen.len() as u64);
            span.pb_set_position(seen.len().saturating_sub(todo.len()) as u64);

            todo = todo
                .par_iter()
                .flat_map(|&parent| {
                    if parent == INDEX_NULL {
                        return FxHashSet::default();
                    }

                    ri_array[parent as usize]
                        .iter()
                        .cloned()
                        .filter(|child| !seen.contains(child))
                        .collect::<FxHashSet<u32>>()
                })
                .collect();

            for &index in &todo {
                seen.insert(index);
            }
        }

        span.pb_set_length(seen.len() as u64);
        span.pb_set_position(seen.len() as u64);

        if seen.remove(&INDEX_NULL) {
            warn!("WARNING: missing edges");
        }
    }

    let seen = {
        let span = info_span!("gather_live", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("gathering live set");

        let mut seen: Vec<u32> = seen.into_iter().collect();
        seen.par_sort();
        seen
    };

    {
        let span = info_span!("write_output", indicatif.pb_show = tracing::field::Empty).entered();
        span.pb_set_message("writing output");
        span.pb_start();

        ParquetWriter::new(File::create("live_idxs.parquet")?).finish(&mut df! {
            "live_idx" => seen,
        }?)?;
    }

    Ok(())
}

struct PathSet32 {
    table: HashTable<([u8; 20], AtomicU32)>,
}

impl PathSet32 {
    fn with_capacity(capacity: usize) -> Self {
        Self {
            table: HashTable::with_capacity(capacity),
        }
    }

    fn insert(&mut self, value: &[u8; 20]) -> bool {
        let hash = hash64(value);

        match self
            .table
            .entry(hash, |(x, _)| x == value, |(x, _)| hash64(x))
        {
            hash_table::Entry::Occupied(_) => false,
            hash_table::Entry::Vacant(entry) => {
                entry.insert((*value, AtomicU32::new(INDEX_NULL)));
                true
            }
        }
    }

    fn find(&self, value: &[u8; 20]) -> Option<&AtomicU32> {
        let hash = hash64(value);
        self.table
            .find(hash, |(x, _)| x == value)
            .as_ref()
            .map(|(_, x)| x)
    }

    fn len(&self) -> usize {
        self.table.len()
    }
}

impl<'a> FromIterator<&'a [u8; 20]> for PathSet32 {
    fn from_iter<T: IntoIterator<Item = &'a [u8; 20]>>(iter: T) -> Self {
        let iter = iter.into_iter();
        let mut this = Self::with_capacity(iter.size_hint().0);

        for item in iter {
            this.insert(item);
        }

        this.table.shrink_to_fit(|(x, _)| hash64(x));
        this
    }
}

struct ChunkedList<'a, T> {
    by_offset: BTreeMap<usize, (&'a OffsetsBuffer<i64>, &'a [T])>,
}

impl<'a, T> ChunkedList<'a, T> {
    fn new(chunks: impl IntoIterator<Item = (&'a OffsetsBuffer<i64>, &'a [T])>) -> Self {
        let mut next_offset = 0usize;
        ChunkedList {
            by_offset: chunks
                .into_iter()
                .map(|(offsets, values)| {
                    let offset = next_offset;
                    next_offset = next_offset.checked_add(offsets.len_proxy()).unwrap();

                    (offset, (offsets, values))
                })
                .collect(),
        }
    }
}

impl<'a, T> Index<usize> for ChunkedList<'a, T> {
    type Output = [T];

    fn index(&self, index: usize) -> &Self::Output {
        let (&base, &(offsets, values)) = self.by_offset.range(..=index).next_back().unwrap();
        let (start, end) = offsets.start_end(index - base);
        &values[start..end]
    }
}
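
Note: once references are swizzled to row indices, the mark phase above is a generational breadth-first traversal. A minimal single-threaded sketch (not part of the commit; the four-node graph is invented, where the real adjacency data comes from narinfo-references.parquet):

use std::collections::HashSet;

/// Collect the closure of `roots` over `graph`, one frontier generation at a
/// time, mirroring the rayon par_iter + flat_map loop in main.rs.
fn closure(graph: &[Vec<u32>], roots: &[u32]) -> HashSet<u32> {
    let mut seen: HashSet<u32> = roots.iter().copied().collect();
    let mut todo = seen.clone();
    while !todo.is_empty() {
        // Next frontier: children of the current frontier we haven't seen yet.
        todo = todo
            .iter()
            .flat_map(|&parent| graph[parent as usize].iter().copied())
            .filter(|child| !seen.contains(child))
            .collect();
        seen.extend(&todo);
    }
    seen
}

fn main() {
    // 0 -> 1 -> 2, with 3 unreachable: the closure of {0} is {0, 1, 2}.
    let graph = vec![vec![1], vec![2], vec![], vec![]];
    assert_eq!(closure(&graph, &[0]), HashSet::from([0, 1, 2]));
}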