fix(tvix): Represent strings as byte arrays

C++ nix uses C-style zero-terminated char pointers to represent strings
internally - however, up to this point, tvix has used Rust `String` and
`str` for string values. Since those are required to be valid utf-8, we
haven't been able to properly represent all the string values that Nix
supports.

To fix that, this change converts the internal representation of the
NixString struct from `Box<str>` to `BString`, from the `bstr` crate -
this is a wrapper around a `Vec<u8>` with extra functions for treating
that byte vector as a "morally string-like" value, which is basically
exactly what we need.

Since this changes a pretty fundamental assumption about a pretty core
type, there are a *lot* of changes in a lot of places to make this work,
but I've tried to keep the general philosophy and intent of most of the
code in most places intact. Most notably, nothing has been done to make
the derivation stuff in //tvix/glue work with non-utf8 strings
everywhere; instead, this change just converts to String/str when
passing things into that code - there *might* be something to be done
there, but I don't know what the rules should be and I don't want to
figure them out in this change.

To deal with OS-native paths in a way that also works in WASM for
tvixbolt, this also adds a dependency on the "os_str_bytes" crate.

Fixes: b/189
Fixes: b/337
Change-Id: I5e6eb29c62f47dd91af954f5e12bfc3d186f5526
Reviewed-on: https://cl.tvl.fyi/c/depot/+/10200
Reviewed-by: tazjin <tazjin@tvl.su>
Reviewed-by: flokli <flokli@flokli.de>
Reviewed-by: sterni <sternenseemann@systemli.org>
Autosubmit: aspen <root@gws.fyi>
Tested-by: BuildkiteCI
This commit is contained in:
Aspen Smith 2023-12-05 17:25:52 -05:00 committed by aspen
parent 6f9e25943f
commit 201173afac
24 changed files with 427 additions and 223 deletions

View file

@ -12,6 +12,7 @@
pub mod generators;
mod macros;
use bstr::{BString, ByteSlice, ByteVec};
use codemap::Span;
use serde_json::json;
use std::{cmp::Ordering, collections::HashMap, ops::DerefMut, path::PathBuf, rc::Rc};
@ -550,14 +551,14 @@ where
let key = key.to_str().with_span(&frame, self)?;
let attrs = attrs.to_attrs().with_span(&frame, self)?;
match attrs.select(key.as_str()) {
match attrs.select(&key) {
Some(value) => self.stack.push(value.clone()),
None => {
return frame.error(
self,
ErrorKind::AttributeNotFound {
name: key.as_str().to_string(),
name: (**key).clone().into_string_lossy()
},
);
}
@ -598,7 +599,7 @@ where
OpCode::OpAttrsTrySelect => {
let key = self.stack_pop().to_str().with_span(&frame, self)?;
let value = match self.stack_pop() {
Value::Attrs(attrs) => match attrs.select(key.as_str()) {
Value::Attrs(attrs) => match attrs.select(&key) {
Some(value) => value.clone(),
None => Value::AttrNotFound,
},
@ -705,7 +706,7 @@ where
self(key, attrs) => {
let key = key.to_str().with_span(&frame, self)?;
let result = match attrs {
Value::Attrs(attrs) => attrs.contains(key.as_str()),
Value::Attrs(attrs) => attrs.contains(&key),
// Nix allows use of `?` on non-set types, but
// always returns false in those cases.
@ -742,7 +743,7 @@ where
self.enqueue_generator("resolve_with", op_span, |co| {
resolve_with(
co,
ident.as_str().to_owned(),
ident.as_bstr().to_owned(),
with_stack_len,
closed_with_stack_len,
)
@ -966,7 +967,7 @@ where
/// fragments of the stack, evaluating them to strings, and pushing
/// the concatenated result string back on the stack.
fn run_interpolate(&mut self, frame: &CallFrame, count: usize) -> EvalResult<()> {
let mut out = String::new();
let mut out = BString::default();
// Interpolation propagates the context of each fragment and unions them.
let mut context: NixContext = NixContext::new();
@ -980,7 +981,7 @@ where
return Ok(());
}
let mut nix_string = val.to_contextful_str().with_span(frame, self)?;
out.push_str(nix_string.as_str());
out.push_str(nix_string.as_bstr());
if let Some(nix_string_ctx) = nix_string.context_mut() {
context = context.join(nix_string_ctx);
}
@ -988,7 +989,7 @@ where
// FIXME: consume the String immediately here.
self.stack
.push(Value::String(NixString::new_context_from(context, &out)));
.push(Value::String(NixString::new_context_from(context, out)));
Ok(())
}
@ -1160,7 +1161,7 @@ where
/// for matching values in the with-stacks carried at runtime.
async fn resolve_with(
co: GenCo,
ident: String,
ident: BString,
vm_with_len: usize,
upvalue_with_len: usize,
) -> Result<Value, ErrorKind> {
@ -1213,7 +1214,7 @@ async fn resolve_with(
}
}
Err(ErrorKind::UnknownDynamicVariable(ident))
Err(ErrorKind::UnknownDynamicVariable(ident.to_string()))
}
// TODO(amjoseph): de-asyncify this
@ -1221,7 +1222,7 @@ async fn add_values(co: GenCo, a: Value, b: Value) -> Result<Value, ErrorKind> {
// What we try to do is solely determined by the type of the first value!
let result = match (a, b) {
(Value::Path(p), v) => {
let mut path = p.to_string_lossy().into_owned();
let mut path = p.into_os_string();
match generators::request_string_coerce(
&co,
v,
@ -1243,7 +1244,7 @@ async fn add_values(co: GenCo, a: Value, b: Value) -> Result<Value, ErrorKind> {
.await
{
Ok(vs) => {
path.push_str(vs.as_str());
path.push(vs.to_os_str()?);
crate::value::canon_path(PathBuf::from(path)).into()
}
Err(c) => Value::Catchable(c),