Small strings are always copied fully, without allocations. Large transient strings copy the data and allocate. Large persistent strings are also a trivial copy. Change-Id: I319c0b800fa7a4a62e634176b959bb2fa766a4eb Reviewed-on: https://cl.tvl.fyi/c/depot/+/12342 Autosubmit: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI Reviewed-by: tazjin <tazjin@tvl.su>
435 lines
13 KiB
Rust
435 lines
13 KiB
Rust
use std::alloc::Layout;
|
|
use std::cmp::Ordering;
|
|
use std::fmt::{Debug, Formatter};
|
|
|
|
/// Inline ("small") variant of the string representation.
///
/// Holds the byte length followed by up to 12 bytes of string data stored
/// directly inside the value. Bytes past `len` are padding.
#[repr(C)]
#[derive(Copy, Clone)]
struct GSSmall {
    /// Number of meaningful bytes in `data`.
    len: u32,
    /// Inline storage for the string contents.
    data: [u8; 12],
}
|
|
|
|
/// A data pointer stored as an integer, with the lowest bit used as a tag for
/// the storage class: bit clear means transient, bit set means persistent.
#[repr(transparent)]
#[derive(Copy, Clone)]
struct StorageClassPtr(usize);

impl StorageClassPtr {
    /// Tag bit marking a pointer as persistent.
    const PERSISTENT_BIT: usize = 0b1;

    /// Wraps `ptr` as a transient pointer (tag bit left clear).
    ///
    /// In debug builds this asserts that the lowest address bit is free.
    fn transient(ptr: *const u8) -> Self {
        let addr = ptr as usize;
        debug_assert!(
            (addr & Self::PERSISTENT_BIT) == 0,
            "pointer must be at least 2-byte aligned"
        );
        Self(addr)
    }

    /// Wraps `ptr` as a persistent pointer (tag bit set).
    ///
    /// In debug builds this asserts that the lowest address bit is free.
    fn persistent(ptr: *const u8) -> Self {
        let addr = ptr as usize;
        debug_assert!(
            (addr & Self::PERSISTENT_BIT) == 0,
            "pointer must be at least 2-byte aligned"
        );
        Self(addr | Self::PERSISTENT_BIT)
    }

    /// Returns the address with the tag bit masked off, as a const pointer.
    fn as_ptr(&self) -> *const u8 {
        let untagged = self.0 & !Self::PERSISTENT_BIT;
        untagged as *const u8
    }

    /// Returns the address with the tag bit masked off, as a mutable pointer.
    ///
    /// # Safety
    ///
    /// This only produces the pointer; the caller is responsible for making
    /// sure that whatever it does through the returned pointer is valid for
    /// the memory it points to.
    unsafe fn as_mut_ptr(&self) -> *mut u8 {
        let untagged = self.0 & !Self::PERSISTENT_BIT;
        untagged as *mut u8
    }

    /// Returns `true` if the tag bit is clear, i.e. the pointer was created
    /// through [`StorageClassPtr::transient`].
    fn is_transient(&self) -> bool {
        (self.0 & Self::PERSISTENT_BIT) == 0
    }
}
|
|
|
|
#[derive(Clone, Copy)]
|
|
#[repr(C)]
|
|
struct GSLarge {
|
|
len: u32,
|
|
prefix: [u8; 4],
|
|
data: StorageClassPtr,
|
|
}
|
|
|
|
const _ASSERT_VARIANTS_SIZE: () = assert!(
|
|
std::mem::size_of::<GSSmall>() == std::mem::size_of::<GSLarge>(),
|
|
"German String variants must have the same size"
|
|
);
|
|
|
|
union GSRepr {
|
|
small: GSSmall,
|
|
large: GSLarge,
|
|
}
|
|
|
|
#[repr(transparent)]
|
|
pub struct GermanString(GSRepr);
|
|
|
|
const _ASSERT_GSTRING_SIZE: () = assert!(
|
|
std::mem::size_of::<GermanString>() == 16,
|
|
"German String should be 16 bytes in size",
|
|
);
|
|
|
|
impl GermanString {
|
|
/// Creates a new transient German String from the given slice, copying the
|
|
/// data in the process.
|
|
pub fn transient(bytes: &[u8]) -> GermanString {
|
|
if bytes.len() > u32::MAX as usize {
|
|
panic!("GermanString maximum length is {} bytes", u32::MAX);
|
|
}
|
|
|
|
if bytes.len() <= 12 {
|
|
let mut s = GSSmall {
|
|
len: bytes.len() as u32,
|
|
data: [0u8; 12],
|
|
};
|
|
s.data[..bytes.len()].copy_from_slice(bytes);
|
|
GermanString(GSRepr { small: s })
|
|
} else {
|
|
let layout = Layout::array::<u8>(bytes.len()).unwrap();
|
|
let mut large = GSLarge {
|
|
len: bytes.len() as u32,
|
|
prefix: [0u8; 4],
|
|
data: unsafe {
|
|
let ptr = std::alloc::alloc(layout);
|
|
if ptr.is_null() {
|
|
std::alloc::handle_alloc_error(layout);
|
|
}
|
|
std::ptr::copy_nonoverlapping(bytes.as_ptr(), ptr, bytes.len());
|
|
StorageClassPtr::transient(ptr)
|
|
},
|
|
};
|
|
|
|
large.prefix.copy_from_slice(&bytes[..4]);
|
|
|
|
GermanString(GSRepr { large })
|
|
}
|
|
}
|
|
|
|
/// Creates a new transient German String from the given owned bytes. Short
|
|
/// strings will be copied into the string representation, long strings will
|
|
/// be moved out of the given vector without additional allocations.
|
|
pub fn transient_from_owned(bytes: Vec<u8>) -> GermanString {
|
|
if bytes.len() > u32::MAX as usize {
|
|
panic!("GermanString maximum length is {} bytes", u32::MAX);
|
|
}
|
|
|
|
if bytes.len() <= 12 {
|
|
let mut s = GSSmall {
|
|
len: bytes.len() as u32,
|
|
data: [0u8; 12],
|
|
};
|
|
|
|
s.data[..bytes.len()].copy_from_slice(&bytes);
|
|
GermanString(GSRepr { small: s })
|
|
} else {
|
|
let md = std::mem::ManuallyDrop::new(bytes);
|
|
let mut large = GSLarge {
|
|
len: md.len() as u32,
|
|
prefix: [0u8; 4],
|
|
data: StorageClassPtr::transient(md.as_ptr()),
|
|
};
|
|
|
|
large.prefix.copy_from_slice(&md[..4]);
|
|
GermanString(GSRepr { large })
|
|
}
|
|
}
|
|
|
|
/// Creates a persistent German String from a static data buffer.
|
|
pub fn persistent(bytes: &'static [u8]) -> GermanString {
|
|
if bytes.len() > u32::MAX as usize {
|
|
panic!("GermanString maximum length is {} bytes", u32::MAX);
|
|
}
|
|
|
|
if bytes.len() <= 12 {
|
|
let mut s = GSSmall {
|
|
len: bytes.len() as u32,
|
|
data: [0u8; 12],
|
|
};
|
|
|
|
s.data[..bytes.len()].copy_from_slice(&bytes);
|
|
GermanString(GSRepr { small: s })
|
|
} else {
|
|
let mut large = GSLarge {
|
|
len: bytes.len() as u32,
|
|
prefix: [0u8; 4],
|
|
data: StorageClassPtr::persistent(bytes.as_ptr()),
|
|
};
|
|
|
|
large.prefix.copy_from_slice(&bytes[..4]);
|
|
GermanString(GSRepr { large })
|
|
}
|
|
}
|
|
|
|
/// Creates a persistent German String by leaking the provided data.
|
|
pub fn persistent_leak(bytes: Vec<u8>) -> GermanString {
|
|
if bytes.len() > u32::MAX as usize {
|
|
panic!("GermanString maximum length is {} bytes", u32::MAX);
|
|
}
|
|
|
|
if bytes.len() <= 12 {
|
|
let mut s = GSSmall {
|
|
len: bytes.len() as u32,
|
|
data: [0u8; 12],
|
|
};
|
|
|
|
s.data[..bytes.len()].copy_from_slice(&bytes);
|
|
GermanString(GSRepr { small: s })
|
|
} else {
|
|
let md = std::mem::ManuallyDrop::new(bytes);
|
|
let mut large = GSLarge {
|
|
len: md.len() as u32,
|
|
prefix: [0u8; 4],
|
|
data: StorageClassPtr::persistent(md.as_ptr()),
|
|
};
|
|
|
|
large.prefix.copy_from_slice(&md[..4]);
|
|
GermanString(GSRepr { large })
|
|
}
|
|
}
|
|
|
|
/// Creates a persistent German String from a static data buffer.
|
|
pub fn persistent_from_str(s: &'static str) -> GermanString {
|
|
GermanString::persistent(s.as_bytes())
|
|
}
|
|
|
|
pub fn len(&self) -> usize {
|
|
// SAFETY: The length field is located in the same location for both
|
|
// variants, reading it from either is safe.
|
|
unsafe { self.0.small.len as usize }
|
|
}
|
|
|
|
pub fn as_bytes(&self) -> &[u8] {
|
|
if self.len() > 12 {
|
|
unsafe { std::slice::from_raw_parts(self.0.large.data.as_ptr(), self.len()) }
|
|
} else {
|
|
unsafe { &self.0.small.data.as_ref()[..self.len()] }
|
|
}
|
|
}
|
|
|
|
pub fn as_str(&self) -> Result<&str, std::str::Utf8Error> {
|
|
std::str::from_utf8(self.as_bytes())
|
|
}
|
|
}
|
|
|
|
impl Drop for GermanString {
|
|
fn drop(&mut self) {
|
|
unsafe {
|
|
if self.len() > 12 && self.0.large.data.is_transient() {
|
|
let layout = Layout::array::<u8>(self.len()).unwrap();
|
|
std::alloc::dealloc(self.0.large.data.as_mut_ptr(), layout);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
impl PartialEq for GermanString {
|
|
fn eq(&self, other: &GermanString) -> bool {
|
|
if self.len() != other.len() {
|
|
return false;
|
|
}
|
|
|
|
unsafe {
|
|
if self.len() <= 12 {
|
|
return self.0.small.data[..self.len()] == other.0.small.data[..other.len()];
|
|
}
|
|
return self.0.large.data.as_ptr() == other.0.large.data.as_ptr()
|
|
|| (self.0.large.prefix == other.0.large.prefix
|
|
&& self.as_bytes() == other.as_bytes());
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Eq for GermanString {}
|
|
|
|
impl Ord for GermanString {
|
|
fn cmp(&self, other: &GermanString) -> Ordering {
|
|
match (self.len().cmp(&12), other.len().cmp(&12)) {
|
|
// two small strings
|
|
(Ordering::Less | Ordering::Equal, Ordering::Less | Ordering::Equal) => unsafe {
|
|
self.0.small.data[..self.len()].cmp(&other.0.small.data[..other.len()])
|
|
},
|
|
// two large strings
|
|
(Ordering::Greater, Ordering::Greater) => unsafe {
|
|
match self.0.large.prefix.cmp(&other.0.large.prefix) {
|
|
Ordering::Equal => self.as_bytes().cmp(other.as_bytes()),
|
|
ordering => ordering,
|
|
}
|
|
},
|
|
|
|
// LHS large, RHS small
|
|
(Ordering::Greater, _) => {
|
|
let prefix_ordering =
|
|
unsafe { self.0.large.prefix.as_slice().cmp(&other.0.small.data[..4]) };
|
|
|
|
if prefix_ordering != Ordering::Equal {
|
|
return prefix_ordering;
|
|
}
|
|
|
|
self.as_bytes().cmp(other.as_bytes())
|
|
}
|
|
|
|
// LHS small, RHS large
|
|
(_, Ordering::Greater) => {
|
|
let prefix_ordering =
|
|
unsafe { self.0.small.data[..4].cmp(other.0.large.prefix.as_slice()) };
|
|
|
|
if prefix_ordering != Ordering::Equal {
|
|
return prefix_ordering;
|
|
}
|
|
|
|
self.as_bytes().cmp(other.as_bytes())
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
impl PartialOrd for GermanString {
|
|
fn partial_cmp(&self, other: &GermanString) -> Option<Ordering> {
|
|
Some(self.cmp(other))
|
|
}
|
|
}
|
|
|
|
impl Debug for GermanString {
|
|
fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
|
|
String::from_utf8_lossy(self.as_bytes()).fmt(f)
|
|
}
|
|
}
|
|
|
|
impl Clone for GermanString {
|
|
fn clone(&self) -> Self {
|
|
unsafe {
|
|
if self.len() <= 12 {
|
|
return GermanString(GSRepr {
|
|
small: self.0.small.clone(),
|
|
});
|
|
}
|
|
|
|
if self.0.large.data.is_transient() {
|
|
return GermanString::transient(self.as_bytes());
|
|
}
|
|
|
|
return GermanString(GSRepr {
|
|
large: self.0.large.clone(),
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;

    /// Generates arbitrary German Strings from arbitrary `String`s, so
    /// generated values always contain valid UTF-8.
    impl Arbitrary for GermanString {
        type Parameters = <String as Arbitrary>::Parameters;
        type Strategy = BoxedStrategy<Self>;

        fn arbitrary_with(args: Self::Parameters) -> Self::Strategy {
            any_with::<String>(args)
                .prop_map(|s| GermanString::transient(s.as_bytes()))
                .boxed()
        }
    }

    #[test]
    fn test_empty_string() {
        let empty = GermanString::transient(b"");

        assert_eq!(empty.len(), 0, "empty string should be empty");
        assert_eq!(empty.as_bytes(), b"", "empty string should contain nothing");
        assert_eq!(
            empty.as_str().expect("empty string is valid UTF-8"),
            "",
            "empty string should contain empty string"
        );
    }

    #[test]
    fn test_short_string() {
        let short = GermanString::transient(b"meow");

        assert_eq!(short.len(), 4, "'meow' is four characters");
        assert_eq!(
            short.as_bytes(),
            b"meow",
            "short string returns correct bytes"
        );
        assert_eq!(
            short.as_str().expect("'meow' is valid UTF-8"),
            "meow",
            "short string returns correct string"
        );
    }

    #[test]
    fn test_long_string() {
        let input: &str = "This code was written at https://signal.live";
        let long = GermanString::transient(input.as_bytes());

        assert_eq!(long.len(), 44, "long string has correct length");
        assert_eq!(
            long.as_bytes(),
            input.as_bytes(),
            "long string returns correct bytes"
        );

        assert_eq!(
            long.as_str().expect("input is valid UTF-8"),
            input,
            "long string returns correct string"
        );
    }

    proptest! {
        #[test]
        fn test_roundtrip_vec(input: Vec<u8>) {
            let gs = GermanString::transient_from_owned(input.clone());
            prop_assert_eq!(input.len(), gs.len(), "length should match");

            let out = gs.as_bytes().to_owned();
            prop_assert_eq!(&input, &out, "roundtrip should yield same bytes");
        }

        #[test]
        fn test_roundtrip_string(input: String) {
            let gs = GermanString::transient_from_owned(input.clone().into_bytes());
            prop_assert_eq!(input.len(), gs.len(), "length should match");

            let out = String::from_utf8(gs.as_bytes().to_owned())
                .expect("string should be valid after roundtrip");

            prop_assert_eq!(&input, &out, "roundtrip should yield same string");
        }

        // Test [`Eq`] implementation.
        #[test]
        fn test_eq(lhs: Vec<u8>, rhs: Vec<u8>) {
            let lhs_gs = GermanString::transient(lhs.as_slice());
            let rhs_gs = GermanString::transient(rhs.as_slice());

            prop_assert_eq!(
                (lhs == rhs),
                (lhs_gs == rhs_gs),
                "Eq should match between Vec<u8> and GermanString ({:?} == {:?})",
                lhs, rhs,
            );
        }

        // Test [`Ord`] implementation against the ordering of the raw bytes.
        #[test]
        fn test_ord(lhs: Vec<u8>, rhs: Vec<u8>) {
            let lhs_gs = GermanString::transient(lhs.as_slice());
            let rhs_gs = GermanString::transient(rhs.as_slice());

            prop_assert_eq!(
                lhs.cmp(&rhs),
                lhs_gs.cmp(&rhs_gs),
                "Ord should match between Vec<u8> and GermanString ({:?} <=> {:?})",
                lhs, rhs,
            );
        }

        // Test [`Clone`] implementation.
        #[test]
        fn test_clone(input: Vec<u8>) {
            let gs = GermanString::transient(input.as_slice());
            let copy = gs.clone();

            prop_assert_eq!(copy.as_bytes(), input.as_slice(), "clone should hold same bytes");
            prop_assert!(gs == copy);
        }

        #[test]
        fn test_reflexivity(x: GermanString) {
            prop_assert!(x == x);
        }

        #[test]
        fn test_symmetry(x: GermanString, y: GermanString) {
            prop_assert_eq!(x == y, y == x);
        }

        #[test]
        fn test_transitivity(x: GermanString, y: GermanString, z: GermanString) {
            if x == y && y == z {
                prop_assert!(x == z);
            }
        }
    }
}
|