users.sterni.nix.utf8 implements UTF-8 decoding in pure nix. We implement the decoding as a simple state machine which is fed one byte at a time. Decoding whole strings is possible by subsequently calling step. This is done in decode which uses builtins.foldl' to get around recursion restrictions and a neat trick using builtins.deepSeq puck showed me limiting the size of the thunks in a foldl' (which can also cause a stack overflow). This makes decoding arbitrarily large UTF-8 files into codepoints using nix theoretically possible, but it is not really practical: Decoding a 36KB LaTeX file I had lying around takes ~160s on my laptop. Change-Id: Iab8c973dac89074ec280b4880a7408e0b3d19bc7 Reviewed-on: https://cl.tvl.fyi/c/depot/+/2590 Tested-by: BuildkiteCI Reviewed-by: sterni <sternenseemann@systemli.org>
		
			
				
	
	
		
			124 lines
		
	
	
	
		
			2.6 KiB
		
	
	
	
		
			Nix
		
	
	
	
	
	
			
		
		
	
	
			124 lines
		
	
	
	
		
			2.6 KiB
		
	
	
	
		
			Nix
		
	
	
	
	
	
{ depot, lib, ... }:
 | 
						|
 | 
						|
let
 | 
						|
 | 
						|
  # TODO(sterni): implement nix.float and figure out which of these
 | 
						|
  #               functions can be split out into a common nix.num
 | 
						|
  #               library.
 | 
						|
 | 
						|
  inherit (depot.users.sterni.nix)
 | 
						|
    string
 | 
						|
    ;
 | 
						|
 | 
						|
  inherit (builtins)
 | 
						|
    bitOr
 | 
						|
    bitAnd
 | 
						|
    bitXor
 | 
						|
    mul
 | 
						|
    div
 | 
						|
    add
 | 
						|
    sub
 | 
						|
    ;
 | 
						|
 | 
						|
  abs = i: if i < 0 then -i else i;
 | 
						|
 | 
						|
  exp = base: pow:
 | 
						|
    if pow > 0
 | 
						|
    then base * (exp base (pow - 1))
 | 
						|
    else if pow < 0
 | 
						|
    then 1.0 / exp base (abs pow)
 | 
						|
    else 1;
 | 
						|
 | 
						|
  bitShiftR = bit: count:
 | 
						|
    if count == 0
 | 
						|
    then bit
 | 
						|
    else div (bitShiftR bit (count - 1)) 2;
 | 
						|
 | 
						|
  bitShiftL = bit: count:
 | 
						|
    if count == 0
 | 
						|
    then bit
 | 
						|
    else 2 * (bitShiftL bit (count - 1));
 | 
						|
 | 
						|
  hexdigits = "0123456789ABCDEF";
 | 
						|
 | 
						|
  toHex = int:
 | 
						|
    let
 | 
						|
      go = i:
 | 
						|
        if i == 0
 | 
						|
        then ""
 | 
						|
        else go (bitShiftR i 4)
 | 
						|
           + string.charAt (bitAnd i 15) hexdigits;
 | 
						|
      sign = lib.optionalString (int < 0) "-";
 | 
						|
    in
 | 
						|
      if int == 0
 | 
						|
      then "0"
 | 
						|
      else "${sign}${go (abs int)}";
 | 
						|
 | 
						|
  fromHexMap = builtins.listToAttrs
 | 
						|
    (lib.imap0 (i: c: { name = c; value = i; })
 | 
						|
      (lib.stringToCharacters hexdigits));
 | 
						|
 | 
						|
  fromHex = literal:
 | 
						|
    let
 | 
						|
      negative = string.charAt 0 literal == "-";
 | 
						|
      start = if negative then 1 else 0;
 | 
						|
      len = builtins.stringLength literal;
 | 
						|
      # reversed list of all digits
 | 
						|
      digits = builtins.genList
 | 
						|
        (i: string.charAt (len - 1 - i) literal)
 | 
						|
        (len - start);
 | 
						|
      parsed = builtins.foldl'
 | 
						|
        (v: d: {
 | 
						|
          val = v.val + (fromHexMap."${d}" * v.mul);
 | 
						|
          mul = v.mul * 16;
 | 
						|
        })
 | 
						|
        { val = 0; mul = 1; } digits;
 | 
						|
    in
 | 
						|
      if negative
 | 
						|
      then -parsed.val
 | 
						|
      else parsed.val;
 | 
						|
 | 
						|
  # A nix integer is a 64bit signed integer
 | 
						|
  maxBound = 9223372036854775807;
 | 
						|
 | 
						|
  # fun fact: -9223372036854775808 is the lower bound
 | 
						|
  # for a nix integer (as you would expect), but you can't
 | 
						|
  # use it as an integer literal or you'll be greeted with:
 | 
						|
  # error: invalid integer '9223372036854775808'
 | 
						|
  # This is because all int literals when parsing are
 | 
						|
  # positive, negative "literals" are positive literals
 | 
						|
  # which are preceded by the arithmetric negation operator.
 | 
						|
  minBound = -9223372036854775807 - 1;
 | 
						|
 | 
						|
  odd = x: bitAnd x 1 == 1;
 | 
						|
  even = x: bitAnd x 1 == 0;
 | 
						|
 | 
						|
  # div and mod behave like quot and rem in Haskell,
 | 
						|
  # i. e. they truncate towards 0
 | 
						|
  mod = a: b: let res = a / b; in a - (res * b);
 | 
						|
 | 
						|
  inRange = a: b: x: x >= a && x <= b;
 | 
						|
 | 
						|
in {
 | 
						|
  inherit
 | 
						|
    maxBound
 | 
						|
    minBound
 | 
						|
    abs
 | 
						|
    exp
 | 
						|
    odd
 | 
						|
    even
 | 
						|
    add
 | 
						|
    sub
 | 
						|
    mul
 | 
						|
    div
 | 
						|
    mod
 | 
						|
    bitShiftR
 | 
						|
    bitShiftL
 | 
						|
    bitOr
 | 
						|
    bitAnd
 | 
						|
    bitXor
 | 
						|
    toHex
 | 
						|
    fromHex
 | 
						|
    inRange
 | 
						|
    ;
 | 
						|
}
 |