feat(sterni/nix/utf8): implement UTF-8 encoding
This implementation is still a bit rough: it doesn't check whether the produced string is valid UTF-8, so passing an invalid Unicode codepoint may yield an invalid string.

Change-Id: Ibaa91dafa8937142ef704a175efe967b62e3ee7b
This commit is contained in:
		
							parent
							
								
									9370ea5e33
								
							
						
					
					
						commit
						87a0aaa77d
					
				
					 2 changed files with 83 additions and 2 deletions
				
			
		| 
						 | 
					@ -2,8 +2,6 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
let
 | 
					let
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  # TODO(sterni): encode
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  inherit (depot.users.sterni.nix)
 | 
					  inherit (depot.users.sterni.nix)
 | 
				
			||||||
    char
 | 
					    char
 | 
				
			||||||
    flow
 | 
					    flow
 | 
				
			||||||
| 
						 | 
					@ -209,8 +207,81 @@ let
 | 
				
			||||||
      ) iterResult
 | 
					      ) iterResult
 | 
				
			||||||
    );
 | 
					    );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  encodeCodepoint = cp:
 | 
				
			||||||
 | 
					    let
 | 
				
			||||||
 | 
					      # Find the number of bytes needed to encode the given codepoint.
 | 
				
			||||||
 | 
					      # Note that this doesn't check if the Unicode codepoint is allowed,
 | 
				
			||||||
 | 
					      # but rather allows all theoretically UTF-8-encodeable ones.
 | 
				
			||||||
 | 
					      count = flow.switch cp [
 | 
				
			||||||
 | 
					        [ (int.inRange 0 127)         1 ] # 00000000 0xxxxxxx
 | 
				
			||||||
 | 
					        [ (int.inRange 128 2047)      2 ] # 00000yyy yyxxxxxx
 | 
				
			||||||
 | 
					        [ (int.inRange 2048 65535)    3 ] # zzzzyyyy yyxxxxxx
 | 
				
			||||||
 | 
					        [ (int.inRange 65536 2097151) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx
 | 
				
			||||||
 | 
					      ];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      # Extract the bit ranges x, y, z and u from the given codepoint
 | 
				
			||||||
 | 
					      # according to Table 3-6 of The Unicode Standard, Version 13.0,
 | 
				
			||||||
 | 
					      # section 3.9. u is split into uh and ul since they are used in
 | 
				
			||||||
 | 
					      # different bytes in the end.
 | 
				
			||||||
 | 
					      components = lib.mapAttrs (_: { mask, offset }:
 | 
				
			||||||
 | 
					        int.bitAnd (int.bitShiftR cp offset) mask
 | 
				
			||||||
 | 
					      ) {
 | 
				
			||||||
 | 
					        x = {
 | 
				
			||||||
 | 
					          mask = if count > 1 then 63 else 127;
 | 
				
			||||||
 | 
					          offset = 0;
 | 
				
			||||||
 | 
					        };
 | 
				
			||||||
 | 
					        y = {
 | 
				
			||||||
 | 
					          mask = if count > 2 then 63 else 31;
 | 
				
			||||||
 | 
					          offset = 6;
 | 
				
			||||||
 | 
					        };
 | 
				
			||||||
 | 
					        z = {
 | 
				
			||||||
 | 
					          mask = 15;
 | 
				
			||||||
 | 
					          offset = 12;
 | 
				
			||||||
 | 
					        };
 | 
				
			||||||
 | 
					        # the part of u that belongs in the second byte
 | 
				
			||||||
 | 
					        ul = {
 | 
				
			||||||
 | 
					          mask = 3;
 | 
				
			||||||
 | 
					          offset = 16;
 | 
				
			||||||
 | 
					        };
 | 
				
			||||||
 | 
					        # the part of u that belongs in the first byte
 | 
				
			||||||
 | 
					        uh = {
 | 
				
			||||||
 | 
					          mask = 7;
 | 
				
			||||||
 | 
					          offset = 18;
 | 
				
			||||||
 | 
					        };
 | 
				
			||||||
 | 
					      };
 | 
				
			||||||
 | 
					      inherit (components) x y z ul uh;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      # Finally construct the byte sequence for the given codepoint. This is
 | 
				
			||||||
 | 
					      # usually done by using the component and adding a few bits as a prefix
 | 
				
			||||||
 | 
					      # which depends on the length of the sequence. The longer the sequence,
 | 
				
			||||||
 | 
					      # the further back each component is pushed. To simplify this, we
 | 
				
			||||||
 | 
					      # always construct a 4 element list and take the last `count` elements.
 | 
				
			||||||
 | 
					      # Thanks to laziness the bogus values created by this are never evaluated.
 | 
				
			||||||
 | 
					      #
 | 
				
			||||||
 | 
					      # Based on Table 3-6 of The Unicode Standard,
 | 
				
			||||||
 | 
					      # Version 13.0, section 3.9.
 | 
				
			||||||
 | 
					      bytes = lib.sublist (4 - count) count [
 | 
				
			||||||
 | 
					        # 11110uuu
 | 
				
			||||||
 | 
					        (uh + 240)
 | 
				
			||||||
 | 
					        # 10uuzzzz or 1110zzzz
 | 
				
			||||||
 | 
					        (z + (if count > 3 then 128 + int.bitShiftL ul 4 else 224))
 | 
				
			||||||
 | 
					        # 10yyyyyy or 110yyyyy
 | 
				
			||||||
 | 
					        (y + (if count > 2 then 128 else 192))
 | 
				
			||||||
 | 
					        # 10xxxxxx or 0xxxxxxx
 | 
				
			||||||
 | 
					        (x + (if count > 1 then 128 else 0))
 | 
				
			||||||
 | 
					      ];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    in string.fromBytes bytes;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /* Encode a list of Unicode codepoints into a UTF-8 string.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					     Type: [ integer ] -> string
 | 
				
			||||||
 | 
					  */
 | 
				
			||||||
 | 
					  encode = lib.concatMapStrings encodeCodepoint;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
in {
 | 
					in {
 | 
				
			||||||
  inherit
 | 
					  inherit
 | 
				
			||||||
 | 
					    encode
 | 
				
			||||||
    decode
 | 
					    decode
 | 
				
			||||||
    step
 | 
					    step
 | 
				
			||||||
    ;
 | 
					    ;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -113,9 +113,19 @@ let
 | 
				
			||||||
        randomUnicode
 | 
					        randomUnicode
 | 
				
			||||||
      ]));
 | 
					      ]));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  testDecodingEncoding = it "checks that decoding and then encoding forms an identity"
 | 
				
			||||||
 | 
					    (builtins.map
 | 
				
			||||||
 | 
					      (s: assertEq "Decoding and then encoding “${s}” yields itself"
 | 
				
			||||||
 | 
					        (utf8.encode (utf8.decode s)) s)
 | 
				
			||||||
 | 
					      (lib.flatten [
 | 
				
			||||||
 | 
					        glassSentences
 | 
				
			||||||
 | 
					        randomUnicode
 | 
				
			||||||
 | 
					      ]));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
in
 | 
					in
 | 
				
			||||||
  runTestsuite "nix.utf8" [
 | 
					  runTestsuite "nix.utf8" [
 | 
				
			||||||
    testFailures
 | 
					    testFailures
 | 
				
			||||||
    testAscii
 | 
					    testAscii
 | 
				
			||||||
    testDecoding
 | 
					    testDecoding
 | 
				
			||||||
 | 
					    testDecodingEncoding
 | 
				
			||||||
  ]
 | 
					  ]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue