feat(sterni/nix/utf8): implement UTF-8 encoding

This implementation is still a bit rough: it doesn't check whether the produced string is valid UTF-8, so passing an invalid Unicode codepoint may yield an ill-formed string.

Change-Id: Ibaa91dafa8937142ef704a175efe967b62e3ee7b
2021-11-23 19:23:54 +01:00 · 2021-11-23 19:23:54 +01:00 · 87a0aaa77d
commit 87a0aaa77d
parent 9370ea5e33
2 changed files with 83 additions and 2 deletions
--- a/users/sterni/nix/utf8/default.nix
+++ b/users/sterni/nix/utf8/default.nix
@ -2,8 +2,6 @@
 let
  # TODO(sterni): encode
  inherit (depot.users.sterni.nix)
    char
    flow
@ -209,8 +207,81 @@ let
      ) iterResult
    );
  /* Encode a single Unicode codepoint as a UTF-8 string.

     Only Unicode scalar values are accepted, i.e. integers from 0 to
     1114111 (U+10FFFF) that are not in the surrogate range 55296 to 57343
     (U+D800 to U+DFFF). Anything else would fit the bit layout used below,
     but the resulting byte sequence would be ill-formed UTF-8, so
     evaluation is aborted with an error instead.

     Type: integer -> string
  */
  encodeCodepoint = cp:
    let
      # Whether cp is a Unicode scalar value: inside the codespace
      # (0 to 0x10FFFF) and not a surrogate (0xD800 to 0xDFFF).
      isScalarValue =
        int.inRange 0 1114111 cp && !(int.inRange 55296 57343 cp);

      # Find the amount of bytes needed to encode the given codepoint.
      # Since `count` is needed for every byte of the result, performing
      # the validity check here guarantees it is always evaluated.
      count =
        if !isScalarValue
        then builtins.throw
          "utf8.encode: ${toString cp} is not a valid Unicode scalar value"
        else flow.switch cp [
          [ (int.inRange 0 127)         1 ] # 00000000 0xxxxxxx
          [ (int.inRange 128 2047)      2 ] # 00000yyy yyxxxxxx
          [ (int.inRange 2048 65535)    3 ] # zzzzyyyy yyxxxxxx
          [ (int.inRange 65536 1114111) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx
        ];

      # Extract the bit ranges x, y, z and u from the given codepoint
      # according to Table 3-6. from The Unicode Standard, Version 13.0,
      # section 3.9. u is split into uh and ul since they are used in
      # different bytes in the end.
      components = lib.mapAttrs (_: { mask, offset }:
        int.bitAnd (int.bitShiftR cp offset) mask
      ) {
        # x is 7 bits wide in a single byte sequence, 6 bits otherwise
        x = {
          mask = if count > 1 then 63 else 127;
          offset = 0;
        };
        # y is 5 bits wide in a two byte sequence, 6 bits otherwise
        y = {
          mask = if count > 2 then 63 else 31;
          offset = 6;
        };
        z = {
          mask = 15;
          offset = 12;
        };
        # u which belongs into the second byte
        ul = {
          mask = 3;
          offset = 16;
        };
        # u which belongs into the first byte
        uh = {
          mask = 7;
          offset = 18;
        };
      };
      inherit (components) x y z ul uh;

      # Finally construct the byte sequence for the given codepoint. This is
      # usually done by using the component and adding a few bits as a prefix
      # which depends on the length of the sequence. The longer the sequence,
      # the further back each component is pushed. To simplify this, we
      # always construct a 4 element list and take the last `count` elements.
      # Thanks to laziness the bogus values created by this are never evaluated.
      #
      # Based on table 3-6. from The Unicode Standard,
      # Version 13.0, section 3.9.
      bytes = lib.sublist (4 - count) count [
        # 11110uuu
        (uh + 240)
        # 10uuzzzz or 1110zzzz
        (z + (if count > 3 then 128 + int.bitShiftL ul 4 else 224))
        # 10yyyyyy or 110yyyyy
        (y + (if count > 2 then 128 else 192))
        # 10xxxxxx or 0xxxxxxx
        (x + (if count > 1 then 128 else 0))
      ];
    in string.fromBytes bytes;
  /* Turn a list of Unicode codepoints into a single UTF-8 string by
     encoding each codepoint on its own and joining the results in order.

     Type: [ integer ] -> string
  */
  encode = codepoints:
    lib.concatStrings (builtins.map encodeCodepoint codepoints);
 in {
  inherit
    encode
    decode
    step
    ;
--- a/users/sterni/nix/utf8/tests/default.nix
+++ b/users/sterni/nix/utf8/tests/default.nix
@ -113,9 +113,19 @@ let
        randomUnicode
      ]));
  # Round trip check: every sample string must come back unchanged after
  # being decoded into codepoints and encoded into a string again.
  testDecodingEncoding =
    let
      samples = lib.flatten [
        glassSentences
        randomUnicode
      ];

      roundTrip = str: utf8.encode (utf8.decode str);

      checkSample = str:
        assertEq "Decoding and then encoding “${str}” yields itself"
          (roundTrip str)
          str;
    in
    it "checks that decoding and then encoding forms an identity"
      (builtins.map checkSample samples);
 in
  # Hand all test groups to runTestsuite under the name "nix.utf8";
  # testDecodingEncoding is the decode-then-encode round trip check.
  runTestsuite "nix.utf8" [
    testFailures
    testAscii
    testDecoding
    testDecodingEncoding
  ]