feat(sterni/nix/utf8): check if codepoint valid/encodeable
* Enforce the U+0000 to U+10FFFF range in `count` and throw an error if the given codepoint exceeds the range (encoding U+0000 won't work of course, but this is Nix's fault…).
* Check if the produced bytes are well formed and output an error if not. This indicates that the codepoint can't be encoded as UTF-8, like U+D800 which is reserved for UTF-16.

Change-Id: I18336e527484580f28cbfe784d51718ee15c5477
This commit is contained in:
parent
8dc54f89cd
commit
750ef6c693
2 changed files with 42 additions and 4 deletions
|
|
@ -55,13 +55,23 @@ let
|
|||
# Test helper: takes a list of hexadecimal strings (one per byte, e.g. "F0"),
# converts each with int.fromHex, builds a string from the resulting values
# via string.fromBytes and passes that string to utf8.decode.
hexDecode = l:
|
||||
utf8.decode (string.fromBytes (builtins.map int.fromHex l));
|
||||
|
||||
testFailures = it "checks UTF-8 decoding failures" [
|
||||
# Test helper: takes a list of hexadecimal strings, converts each with
# int.fromHex and passes the resulting list of integers to utf8.encode.
hexEncode = l: utf8.encode (builtins.map int.fromHex l);
|
||||
|
||||
testFailures = it "checks UTF-8 decoding failures" ([
|
||||
(assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ]))
|
||||
# examples from The Unicode Standard
|
||||
(assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ]))
|
||||
(assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ]))
|
||||
(assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ])
|
||||
];
|
||||
(assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ]))
|
||||
(assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ]))
|
||||
] ++ builtins.genList (i:
|
||||
let
|
||||
cp = i + int.fromHex "D800";
|
||||
in
|
||||
assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}"
|
||||
(utf8.encode [ cp ])
|
||||
# The UTF-16 surrogate block U+D800..U+DFFF contains 0x0800 (2048) code
# points. builtins.genList produces indices 0..n-1, so the count has to be
# 0x0800 for the last surrogate U+DFFF to be covered as well.
) (int.fromHex "0800"));
|
||||
|
||||
testAscii = it "checks decoding of ascii strings"
|
||||
(builtins.map (s: assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue