Invalid utf8 can be (wrongly) decoded into a string
harrysarson opened this issue · 1 comments
harrysarson commented
Bytes.Decode.string
will decode bytes that are not valid UTF-8 and produce a nonsense string. Instead, it should fail. Thanks @jhbrown94 for helping me verify this.
SSCCE
module Main exposing (main)
import Browser
import Html exposing (Html, button, div, text)
import Html.Events exposing (onClick)
import Bytes exposing (Bytes)
import Bytes.Decode as Decode
import Bytes.Encode as Encode
{-| The two-byte input used to reproduce the problem: 0xC0 followed by 0x00.
-}
bytes : Bytes
bytes =
    [ 0xC0, 0 ]
        |> List.map Encode.unsignedInt8
        |> Encode.sequence
        |> Encode.encode
{-| Result of decoding both bytes of `bytes` as a 2-byte string.
-}
string : Maybe String
string =
    bytes
        |> Decode.decode (Decode.string 2)
{-| No-op update: both the message and the model are unit, so the model is
returned unchanged.
-}
update : () -> () -> ()
update _ model =
    model
{-| Show the debug representation of the decode result `string`.
-}
view : () -> Html never
view _ =
    let
        rendered =
            Debug.toString string
    in
    div [] [ div [] [ text rendered ] ]
{-| Entry point: a sandbox program with a unit model and unit messages.
-}
main : Program () () ()
main =
    let
        config =
            { init = ()
            , update = update
            , view = view
            }
    in
    Browser.sandbox config
Prints
Just "\0"
Should Print
Nothing
Ellie
Confirmation that b"\xC0\x00"
is not valid UTF-8
harrysarson commented
(Cross-posting from Slack.) Currently, the kernel code backing string decoding looks like this:
// Decodes bytes from `bytes` (read via `getUint8`) as UTF-8, starting at `offset`
// and continuing until `offset` reaches `offset + len`. Returns the tuple
// (offset after the last byte consumed, decoded string).
//
// NOTE(review): the input is never validated:
//   - continuation bytes are masked with 0x3F without checking they look like 0b10xxxxxx;
//   - any byte >= 128 that matches neither the 2-byte nor the 3-byte lead pattern
//     is treated as a 4-byte lead byte;
//   - the extra bytes of a multi-byte sequence are read without comparing `offset`
//     against `end`, so a sequence can consume bytes at or beyond `end`.
// For example, 0xC0 0x00 takes the 2-byte branch and yields "\0" instead of failing.
var _Bytes_read_string = F3(function(len, bytes, offset)
{
var string = '';
var end = offset + len;
// No increment clause: `offset` is advanced by the `offset++` reads in the body.
for (; offset < end;)
{
var byte = bytes.getUint8(offset++);
string +=
// 1-byte sequence: the byte value is the character code.
(byte < 128)
? String.fromCharCode(byte)
:
// 2-byte sequence (lead byte 0b110xxxxx): 5 bits from the lead byte, 6 from the next byte.
((byte & 0xE0 /* 0b11100000 */) === 0xC0 /* 0b11000000 */)
? String.fromCharCode((byte & 0x1F /* 0b00011111 */) << 6 | bytes.getUint8(offset++) & 0x3F /* 0b00111111 */)
:
// 3-byte sequence (lead byte 0b1110xxxx): 4 bits from the lead byte, 6 from each of the next two bytes.
((byte & 0xF0 /* 0b11110000 */) === 0xE0 /* 0b11100000 */)
? String.fromCharCode(
(byte & 0xF /* 0b00001111 */) << 12
| (bytes.getUint8(offset++) & 0x3F /* 0b00111111 */) << 6
| bytes.getUint8(offset++) & 0x3F /* 0b00111111 */
)
:
// Everything else is handled as a 4-byte sequence. `byte` is reused to hold the
// assembled value minus 0x10000, which the comma expression then splits into a
// surrogate pair (0xD800-based high half, 0xDC00-based low half).
(byte =
((byte & 0x7 /* 0b00000111 */) << 18
| (bytes.getUint8(offset++) & 0x3F /* 0b00111111 */) << 12
| (bytes.getUint8(offset++) & 0x3F /* 0b00111111 */) << 6
| bytes.getUint8(offset++) & 0x3F /* 0b00111111 */
) - 0x10000
, String.fromCharCode(Math.floor(byte / 0x400) + 0xD800, byte % 0x400 + 0xDC00)
);
}
return __Utils_Tuple2(offset, string);
});
An alternative that correctly rejects byte sequences that are not valid UTF-8 would be:
// Reads exactly `len` bytes of `bytes` starting at `offset` and decodes them as
// strict UTF-8. Returns the tuple (offset + len, decoded string).
//
// `fatal: true` makes `decode` throw a TypeError on any invalid UTF-8 sequence
// instead of producing a garbage string.
// NOTE(review): presumably the caller converts a thrown exception into a failed
// decode — confirm against the decode entry point.
//
// `ignoreBOM: true` keeps a leading U+FEFF in the result. By default TextDecoder
// silently drops it, so the decoded string would not match the encoded bytes.
var _Bytes_read_string = F3(function (len, bytes, offset) {
  var decoder = new TextDecoder('utf-8', { fatal: true, ignoreBOM: true });
  // View only the requested slice; the DataView constructor throws a RangeError
  // if the slice does not fit inside the underlying buffer.
  var sliceView = new DataView(bytes.buffer, bytes.byteOffset + offset, len);
  return __Utils_Tuple2(offset + len, decoder.decode(sliceView));
});
(although this version does not work in Internet Explorer, which does not provide TextDecoder)