mochijson:decode() doesn't handle surrogate pairs correctly
Opened this issue · 5 comments
sigsergv commented
mochijson:decode doesn't combine a surrogate pair in JSON (two consecutive \uXXXX escapes) into the single Unicode code point it represents.
For example (character 💩):
59> mochijson:decode("\"\\ud83d\\udca9\"").
[56489,55357]
But it should instead return:
59> mochijson:decode("\"\\ud83d\\udca9\"").
[128169]
sigsergv commented
Quick and dirty patch:
diff -uNr ChicagoBoss-orig/deps/mochiweb/src/mochijson.erl ChicagoBoss/deps/mochiweb/src/mochijson.erl
--- ChicagoBoss-orig/deps/mochiweb/src/mochijson.erl 2017-12-28 16:57:09.557338445 +0700
+++ ChicagoBoss/deps/mochiweb/src/mochijson.erl 2017-12-28 17:45:17.673802239 +0700
@@ -306,6 +306,21 @@
decode_array(Rest, S1#decoder{state=any}, Acc)
end.
+tokenize_string_surrogate_pair(SP1, [$\\, $u, C3, C2, C1, C0 | Rest], S, Acc) ->
+ C = dehex(C0) bor
+ (dehex(C1) bsl 4) bor
+ (dehex(C2) bsl 8) bor
+ (dehex(C3) bsl 12),
+ if
+ C >= 16#DC00 andalso C =< 16#DFFF ->
+ case catch unicode:characters_to_list(unicode:characters_to_binary(<<SP1:16,C:16>>,utf16,utf8)) of
+ [UnicodeChar] -> tokenize_string(Rest, ?ADV_COL(S, 6), [UnicodeChar | Acc]);
+ _ ->tokenize_string(Rest, ?ADV_COL(S, 6), [SP1, C | Acc])
+ end;
+ true ->
+ tokenize_string(Rest, ?ADV_COL(S, 6), [SP1, C | Acc])
+ end.
+
tokenize_string(IoList=[C | _], S=#decoder{input_encoding=utf8}, Acc)
when is_list(C); is_binary(C); C >= 16#7f ->
List = xmerl_ucs:from_utf8(iolist_to_binary(IoList)),
@@ -334,7 +349,13 @@
(dehex(C1) bsl 4) bor
(dehex(C2) bsl 8) bor
(dehex(C3) bsl 12),
- tokenize_string(Rest, ?ADV_COL(S, 6), [C | Acc]);
+ if
+ C >= 16#D800 andalso C =< 16#DBFF ->
+ %% Surrogate pair
+ tokenize_string_surrogate_pair(C, Rest, ?ADV_COL(S, 6), Acc);
+ true ->
+ tokenize_string(Rest, ?ADV_COL(S, 6), [C | Acc])
+ end;
tokenize_string([C | Rest], S, Acc) when C >= $\s; C < 16#10FFFF ->
tokenize_string(Rest, ?ADV_COL(S, 1), [C | Acc]).
etrepum commented
A pull request with a test would be the preferred method of contribution for this, if you have the time
sigsergv commented
I'm not sure that's actually a proper fix. I don't know Unicode that well.
etrepum commented
Have you tried using mochijson2? UTF8 binaries are generally better to work with than lists of code points. mochijson exists only for compatibility reasons.
sigsergv commented
We are planning to migrate to mochijson2, but at the moment we depend heavily on Unicode strings (with Unicode characters).