mochi/mochiweb

mochijson:decode() doesn't handle surrogate pairs correctly

Opened this issue · 5 comments

mochijson:decode doesn't convert surrogate pairs in JSON into proper unicode characters.

For example (character 💩):

59> mochijson:decode("\"\\ud83d\\udca9\"").
[56489,55357]

But it should instead return:

59> mochijson:decode("\"\\ud83d\\udca9\"").
[128169]

Quick and dirty patch:

diff -uNr ChicagoBoss-orig/deps/mochiweb/src/mochijson.erl ChicagoBoss/deps/mochiweb/src/mochijson.erl
--- ChicagoBoss-orig/deps/mochiweb/src/mochijson.erl    2017-12-28 16:57:09.557338445 +0700
+++ ChicagoBoss/deps/mochiweb/src/mochijson.erl 2017-12-28 17:45:17.673802239 +0700
@@ -306,6 +306,21 @@
             decode_array(Rest, S1#decoder{state=any}, Acc)
     end.
 
+tokenize_string_surrogate_pair(SP1, [$\\, $u, C3, C2, C1, C0 | Rest], S, Acc) ->
+    C = dehex(C0) bor
+        (dehex(C1) bsl 4) bor
+        (dehex(C2) bsl 8) bor 
+        (dehex(C3) bsl 12),
+    if
+        C >= 16#DC00 andalso C =< 16#DFFF ->
+            case catch unicode:characters_to_list(unicode:characters_to_binary(<<SP1:16,C:16>>,utf16,utf8)) of
+                [UnicodeChar] -> tokenize_string(Rest, ?ADV_COL(S, 6), [UnicodeChar | Acc]);
+                _ ->tokenize_string(Rest, ?ADV_COL(S, 6), [SP1, C | Acc])
+            end;
+        true ->
+            tokenize_string(Rest, ?ADV_COL(S, 6), [SP1, C | Acc])
+    end.
+
 tokenize_string(IoList=[C | _], S=#decoder{input_encoding=utf8}, Acc)
   when is_list(C); is_binary(C); C >= 16#7f ->
     List = xmerl_ucs:from_utf8(iolist_to_binary(IoList)),
@@ -334,7 +349,13 @@
         (dehex(C1) bsl 4) bor
         (dehex(C2) bsl 8) bor 
         (dehex(C3) bsl 12),
-    tokenize_string(Rest, ?ADV_COL(S, 6), [C | Acc]);
+    if
+        C >= 16#D800 andalso C =< 16#DBFF ->
+            %% Surrogate pair
+            tokenize_string_surrogate_pair(C, Rest, ?ADV_COL(S, 6), Acc);
+        true ->
+            tokenize_string(Rest, ?ADV_COL(S, 6), [C | Acc])
+    end;
 tokenize_string([C | Rest], S, Acc) when C >= $\s; C < 16#10FFFF ->
     tokenize_string(Rest, ?ADV_COL(S, 1), [C | Acc]).

A pull request with a test would be the preferred way to contribute this fix, if you have the time.

I'm not sure that is actually a proper fix. I don't know Unicode that well.

Have you tried using mochijson2? UTF8 binaries are generally better to work with than lists of code points. mochijson exists only for compatibility reasons.

We are planning to migrate to mochijson2, but at the moment we depend heavily on Unicode strings (with Unicode characters).