mochijson:decode() doesn't handle surrogate pairs correctly

Question

mochijson:decode() doesn't handle surrogate pairs correctly

Opened this issue 7 years ago · 5 comments

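mochijson:decode() doesn't combine UTF-16 surrogate pairs written as \uXXXX escapes in JSON into the single Unicode code point they encode; the two surrogate halves are returned as separate list elements instead.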

For example (character 💩, U+1F4A9, i.e. 128169 in decimal):

59> mochijson:decode("\"\\ud83d\\udca9\"").
[56489,55357]

It should instead return:

59> mochijson:decode("\"\\ud83d\\udca9\"").
[128169]

Answer 1 · 2017-12-28T11:07:37.000Z

Quick and dirty patch:

diff -uNr ChicagoBoss-orig/deps/mochiweb/src/mochijson.erl ChicagoBoss/deps/mochiweb/src/mochijson.erl
--- ChicagoBoss-orig/deps/mochiweb/src/mochijson.erl    2017-12-28 16:57:09.557338445 +0700
+++ ChicagoBoss/deps/mochiweb/src/mochijson.erl 2017-12-28 17:45:17.673802239 +0700
@@ -306,6 +306,21 @@
             decode_array(Rest, S1#decoder{state=any}, Acc)
     end.
 
+tokenize_string_surrogate_pair(SP1, [$\\, $u, C3, C2, C1, C0 | Rest], S, Acc) ->
+    C = dehex(C0) bor
+        (dehex(C1) bsl 4) bor
+        (dehex(C2) bsl 8) bor 
+        (dehex(C3) bsl 12),
+    if
+        C >= 16#DC00 andalso C =< 16#DFFF ->
+            case catch unicode:characters_to_list(unicode:characters_to_binary(<<SP1:16,C:16>>,utf16,utf8)) of
+                [UnicodeChar] -> tokenize_string(Rest, ?ADV_COL(S, 6), [UnicodeChar | Acc]);
+                _ ->tokenize_string(Rest, ?ADV_COL(S, 6), [SP1, C | Acc])
+            end;
+        true ->
+            tokenize_string(Rest, ?ADV_COL(S, 6), [SP1, C | Acc])
+    end.
+
 tokenize_string(IoList=[C | _], S=#decoder{input_encoding=utf8}, Acc)
   when is_list(C); is_binary(C); C >= 16#7f ->
     List = xmerl_ucs:from_utf8(iolist_to_binary(IoList)),
@@ -334,7 +349,13 @@
         (dehex(C1) bsl 4) bor
         (dehex(C2) bsl 8) bor 
         (dehex(C3) bsl 12),
-    tokenize_string(Rest, ?ADV_COL(S, 6), [C | Acc]);
+    if
+        C >= 16#D800 andalso C =< 16#DBFF ->
+            %% Surrogate pair
+            tokenize_string_surrogate_pair(C, Rest, ?ADV_COL(S, 6), Acc);
+        true ->
+            tokenize_string(Rest, ?ADV_COL(S, 6), [C | Acc])
+    end;
 tokenize_string([C | Rest], S, Acc) when C >= $\s; C < 16#10FFFF ->
     tokenize_string(Rest, ?ADV_COL(S, 1), [C | Acc]).

Answer 2 · 2017-12-28T15:20:17.000Z

A pull request with a test would be the preferred method of contribution for this, if you have the time.

Answer 3 · 2017-12-28T15:58:07.000Z

I'm not sure that is actually a proper fix. I don't know Unicode that well.

Answer 4 · 2017-12-28T21:51:41.000Z

Have you tried using mochijson2? UTF-8 binaries are generally better to work with than lists of code points. mochijson exists only for compatibility reasons.

Answer 5 · 2017-12-29T02:09:54.000Z

We are planning to migrate to mochijson2, but at the moment we depend heavily on Unicode strings (with Unicode characters).