VBA-tools/VBA-JSON

Cyrillic

Opened this issue · 2 comments

Private Function json_Encode(ByVal json_Text As Variant) As String
' Reference: http://www.ietf.org/rfc/rfc4627.txt
' Escape: ", , /, backspace, form feed, line feed, carriage return, tab
Dim json_Index As Long
Dim json_Char As String
Dim json_AscCode As Long
Dim json_Buffer As String
Dim json_BufferPosition As Long
Dim json_BufferLength As Long

For json_Index = 1 To VBA.Len(json_Text)
    json_Char = VBA.Mid$(json_Text, json_Index, 1)
    json_AscCode = VBA.AscW(json_Char)

    ' When AscW returns a negative number, it returns the twos complement form of that number.
    ' To convert the twos complement notation into normal binary notation, add 0xFFF to the return result.
    ' https://support.microsoft.com/en-us/kb/272138
    If json_AscCode < 0 Then
        json_AscCode = json_AscCode + 65536
    End If

    ' From spec, ", \, and control characters must be escaped (solidus is optional)

    Select Case json_AscCode
    Case 34
        ' " -> 34 -> \"
        json_Char = "\"""
    Case 92
        ' \ -> 92 -> \\
        json_Char = "\\"
    Case 47
        ' / -> 47 -> \/ (optional)
        If JsonOptions.EscapeSolidus Then
            json_Char = "\/"
        End If
    Case 8
        ' backspace -> 8 -> \b
        json_Char = "\b"
    Case 12
        ' form feed -> 12 -> \f
        json_Char = "\f"
    Case 10
        ' line feed -> 10 -> \n
        json_Char = "\n"
    Case 13
        ' carriage return -> 13 -> \r
        json_Char = "\r"
    Case 9
        ' tab -> 9 -> \t
        json_Char = "\t"
    Case 1025
      jso_Char = "Ё"
      Case 1040
      jso_Char = "А"
      Case 1041
      jso_Char = "Б"
      Case 1042
      jso_Char = "В"
      Case 1043
      jso_Char = "Г"
      Case 1044
      jso_Char = "Д"
      Case 1045
      jso_Char = "Е"
      Case 1046
      jso_Char = "Ж"
      Case 1047
      jso_Char = "З"
      Case 1048
      jso_Char = "И"
      Case 1049
      jso_Char = "Й"
      Case 1050
      jso_Char = "К"
      Case 1051
      jso_Char = "Л"
      Case 1052
      jso_Char = "М"
      Case 1053
      jso_Char = "Н"
      Case 1054
      jso_Char = "О"
      Case 1055
      jso_Char = "П"
      Case 1056
      jso_Char = "Р"
      Case 1057
      jso_Char = "С"
      Case 1058
      jso_Char = "Т"
      Case 1059
      jso_Char = "У"
      Case 1060
      jso_Char = "Ф"
      Case 1061
      jso_Char = "Х"
      Case 1062
      jso_Char = "Ц"
      Case 1063
      jso_Char = "Ч"
      Case 1064
      jso_Char = "Ш"
      Case 1065
      jso_Char = "Щ"
      Case 1066
      jso_Char = "Ъ"
      Case 1067
      jso_Char = "Ы"
      Case 1068
      jso_Char = "Ь"
      Case 1069
      jso_Char = "Э"
      Case 1070
      jso_Char = "Ю"
      Case 1071
      jso_Char = "Я"
      Case 1072
      jso_Char = "а"
      Case 1073
      jso_Char = "б"
      Case 1074
      jso_Char = "в"
      Case 1075
      jso_Char = "г"
      Case 1076
      jso_Char = "д"
      Case 1077
      jso_Char = "е"
      Case 1078
      jso_Char = "ж"
      Case 1079
      jso_Char = "з"
      Case 1080
      jso_Char = "и"
      Case 1081
      jso_Char = "й"
      Case 1082
      jso_Char = "к"
      Case 1083
      jso_Char = "л"
      Case 1084
      jso_Char = "м"
      Case 1085
      jso_Char = "н"
      Case 1086
      jso_Char = "о"
      Case 1087
      jso_Char = "п"
      Case 1088
      jso_Char = "р"
      Case 1089
      jso_Char = "с"
      Case 1090
      jso_Char = "т"
      Case 1091
      jso_Char = "у"
      Case 1092
      jso_Char = "ф"
      Case 1093
      jso_Char = "х"
      Case 1094
      jso_Char = "ц"
      Case 1095
      jso_Char = "ч"
      Case 1096
      jso_Char = "ш"
      Case 1097
      jso_Char = "щ"
      Case 1098
      jso_Char = "ъ"
      Case 1099
      jso_Char = "ы"
      Case 1100
      jso_Char = "ь"
      Case 1101
      jso_Char = "э"
      Case 1102
      jso_Char = "ю"
      Case 1103
      jso_Char = "я"
      Case 1105
      jso_Char = "ё"
    Case 0 To 31, 127 To 65535
        ' Non-ascii characters -> convert to 4-digit hex
        json_Char = "\u" & VBA.Right$("0000" & VBA.Hex$(json_AscCode), 4)
    End Select

    json_BufferAppend json_Buffer, json_Char, json_BufferPosition, json_BufferLength
Next json_Index

json_Encode = json_BufferToString(json_Buffer, json_BufferPosition)

End Function

json_Encodegets called for every character of a string (including keys). So a long list of character translations is going to give a performance hit. You could have avoided this by using a range test and moving the translations to a function:
Case 1025 To 1105
json_Char = Cyrillic(json_AscCode)
Case …

However, you missed thatjson_Charis already the character that you need:
Case 1025 To 1105
Case …

But this is a long-standing issue - see issues #238 and #37 and pull request #168.

=============
if the above solves your problem, please close your issue here.

Correction 1: json_Encodeis not currently called for keys. It clearly should be and pull request #122 covers that.

Correction 2: json_Encodeis not called for every string character. It is the loop inside which goes one by one.

My comment about a long list of Cases can also be applied to the current code. Several optimisations are possible.

  • Firstly the function should test rapidly if the text string only contains ASCII non-control and non-escaped characters. If so, the text string can be returned immediately (without the loop and without buffering).
  • In the Select, the most common case (no encoding needed) should be tested as early as possible.
  • There are faster ways of handling the ASCII escaped characters.
  • The function uses buffering to cope with the expansion of the text string. A pre-allocated buffer of at least the text size will avoid many buffer extensions.

If implemented on top of pull request #168, the function becomes:

Private Function json_Encode(ByRef json_Text As Variant) As String
    ' Reference: http://www.ietf.org/rfc/rfc4627.txt
    ' Escape: ", \, /, backspace, form feed, line feed, carriage return, tab
    Dim json_Index As Long
    Dim json_Char As String
    Dim json_AscCode As Long
    Dim json_Buffer As String
    Dim json_BufferPosition As Long
    Dim json_BufferLength As Long
    Dim json_CodeLimit As Long
    Dim json_Escapes As String
    Const HasNonASCgraphic As String = "*[! -~]*"
        
    ' Control escaping
        ' backspace         -> 8  -> \b
        ' tab               -> 9  -> \t
        ' line feed         -> 10 -> \n
        ' (vert tab         -> 11 -> \v - not in standard)
        ' form feed         -> 12 -> \f
        ' carriage return   -> 13 -> \r
        ' DEL and C1 controls (rarely used) are treated like other high Unicode characters
    Const ControlXlate = "btnvfr"
    
    ' Check if any encoding needed
    json_Escapes = """\" & IIf(JsonOptions.EscapeSolidus, "/", vbNullString)
    If Not (json_Text Like HasNonASCgraphic Or json_Text Like "*[" & json_Escapes & "]*") Then
        json_Encode = json_Text
        Exit Function
    End If

    ' We have to scan the text and encode (expand) one or more characters.
    ' We already know the minimum size of the buffer required.
    ' Guess that the expansion will be less than 20 characters.
    ' Pre-allocate the buffer.
    json_BufferLength = VBA.Len(json_Text) + 20
    json_Buffer = VBA.Space$(json_BufferLength)
    
    json_CodeLimit = IIf(JsonOptions.AllowUnicodeChars, 65535, 126)
    For json_Index = 1 To VBA.Len(json_Text)
        json_Char = VBA.Mid$(json_Text, json_Index, 1)
        json_AscCode = VBA.AscW(json_Char)

        ' When AscW returns a negative number, it returns the twos complement form of that number.
        ' To convert the twos complement notation into normal binary notation, add 0xFFF to the return result.
        ' https://support.microsoft.com/en-us/kb/272138
        If json_AscCode < 0 Then
            json_AscCode = json_AscCode + 65536
        End If

        ' From spec, ", \, and control characters must be escaped (solidus is optional)

        Select Case json_AscCode
            ' No special action for UTF-16 surrogate pairs (&HDB00 To &HDFFF)
        Case 32 To json_CodeLimit
            If VBA.InStr(json_Escapes, json_Char) <> 0 Then json_Char = "\" & json_Char
        Case 8 To 10, 12 To 13
            json_Char = "\" & VBA.Mid$(ControlXlate, json_AscCode - 7, 1)
        Case Else
            ' Non-ascii characters etc -> convert to 4-digit hex
            json_Char = "\u" & VBA.Right$("0000" & VBA.Hex$(json_AscCode), 4)
        End Select

        json_BufferAppend json_Buffer, json_Char, json_BufferPosition, json_BufferLength
    Next json_Index

    json_Encode = json_BufferToString(json_Buffer, json_BufferPosition)
End Function