Windows encoding error when writing sequence of variable length UTF-8 characters
mattjala opened this issue · 2 comments
Test to replicate the error (on the `windows-latest` runner
or on Windows 10):
def testPutVLenUTF8(self):
    """PUT a non-ASCII string into a dataset typed as a vlen of vlen UTF-8 strings.

    Steps, each checked against the expected HTTP status code:
      1. GET the domain to obtain the root group id (200).
      2. POST a new dataset with shape "H5S_SCALAR" whose type is H5T_VLEN
         over a variable-length UTF-8 string base type (201).
      3. PUT a link named 'dset' from the root group to the dataset (201).
      4. PUT the value "one: \\u4e00" to the dataset (200).

    All request bodies are serialized with json.dumps using its defaults.
    """
    print("testPutVLenUTF8", self.base_domain)
    hdrs = helper.getRequestHeaders(domain=self.base_domain)

    # Fetch the domain and pull out the root group's uuid.
    domain_url = self.endpoint + "/"
    response = self.session.get(domain_url, headers=hdrs)
    self.assertEqual(response.status_code, 200)
    domain_json = json.loads(response.text)
    root_uuid = domain_json["root"]
    helper.validateId(root_uuid)

    # Describe the dataset: scalar shape, vlen type over vlen UTF-8 strings.
    # Key order is kept as in the type/shape description so the serialized
    # request body is unchanged.
    create_body = {
        "type": {
            "class": "H5T_VLEN",
            "base": {
                "charSet": "H5T_CSET_UTF8",
                "class": "H5T_STRING",
                "length": "H5T_VARIABLE",
                "strPad": "H5T_STR_NULLPAD",
            },
        },
        "shape": "H5S_SCALAR",
    }
    datasets_url = self.endpoint + "/datasets"
    response = self.session.post(
        datasets_url, data=json.dumps(create_body), headers=hdrs
    )
    self.assertEqual(response.status_code, 201)  # dataset created
    dset_uuid = json.loads(response.text)["id"]
    self.assertTrue(helper.validateId(dset_uuid))

    # Link the new dataset into the root group under the name 'dset'.
    link_name = "dset"
    link_url = self.endpoint + "/groups/" + root_uuid + "/links/" + link_name
    link_body = {"id": dset_uuid}
    response = self.session.put(
        link_url, data=json.dumps(link_body), headers=hdrs
    )
    self.assertEqual(response.status_code, 201)

    # Write a value containing a non-ASCII (CJK) character.
    text = u"one: \u4e00"
    value_body = {"value": text}
    value_url = self.endpoint + "/datasets/" + dset_uuid + "/value"
    response = self.session.put(
        value_url, data=json.dumps(value_body), headers=hdrs
    )
    self.assertEqual(response.status_code, 200)
Output from HSDS:
Error handling request
Traceback (most recent call last):
File "C:\Users\vboxuser\AppData\Local\Programs\Python\Python310\lib\site-packages\hsds\util\arrayUtil.py", line 419, in readElement
e = np.frombuffer(bytes(e_buffer), dtype=vlen)
ValueError: cannot create an OBJECT array from memory buffer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\vboxuser\AppData\Local\Programs\Python\Python310\lib\site-packages\aiohttp\web_protocol.py", line 452, in _handle_request
resp = await request_handler(request)
File "C:\Users\vboxuser\AppData\Local\Programs\Python\Python310\lib\site-packages\aiohttp\web_app.py", line 543, in _handle
resp = await handler(request)
File "C:\Users\vboxuser\AppData\Local\Programs\Python\Python310\lib\site-packages\hsds\chunk_dn.py", line 277, in PUT_Chunk
input_arr = bytesToArray(input_bytes, select_dt, [num_elements, ])
File "C:\Users\vboxuser\AppData\Local\Programs\Python\Python310\lib\site-packages\hsds\util\arrayUtil.py", line 500, in bytesToArray
offset = readElement(data, offset, arr, index, dt)
File "C:\Users\vboxuser\AppData\Local\Programs\Python\Python310\lib\site-packages\hsds\util\arrayUtil.py", line 422, in readElement
raise ValueError(msg)
ValueError: e_buffer: b'one: \xe4\xb8\x80', dtype: object
got exception: 'charmap' codec can't encode character '\u4e00' in position 36: character maps to <undefined>
shutting down server
This looks like another case of Windows defaulting to the cp1252 (Windows-1252) encoding instead of UTF-8.
Adding `Content-Type: application/json; charset=utf-8`
to the request headers before the request is made fixes this.
If clients want their data to be parsed correctly, it seems they'll have to do this on their side, since the decoding occurs on the server when `request.json()`
is invoked, and that method doesn't take an encoding parameter.
Regardless, HSDS should certainly be able to handle this more gracefully. Wrapping request.json()
in a try-except block doesn't prevent an exception from being raised to the HSDS application level, which is strange.