Support loading Parquet files through ?parquet=
simonw opened this issue · 8 comments
The good news is that this should be possible using fastparquet,
which was added in Pyodide 0.23.0: https://pyodide.org/en/stable/project/changelog.html#version-0-23-0
But... Datasette Lite can't be upgraded to that version of Pyodide yet because of an os.link issue.
Here's a prototype that didn't work because of that os.link issue:
diff --git a/index.html b/index.html
index 5419eea..1c29926 100644
--- a/index.html
+++ b/index.html
@@ -114,9 +114,10 @@ const metadataUrl = fixUrl(urlParams.get('metadata'));
const csvUrls = urlParams.getAll('csv').map(fixUrl);
const sqlUrls = urlParams.getAll('sql').map(fixUrl);
const jsonUrls = urlParams.getAll('json').map(fixUrl);
+const parquetUrls = urlParams.getAll('parquet').map(fixUrl);
const installUrls = urlParams.getAll('install');
-datasetteWorker.postMessage({type: 'startup', initialUrl, csvUrls, sqlUrls, jsonUrls, installUrls, metadataUrl});
+datasetteWorker.postMessage({type: 'startup', initialUrl, csvUrls, sqlUrls, jsonUrls, parquetUrls, installUrls, metadataUrl});
let loadingLogs = ["Loading..."];
diff --git a/webworker.js b/webworker.js
index 657f3ec..4996c2b 100644
--- a/webworker.js
+++ b/webworker.js
@@ -1,4 +1,4 @@
-importScripts("https://cdn.jsdelivr.net/pyodide/v0.20.0/full/pyodide.js");
+importScripts("https://cdn.jsdelivr.net/pyodide/v0.23.0/full/pyodide.js");
function log(line) {
console.log({line})
@@ -10,6 +10,7 @@ async function startDatasette(settings) {
let csvs = [];
let sqls = [];
let jsons = [];
+ let parquets = [];
let needsDataDb = false;
let shouldLoadDefaults = true;
if (settings.initialUrl) {
@@ -32,6 +33,11 @@ async function startDatasette(settings) {
needsDataDb = true;
shouldLoadDefaults = false;
}
+ if (settings.parquetUrls && settings.parquetUrls.length) {
+ parquets = settings.parquetUrls;
+ needsDataDb = true;
+ shouldLoadDefaults = false;
+ }
if (needsDataDb) {
toLoad.push(["data.db", 0]);
}
@@ -40,7 +46,8 @@ async function startDatasette(settings) {
toLoad.push(["content.db", "https://datasette.io/content.db"]);
}
self.pyodide = await loadPyodide({
- indexURL: "https://cdn.jsdelivr.net/pyodide/v0.20.0/full/"
+ indexURL: "https://cdn.jsdelivr.net/pyodide/v0.23.0/full/",
+ fullStdLib: true
});
await pyodide.loadPackage('micropip', log);
await pyodide.loadPackage('ssl', log);
@@ -92,7 +99,8 @@ async function startDatasette(settings) {
# Import data from ?csv=URL CSV files/?json=URL JSON files
csvs = ${JSON.stringify(csvs)}
jsons = ${JSON.stringify(jsons)}
- if csvs or jsons:
+ parquets = ${JSON.stringify(parquets)}
+ if csvs or jsons or parquets:
await micropip.install("sqlite-utils==3.28")
import sqlite_utils, json
from sqlite_utils.utils import rows_from_file, TypeTracker, Format
@@ -157,7 +165,26 @@ async function startDatasette(settings) {
break
assert isinstance(json_data, list), "JSON data must be a list of objects"
db[bit].insert_all(json_data, pk=pk)
-
+ if parquets:
+ await micropip.install("fastparquet")
+ import fastparquet
+ for parquet_url in parquets:
+ # Derive table name from parquet URL
+ bit = parquet_url.split("/")[-1].split(".")[0].split("?")[0]
+ bit = bit.strip()
+ if not bit:
+ bit = "table"
+ prefix = 0
+ base_bit = bit
+ while bit in table_names:
+ prefix += 1
+ bit = "{}_{}".format(base_bit, prefix)
+ table_names.add(bit)
+ response = await pyfetch(parquet_url)
+ with open("parquet.parquet", "wb") as fp:
+ fp.write(await response.bytes())
+ df = fastparquet.ParquetFile("parquet.parquet").to_pandas()
+ db[bit].insert_all(df.to_dict(orient="records"))
from datasette.app import Datasette
ds = Datasette(names, settings={
"num_sql_threads": 0,
I should improve that prototype by fixing the duplicate logic for finding the table names.
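For reference, a minimal sketch of what a shared helper for that could look like - the function name is illustrative, and the table_names set is assumed to be the same one the existing CSV/JSON loading code maintains:
# Derive a unique SQLite table name from a URL, reusing the logic from the
# prototype above; table_names is the set of names already in use.
def unique_table_name(url, table_names):
    # Take the last path component, drop the extension and any query string
    bit = url.split("/")[-1].split(".")[0].split("?")[0].strip()
    if not bit:
        bit = "table"
    # Append _1, _2, ... until the name is unused
    suffix = 0
    base_bit = bit
    while bit in table_names:
        suffix += 1
        bit = "{}_{}".format(base_bit, suffix)
    table_names.add(bit)
    return bit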
Experiment I did in https://pyodide.org/en/stable/console.html
Python 3.11.2 (main, May 3 2023 04:00:05) on WebAssembly/Emscripten
Type "help", "copyright", "credits" or "license" for more information.
>>> import micropip
>>> await micropip.install("fastparquet")
>>> import fastparquet
>>> from pyodide.http import pyfetch
>>> r = await pyfetch("https://raw.githubusercontent.com/kaysush/sample-parquet-files/main/part-00000-a9e77425-5fb4-456f-
ba52-f821123bd193-c000.snappy.parquet")
>>> open("blah.snappy.parquet", "wb").write(await r.bytes())
69287
>>> fp = fastparquet.ParquetFile("blah.snappy.parquet")
>>> fp.columns
['id', 'first_name', 'last_name', 'email', 'gender', 'ip_address', 'cc', 'country', 'birthdate', 'salary', 'title', 'comments']
>>> fp.to_pandas()
id first_name last_name ... salary title comments
0 1 Amanda Jordan ... 49756.53 Internal Auditor 1E+02
1 2 Albert Freeman ... 150280.17 Accountant IV
2 3 Evelyn Morgan ... 144972.51 Structural Engineer
3 4 Denise Riley ... 90263.05 Senior Cost Accountant
4 5 Carlos Burns ... NaN
.. ... ... ... ... ... ... ...
995 996 Dennis Harris ... 263399.54 Editor
996 997 Gloria Hamilton ... 83183.54 VP Product Management
997 998 Nancy Morris ... NaN Junior Executive
998 999 Annie Daniels ... 18433.85 Editor
999 1000 Julie Meyer ... 222561.13
[1000 rows x 12 columns]
>>>
Got a workaround for the os.link issue:
import os
os.link = os.symlink
From:
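Presumably that patch needs to run inside the worker's Python before fastparquet gets installed; a minimal sketch under that assumption, with the surrounding lines taken from the prototype above:
import os
# Pyodide's Emscripten filesystem has no hard links, so alias os.link to
# os.symlink before installing packages that call it
os.link = os.symlink
import micropip
await micropip.install("fastparquet")
import fastparquet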
This example file: https://github.com/Teradata/kylo/blob/master/samples/sample-data/parquet/userdata1.parquet
Gave me this error:
Traceback (most recent call last):
File "/lib/python311.zip/_pyodide/_base.py", line 540, in eval_code_async
await CodeRunner(
File "/lib/python311.zip/_pyodide/_base.py", line 365, in run_async
await coroutine
File "<exec>", line 112, in <module>
File "/lib/python3.11/site-packages/sqlite_utils/db.py", line 2970, in insert_all
self.create(
File "/lib/python3.11/site-packages/sqlite_utils/db.py", line 1469, in create
self.db.create_table(
File "/lib/python3.11/site-packages/sqlite_utils/db.py", line 889, in create_table
sql = self.create_table_sql(
^^^^^^^^^^^^^^^^^^^^^^
File "/lib/python3.11/site-packages/sqlite_utils/db.py", line 835, in create_table_sql
column_type=COLUMN_TYPE_MAPPING[column_type],
~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^
KeyError: <class 'pandas._libs.tslibs.timestamps.Timestamp'>
Using df.to_sql() instead of loading dicts into SQLite seems to have fixed that.
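A minimal sketch of that fix, assuming the data.db file and the downloaded parquet.parquet from the prototype (the table name here is illustrative):
import sqlite3
import fastparquet

# Read the downloaded Parquet file into a DataFrame, then let pandas'
# to_sql() write it into SQLite; this sidesteps the COLUMN_TYPE_MAPPING
# lookup in sqlite-utils that raised KeyError for pandas Timestamp values
df = fastparquet.ParquetFile("parquet.parquet").to_pandas()
conn = sqlite3.connect("data.db")
df.to_sql("userdata1", conn, if_exists="replace", index=False)
conn.close()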