arrow_fdw: timestamp型にインデックスを設定すると結果が不正となる。
Closed this issue · 4 comments
0-kaz commented
timestamp型にインデックスをつけてarrowファイルをエクスポート→arrow_fdwで参照すると、スキップしてはいけないチャンクをスキップしてしまい、本来表示されるべき行が表示されない。
発生コミット:27576baa9c3cbd477966ed077311705fffff107e
全体クエリ
CREATE EXTENSION IF NOT EXISTS pg_strom;
DROP TABLE iF EXISTS arrow_index_data;
CREATE TABLE arrow_index_data (
id int,
int_num int,
float_num float,
half_num float2,
decimal_num numeric,
date_num date,
time_num time,
timestamp_num timestamp
);
INSERT INTO arrow_index_data (
SELECT x, -- int_num
pgstrom.random_int(0, -16777216, 16777216), -- int_num
pgstrom.random_float(0,-10000.0,10000.0), -- float_num
pgstrom.random_float(0,-10000.0,10000.0), -- half_num
pgstrom.random_float(0,-10000.0,10000.0), -- decimal_num
pgstrom.random_date(0), -- date_num
pgstrom.random_time(0), -- time_num
pgstrom.random_timestamp(0) -- date_num
FROM generate_series(1,1000000) x);
\! /home/onishi/pg-strom/arrow-tools/pg2arrow -s 16m --set=timezone:Asia/Tokyo -c 'SELECT * FROM public.arrow_index_data ORDER BY timestamp_num' -o /tmp/test_arrow_index.data --stat=timestamp_num -d postgres
DROP FOREIGN TABLE IF EXISTS regtest_arrow;
IMPORT FOREIGN SCHEMA regtest_arrow
FROM SERVER arrow_fdw
INTO public
OPTIONS (file '/tmp/test_arrow_index.data');
EXPLAIN (ANALYZE,COSTS OFF,TIMING OFF,SUMMARY OFF)
SELECT count(*)
FROM regtest_arrow
WHERE timestamp_num between '2019-04-14 09:00:00' and '2023-05-23 17:00:00';
--エクスポート元テーブルの結果を確認
SELECT count(*)
FROM arrow_index_data
WHERE timestamp_num between '2019-04-14 09:00:00' and '2023-05-23 17:00:00';
-- 結果は ↑と一致するはずだが、0行となる
SELECT count(*)
FROM regtest_arrow
WHERE timestamp_num between '2019-04-14 09:00:00' and '2023-05-23 17:00:00';
結果
postgres=# EXPLAIN (ANALYZE,COSTS OFF,TIMING OFF,SUMMARY OFF)
postgres-# SELECT count(*)
postgres-# FROM regtest_arrow
postgres-# WHERE timestamp_num between '2019-04-14 09:00:00' and '2023-05-23 17:00:00';
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Aggregate (actual rows=1 loops=1)
-> Custom Scan (GpuPreAgg) on regtest_arrow (actual rows=0 loops=1)
GPU Projection: pgstrom.nrows()
GPU Scan Quals: ((timestamp_num >= '2019-04-14 09:00:00'::timestamp without time zone) AND (timestamp_num <= '2023-05-23 17:00:00'::timestamp without time zone)) [plan: 1000000 -> 5000, exec: 0 -> 0]
referenced: timestamp_num
Stats-Hint: (timestamp_num >= '2019-04-14 09:00:00'::timestamp without time zone), (timestamp_num <= '2023-05-23 17:00:00'::timestamp without time zone) [loaded: 0, skipped: 4]
file0: /tmp/test_arrow_index.data (read: 7812.69KB, size: 51.50MB)
GPU-Direct SQL: disabled (GPU-0; ntuples=0)
(8 rows)
postgres=#
postgres=# --エクスポート元テーブルの結果を確認
postgres=# SELECT count(*)
postgres-# FROM arrow_index_data
postgres-# WHERE timestamp_num between '2019-04-14 09:00:00' and '2023-05-23 17:00:00';
count
--------
410657
(1 row)
postgres=#
postgres=# -- 結果は ↑と一致するはずだが、0行となる
postgres=# SELECT count(*)
postgres-# FROM regtest_arrow
postgres-# WHERE timestamp_num between '2019-04-14 09:00:00' and '2023-05-23 17:00:00';
count
-------
0
(1 row)
0-kaz commented
pg2arrow --dumpの結果
/home/onishi/pg-strom/arrow-tools/pg2arrow --dump=/tmp/test_arrow_index.data
[Footer]
{Footer: version=V4, schema={Schema: endianness=little, fields=[{Field: name="id", nullable=true, type={Int32}, children=[], custom_metadata=[]}, {Field: name="int_num", nullable=true, type={Int32}, children=[], custom_metadata=[]}, {Field: name="float_num", nullable=true, type={Float64}, children=[], custom_metadata=[]}, {Field: name="half_num", nullable=true, type={Float16}, children=[], custom_metadata=[]}, {Field: name="decimal_num", nullable=true, type={Decimal: precision=30, scale=8, bitWidth=128}, children=[], custom_metadata=[]}, {Field: name="date_num", nullable=true, type={Date: unit=day}, children=[], custom_metadata=[]}, {Field: name="time_num", nullable=true, type={Time: unit=us, bitWidth=64}, children=[], custom_metadata=[]}, {Field: name="timestamp_num", nullable=true, type={Timestamp: unit=us}, children=[], custom_metadata=[{KeyValue: key="min_values" value="1420070744854544,1517975422526492,1616158091214694,1714191403018428"}, {KeyValue: key="max_values" value="1517974702972265,1616157398516247,1714191129294808,1735689542829419"}]}], custom_metadata=[{KeyValue: key="sql_command" value="SELECT * FROM public.arrow_index_data ORDER BY timestamp_num"}], features=[Unused]}, dictionaries=[], recordBatches=[{Block: offset=928, metaDataLength=488 bodyLength=16777664}, {Block: offset=16779080, metaDataLength=488 bodyLength=16777664}, {Block: offset=33557232, metaDataLength=488 bodyLength=16777664}, {Block: offset=50335384, metaDataLength=488 bodyLength=3668544}]}
[Record Batch 0]
{Block: offset=928, metaDataLength=488 bodyLength=16777664}
{Message: version=V4, body={RecordBatch: length=310689, nodes=[{FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}], buffers=[{Buffer: offset=0, length=0}, {Buffer: offset=0, length=1242816}, {Buffer: offset=1242816, length=0}, {Buffer: offset=1242816, length=1242816}, {Buffer: offset=2485632, length=0}, {Buffer: offset=2485632, length=2485568}, {Buffer: offset=4971200, length=0}, {Buffer: offset=4971200, length=621440}, {Buffer: offset=5592640, length=0}, {Buffer: offset=5592640, length=4971072}, {Buffer: offset=10563712, length=0}, {Buffer: offset=10563712, length=1242816}, {Buffer: offset=11806528, length=0}, {Buffer: offset=11806528, length=2485568}, {Buffer: offset=14292096, length=0}, {Buffer: offset=14292096, length=2485568}]}, bodyLength=16777664}
[Record Batch 1]
{Block: offset=16779080, metaDataLength=488 bodyLength=16777664}
{Message: version=V4, body={RecordBatch: length=310689, nodes=[{FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}], buffers=[{Buffer: offset=0, length=0}, {Buffer: offset=0, length=1242816}, {Buffer: offset=1242816, length=0}, {Buffer: offset=1242816, length=1242816}, {Buffer: offset=2485632, length=0}, {Buffer: offset=2485632, length=2485568}, {Buffer: offset=4971200, length=0}, {Buffer: offset=4971200, length=621440}, {Buffer: offset=5592640, length=0}, {Buffer: offset=5592640, length=4971072}, {Buffer: offset=10563712, length=0}, {Buffer: offset=10563712, length=1242816}, {Buffer: offset=11806528, length=0}, {Buffer: offset=11806528, length=2485568}, {Buffer: offset=14292096, length=0}, {Buffer: offset=14292096, length=2485568}]}, bodyLength=16777664}
[Record Batch 2]
{Block: offset=33557232, metaDataLength=488 bodyLength=16777664}
{Message: version=V4, body={RecordBatch: length=310689, nodes=[{FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}], buffers=[{Buffer: offset=0, length=0}, {Buffer: offset=0, length=1242816}, {Buffer: offset=1242816, length=0}, {Buffer: offset=1242816, length=1242816}, {Buffer: offset=2485632, length=0}, {Buffer: offset=2485632, length=2485568}, {Buffer: offset=4971200, length=0}, {Buffer: offset=4971200, length=621440}, {Buffer: offset=5592640, length=0}, {Buffer: offset=5592640, length=4971072}, {Buffer: offset=10563712, length=0}, {Buffer: offset=10563712, length=1242816}, {Buffer: offset=11806528, length=0}, {Buffer: offset=11806528, length=2485568}, {Buffer: offset=14292096, length=0}, {Buffer: offset=14292096, length=2485568}]}, bodyLength=16777664}
[Record Batch 3]
{Block: offset=50335384, metaDataLength=488 bodyLength=3668544}
{Message: version=V4, body={RecordBatch: length=67933, nodes=[{FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}], buffers=[{Buffer: offset=0, length=0}, {Buffer: offset=0, length=271744}, {Buffer: offset=271744, length=0}, {Buffer: offset=271744, length=271744}, {Buffer: offset=543488, length=0}, {Buffer: offset=543488, length=543488}, {Buffer: offset=1086976, length=0}, {Buffer: offset=1086976, length=135872}, {Buffer: offset=1222848, length=0}, {Buffer: offset=1222848, length=1086976}, {Buffer: offset=2309824, length=0}, {Buffer: offset=2309824, length=271744}, {Buffer: offset=2581568, length=0}, {Buffer: offset=2581568, length=543488}, {Buffer: offset=3125056, length=0}, {Buffer: offset=3125056, length=543488}]}, bodyLength=3668544}
0-kaz commented
PGver 16.2で確認しています。
が、もともとver15でも発生していたようです。
pg-strom/test/15/expected/arrow_index.out
Line 415 in 27576ba
kaigai commented
修正しました。
統計情報のヒント、Arrowに埋め込む際にタイムスタンプをArrowに合わせてUNIX EPOCH起点に調節しているのですが、
Stats-Hintとして使用する際にPostgreSQL EPOCH(2000-01-01)に調節していませんでした。
0-kaz commented
修正確認できました。ありがとうございました。