heterodb/pg-strom

arrow_fdw: timestamp型にインデックスを設定すると結果が不正となる。

Closed this issue · 4 comments

timestamp型にインデックスをつけてarrowファイルをエクスポート→arrow_fdwで参照すると、スキップしてはいけないチャンクをスキップしてしまい、本来表示されるべき行が表示されない。

発生コミット:27576baa9c3cbd477966ed077311705fffff107e

全体クエリ

CREATE EXTENSION IF NOT EXISTS pg_strom;
DROP TABLE iF EXISTS arrow_index_data;
CREATE TABLE arrow_index_data (
  id         int,
  int_num         int,
  float_num       float,
  half_num        float2,
  decimal_num     numeric,
  date_num        date,
  time_num        time,
  timestamp_num   timestamp
);

INSERT INTO arrow_index_data (
    SELECT x,   -- int_num
        pgstrom.random_int(0, -16777216, 16777216),   -- int_num
        pgstrom.random_float(0,-10000.0,10000.0),        -- float_num
        pgstrom.random_float(0,-10000.0,10000.0),        -- half_num
        pgstrom.random_float(0,-10000.0,10000.0),        -- decimal_num
        pgstrom.random_date(0),            -- date_num
        pgstrom.random_time(0),              -- time_num
        pgstrom.random_timestamp(0)                     -- date_num
    FROM generate_series(1,1000000) x);


\! /home/onishi/pg-strom/arrow-tools/pg2arrow -s 16m --set=timezone:Asia/Tokyo -c 'SELECT * FROM public.arrow_index_data ORDER BY timestamp_num' -o /tmp/test_arrow_index.data --stat=timestamp_num -d postgres
DROP FOREIGN TABLE IF EXISTS regtest_arrow;
IMPORT FOREIGN SCHEMA regtest_arrow
  FROM SERVER arrow_fdw
  INTO public
OPTIONS (file '/tmp/test_arrow_index.data');


EXPLAIN (ANALYZE,COSTS OFF,TIMING OFF,SUMMARY OFF)
SELECT count(*)
  FROM regtest_arrow
 WHERE timestamp_num between '2019-04-14 09:00:00' and '2023-05-23 17:00:00';

--エクスポート元テーブルの結果を確認
SELECT count(*)
  FROM arrow_index_data
 WHERE timestamp_num between '2019-04-14 09:00:00' and '2023-05-23 17:00:00';

-- 結果は ↑と一致するはずだが、0行となる
SELECT count(*)
  FROM regtest_arrow
 WHERE timestamp_num between '2019-04-14 09:00:00' and '2023-05-23 17:00:00';

結果

postgres=# EXPLAIN (ANALYZE,COSTS OFF,TIMING OFF,SUMMARY OFF)
postgres-# SELECT count(*)
postgres-#   FROM regtest_arrow
postgres-#  WHERE timestamp_num between '2019-04-14 09:00:00' and '2023-05-23 17:00:00';
                                                                                                   QUERY PLAN                                                                                                    
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Aggregate (actual rows=1 loops=1)
   ->  Custom Scan (GpuPreAgg) on regtest_arrow (actual rows=0 loops=1)
         GPU Projection: pgstrom.nrows()
         GPU Scan Quals: ((timestamp_num >= '2019-04-14 09:00:00'::timestamp without time zone) AND (timestamp_num <= '2023-05-23 17:00:00'::timestamp without time zone)) [plan: 1000000 -> 5000, exec: 0 -> 0]
         referenced: timestamp_num
         Stats-Hint: (timestamp_num >= '2019-04-14 09:00:00'::timestamp without time zone), (timestamp_num <= '2023-05-23 17:00:00'::timestamp without time zone)  [loaded: 0, skipped: 4]
         file0: /tmp/test_arrow_index.data (read: 7812.69KB, size: 51.50MB)
         GPU-Direct SQL: disabled (GPU-0; ntuples=0)
(8 rows)

postgres=# 
postgres=# --エクスポート元テーブルの結果を確認
postgres=# SELECT count(*)
postgres-#   FROM arrow_index_data
postgres-#  WHERE timestamp_num between '2019-04-14 09:00:00' and '2023-05-23 17:00:00';
 count  
--------
 410657
(1 row)

postgres=# 
postgres=# -- 結果は ↑と一致するはずだが、0行となる
postgres=# SELECT count(*)
postgres-#   FROM regtest_arrow
postgres-#  WHERE timestamp_num between '2019-04-14 09:00:00' and '2023-05-23 17:00:00';
 count 
-------
     0
(1 row)

pg2arrow --dumpの結果

/home/onishi/pg-strom/arrow-tools/pg2arrow --dump=/tmp/test_arrow_index.data 
[Footer]
{Footer: version=V4, schema={Schema: endianness=little, fields=[{Field: name="id", nullable=true, type={Int32}, children=[], custom_metadata=[]}, {Field: name="int_num", nullable=true, type={Int32}, children=[], custom_metadata=[]}, {Field: name="float_num", nullable=true, type={Float64}, children=[], custom_metadata=[]}, {Field: name="half_num", nullable=true, type={Float16}, children=[], custom_metadata=[]}, {Field: name="decimal_num", nullable=true, type={Decimal: precision=30, scale=8, bitWidth=128}, children=[], custom_metadata=[]}, {Field: name="date_num", nullable=true, type={Date: unit=day}, children=[], custom_metadata=[]}, {Field: name="time_num", nullable=true, type={Time: unit=us, bitWidth=64}, children=[], custom_metadata=[]}, {Field: name="timestamp_num", nullable=true, type={Timestamp: unit=us}, children=[], custom_metadata=[{KeyValue: key="min_values" value="1420070744854544,1517975422526492,1616158091214694,1714191403018428"}, {KeyValue: key="max_values" value="1517974702972265,1616157398516247,1714191129294808,1735689542829419"}]}], custom_metadata=[{KeyValue: key="sql_command" value="SELECT * FROM public.arrow_index_data ORDER BY timestamp_num"}], features=[Unused]}, dictionaries=[], recordBatches=[{Block: offset=928, metaDataLength=488 bodyLength=16777664}, {Block: offset=16779080, metaDataLength=488 bodyLength=16777664}, {Block: offset=33557232, metaDataLength=488 bodyLength=16777664}, {Block: offset=50335384, metaDataLength=488 bodyLength=3668544}]}
[Record Batch 0]
{Block: offset=928, metaDataLength=488 bodyLength=16777664}
{Message: version=V4, body={RecordBatch: length=310689, nodes=[{FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}], buffers=[{Buffer: offset=0, length=0}, {Buffer: offset=0, length=1242816}, {Buffer: offset=1242816, length=0}, {Buffer: offset=1242816, length=1242816}, {Buffer: offset=2485632, length=0}, {Buffer: offset=2485632, length=2485568}, {Buffer: offset=4971200, length=0}, {Buffer: offset=4971200, length=621440}, {Buffer: offset=5592640, length=0}, {Buffer: offset=5592640, length=4971072}, {Buffer: offset=10563712, length=0}, {Buffer: offset=10563712, length=1242816}, {Buffer: offset=11806528, length=0}, {Buffer: offset=11806528, length=2485568}, {Buffer: offset=14292096, length=0}, {Buffer: offset=14292096, length=2485568}]}, bodyLength=16777664}
[Record Batch 1]
{Block: offset=16779080, metaDataLength=488 bodyLength=16777664}
{Message: version=V4, body={RecordBatch: length=310689, nodes=[{FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}], buffers=[{Buffer: offset=0, length=0}, {Buffer: offset=0, length=1242816}, {Buffer: offset=1242816, length=0}, {Buffer: offset=1242816, length=1242816}, {Buffer: offset=2485632, length=0}, {Buffer: offset=2485632, length=2485568}, {Buffer: offset=4971200, length=0}, {Buffer: offset=4971200, length=621440}, {Buffer: offset=5592640, length=0}, {Buffer: offset=5592640, length=4971072}, {Buffer: offset=10563712, length=0}, {Buffer: offset=10563712, length=1242816}, {Buffer: offset=11806528, length=0}, {Buffer: offset=11806528, length=2485568}, {Buffer: offset=14292096, length=0}, {Buffer: offset=14292096, length=2485568}]}, bodyLength=16777664}
[Record Batch 2]
{Block: offset=33557232, metaDataLength=488 bodyLength=16777664}
{Message: version=V4, body={RecordBatch: length=310689, nodes=[{FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}, {FieldNode: length=310689, null_count=0}], buffers=[{Buffer: offset=0, length=0}, {Buffer: offset=0, length=1242816}, {Buffer: offset=1242816, length=0}, {Buffer: offset=1242816, length=1242816}, {Buffer: offset=2485632, length=0}, {Buffer: offset=2485632, length=2485568}, {Buffer: offset=4971200, length=0}, {Buffer: offset=4971200, length=621440}, {Buffer: offset=5592640, length=0}, {Buffer: offset=5592640, length=4971072}, {Buffer: offset=10563712, length=0}, {Buffer: offset=10563712, length=1242816}, {Buffer: offset=11806528, length=0}, {Buffer: offset=11806528, length=2485568}, {Buffer: offset=14292096, length=0}, {Buffer: offset=14292096, length=2485568}]}, bodyLength=16777664}
[Record Batch 3]
{Block: offset=50335384, metaDataLength=488 bodyLength=3668544}
{Message: version=V4, body={RecordBatch: length=67933, nodes=[{FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}, {FieldNode: length=67933, null_count=0}], buffers=[{Buffer: offset=0, length=0}, {Buffer: offset=0, length=271744}, {Buffer: offset=271744, length=0}, {Buffer: offset=271744, length=271744}, {Buffer: offset=543488, length=0}, {Buffer: offset=543488, length=543488}, {Buffer: offset=1086976, length=0}, {Buffer: offset=1086976, length=135872}, {Buffer: offset=1222848, length=0}, {Buffer: offset=1222848, length=1086976}, {Buffer: offset=2309824, length=0}, {Buffer: offset=2309824, length=271744}, {Buffer: offset=2581568, length=0}, {Buffer: offset=2581568, length=543488}, {Buffer: offset=3125056, length=0}, {Buffer: offset=3125056, length=543488}]}, bodyLength=3668544}

PGver 16.2で確認しています。
が、もともとver15でも発生していたようです。

修正しました。

統計情報のヒント、Arrowに埋め込む際にタイムスタンプをArrowに合わせてUNIX EPOCH起点に調節しているのですが、
Stats-Hintとして使用する際にPostgreSQL EPOCH(2000-01-01)に調節していませんでした。

修正確認できました。ありがとうございました。