Correct field projection not working
Fokko opened this issue · 0 comments
Fokko commented
In Iceberg, tables are projected using field IDs. Even if a column is renamed (Iceberg is lazy, so existing data will not be rewritten), a reader should still correctly map the old column in existing files to its new name.
Example:
# Reproduction script: write data, rename a column, write more data, then scan
# the table with DuckDB. Files written before the rename still carry the old
# column name ("n_legs"), so the scan is expected to resolve columns by field
# ID and return all four rows.
import tempfile

import duckdb
import pyarrow as pa
from pyiceberg.catalog.sql import SqlCatalog

# The original snippet used a `warehouse` value that was never defined in it;
# create a scratch directory so the script runs standalone.
warehouse = tempfile.mkdtemp()

catalog = SqlCatalog("test_sql_catalog", uri="sqlite:///:memory:", warehouse=f"/{warehouse}")
catalog.create_namespace("default")

pa_schema = pa.schema([
    pa.field('animal', pa.string(), nullable=False),
    pa.field('n_legs', pa.int32(), nullable=False),
])

# Empty table
tbl = catalog.create_table("default.test_id_projection", schema=pa_schema)

# Write some data (this file is written with the column named "n_legs")
tbl.append(
    df=pa.Table.from_arrays([pa.array(["Flamingo", "Horse"]), pa.array([2, 4])], schema=pa_schema),
)

# Rename the column
with tbl.update_schema() as upd:
    upd.rename_column("n_legs", "number_of_legs")

# Write more data (this file is written with the column named "number_of_legs")
tbl.append(
    df=pa.Table.from_arrays(
        [pa.array(["Brittle stars", "Centipede"]), pa.array([5, 100])],
        schema=pa.schema([
            pa.field('animal', pa.string(), nullable=False),
            pa.field('number_of_legs', pa.int32(), nullable=False),
        ]),
    ),
)

location = tbl.metadata_location

duckdb.sql('INSTALL iceberg; LOAD iceberg;')

# Scan both data files through the table metadata.
result = duckdb.sql(
    f"""
    SELECT *
    FROM iceberg_scan('{location}')
    """
).fetchall()

# Rows from before and after the rename should both be returned.
assert result == [
    ('Flamingo', 2),
    ('Horse', 4),
    ('Brittle stars', 5),
    ('Centipede', 100),
]
This fails with:
> ).fetchall()
E duckdb.duckdb.IOException: IO Error: Failed to read file "//private/var/folders/22/yb9h2zd55ql37h4_50xkmg7r0000gn/T/pytest-of-fokkodriesprong/pytest-8/test_sql0/default.db/test_id_projection/data/00000-0-d24c398f-f93f-420c-82d4-2a66b72c580e.parquet": schema mismatch in glob: column "number_of_legs" was read from the original file "//private/var/folders/22/yb9h2zd55ql37h4_50xkmg7r0000gn/T/pytest-of-fokkodriesprong/pytest-8/test_sql0/default.db/test_id_projection/data/00000-0-f2b7f973-f7df-4cd6-aa7a-78176b3c3d7f.parquet", but could not be found in file "//private/var/folders/22/yb9h2zd55ql37h4_50xkmg7r0000gn/T/pytest-of-fokkodriesprong/pytest-8/test_sql0/default.db/test_id_projection/data/00000-0-d24c398f-f93f-420c-82d4-2a66b72c580e.parquet".
E Candidate names: animal, n_legs
E If you are trying to read files with different schemas, try setting union_by_name=True