ClickHouse/ClickBench

Parquet file doesn't mark date and timestamp columns

dcory opened this issue · 3 comments

dcory commented

The Parquet file metadata doesn't include logical type information indicating that, for instance, EventDate is a date and EventTime is a timestamp.

https://github.com/apache/parquet-format/blob/master/LogicalTypes.md

Yes, these Parquet files are just converted from the source data.
It's ok to convert the types during load. Or should we add this information for convenience?

What will be the easiest way to do this?

dcory commented

It would definitely be useful to include this information in the Parquet file so that I don't have to explicitly convert it on load, especially for a stateless table engine.

The same happens with strings: they are stored as plain binary columns that are not marked as Strings.

Apache Pinot is quite strict about this: when importing data from Parquet (which is considerably faster than using TSV), the data is imported not as UTF-8 strings but as byte arrays, because that is what the Parquet schema says.

For example:

> parq hits_01.parquet --schema

 # Schema 
 <pyarrow._parquet.ParquetSchema object at 0x7f1ccc4c4180>
required group field_id=-1 schema {
  optional int64 field_id=-1 WatchID;
  optional int32 field_id=-1 JavaEnable (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 Title;
  optional int32 field_id=-1 GoodEvent (Int(bitWidth=16, isSigned=true));
  optional int64 field_id=-1 EventTime;
  optional int32 field_id=-1 EventDate (Int(bitWidth=16, isSigned=false));
  optional int32 field_id=-1 CounterID;
  optional int32 field_id=-1 ClientIP;
  optional int32 field_id=-1 RegionID;
  optional int64 field_id=-1 UserID;
  optional int32 field_id=-1 CounterClass (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 OS (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 UserAgent (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 URL;
  optional binary field_id=-1 Referer;
  optional int32 field_id=-1 IsRefresh (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 RefererCategoryID (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 RefererRegionID;
  optional int32 field_id=-1 URLCategoryID (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 URLRegionID;
  optional int32 field_id=-1 ResolutionWidth (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 ResolutionHeight (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 ResolutionDepth (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 FlashMajor (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 FlashMinor (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 FlashMinor2;
  optional int32 field_id=-1 NetMajor (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 NetMinor (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 UserAgentMajor (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 UserAgentMinor;
  optional int32 field_id=-1 CookieEnable (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 JavascriptEnable (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 IsMobile (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 MobilePhone (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 MobilePhoneModel;
  optional binary field_id=-1 Params;
  optional int32 field_id=-1 IPNetworkID;
  optional int32 field_id=-1 TraficSourceID (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 SearchEngineID (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 SearchPhrase;
  optional int32 field_id=-1 AdvEngineID (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 IsArtifical (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 WindowClientWidth (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 WindowClientHeight (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 ClientTimeZone (Int(bitWidth=16, isSigned=true));
  optional int64 field_id=-1 ClientEventTime;
  optional int32 field_id=-1 SilverlightVersion1 (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 SilverlightVersion2 (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 SilverlightVersion3;
  optional int32 field_id=-1 SilverlightVersion4 (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 PageCharset;
  optional int32 field_id=-1 CodeVersion;
  optional int32 field_id=-1 IsLink (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 IsDownload (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 IsNotBounce (Int(bitWidth=16, isSigned=true));
  optional int64 field_id=-1 FUniqID;
  optional binary field_id=-1 OriginalURL;
  optional int32 field_id=-1 HID;
  optional int32 field_id=-1 IsOldCounter (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 IsEvent (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 IsParameter (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 DontCountHits (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 WithHash (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 HitColor;
  optional int64 field_id=-1 LocalEventTime;
  optional int32 field_id=-1 Age (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 Sex (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 Income (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 Interests (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 Robotness (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 RemoteIP;
  optional int32 field_id=-1 WindowName;
  optional int32 field_id=-1 OpenerName;
  optional int32 field_id=-1 HistoryLength (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 BrowserLanguage;
  optional binary field_id=-1 BrowserCountry;
  optional binary field_id=-1 SocialNetwork;
  optional binary field_id=-1 SocialAction;
  optional int32 field_id=-1 HTTPError (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 SendTiming;
  optional int32 field_id=-1 DNSTiming;
  optional int32 field_id=-1 ConnectTiming;
  optional int32 field_id=-1 ResponseStartTiming;
  optional int32 field_id=-1 ResponseEndTiming;
  optional int32 field_id=-1 FetchTiming;
  optional int32 field_id=-1 SocialSourceNetworkID (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 SocialSourcePage;
  optional int64 field_id=-1 ParamPrice;
  optional binary field_id=-1 ParamOrderID;
  optional binary field_id=-1 ParamCurrency;
  optional int32 field_id=-1 ParamCurrencyID (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 OpenstatServiceName;
  optional binary field_id=-1 OpenstatCampaignID;
  optional binary field_id=-1 OpenstatAdID;
  optional binary field_id=-1 OpenstatSourceID;
  optional binary field_id=-1 UTMSource;
  optional binary field_id=-1 UTMMedium;
  optional binary field_id=-1 UTMCampaign;
  optional binary field_id=-1 UTMContent;
  optional binary field_id=-1 UTMTerm;
  optional binary field_id=-1 FromTag;
  optional int32 field_id=-1 HasGCLID (Int(bitWidth=16, isSigned=true));
  optional int64 field_id=-1 RefererHash;
  optional int64 field_id=-1 URLHash;
  optional int32 field_id=-1 CLID;
}