Base on The Complete dbt (Data Build Tool) Bootcamp but with usig ClickHouse as a datastore.
CREATE DATABASE airbnb ON CLUSTER `{cluster}`;
CREATE TABLE airbnb.raw_listings_local ON CLUSTER `{cluster}`
(
`id` Int32,
`listing_url` String,
`name` String,
`room_type` String,
`minimum_nights` Int32,
`host_id` Int32,
`price` String,
`created_at` DateTime,
`updated_at` DateTime
)
ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{cluster}/{shard}/airbnb/raw_listings', '{replica}')
ORDER BY id
;
CREATE TABLE airbnb.raw_listings ON CLUSTER '{cluster}' AS airbnb.raw_listings_local
ENGINE = Distributed('{cluster}', 'airbnb', 'raw_listings_local', intHash64(id))
;
CREATE TABLE IF NOT EXISTS airbnb.raw_reviews_local ON CLUSTER '{cluster}'
(
listing_id Int32,
date DateTime,
reviewer_name String,
comments String,
sentiment String
) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{cluster}/{shard}/airbnb/raw_reviews', '{replica}')
ORDER BY listing_id
;
CREATE TABLE IF NOT EXISTS airbnb.raw_reviews ON CLUSTER '{cluster}' AS airbnb.raw_reviews_local
ENGINE = Distributed('{cluster}', 'airbnb', 'raw_reviews_local', intHash64(listing_id))
;
CREATE TABLE IF NOT EXISTS airbnb.raw_hosts_local ON CLUSTER '{cluster}'
(
id Int32,
name String,
is_superhost String,
created_at DateTime,
updated_at DateTime
) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{cluster}/{shard}/airbnb/raw_hosts', '{replica}')
ORDER BY id
;
CREATE TABLE IF NOT EXISTS airbnb.raw_hosts ON CLUSTER '{cluster}'
AS airbnb.raw_hosts_local
ENGINE = Distributed('{cluster}', 'airbnb', 'raw_hosts_local', intHash64(id))
;
SET date_time_input_format='best_effort';
INSERT INTO airbnb.raw_listings_local SELECT *
FROM s3('s3://dbtlearn/listings.csv', 'CSV')
;
INSERT INTO airbnb.raw_reviews SELECT *
FROM s3('s3://dbtlearn/reviews.csv', 'CSV')
;
INSERT INTO airbnb.raw_hosts SELECT *
FROM s3('s3://dbtlearn/hosts.csv', 'CSV')
;