```yaml
default_args:
  owner: nguyenvantuan140397@gmail.com
  start_date: 2024-06-28
schedule_interval: "0 2 * * *"
tasks:
  score_products:
    operator: BigQueryOperator
    sql: SELECT * FROM `p-dwh.dim.dim_product`
    destination_dataset_table: "p-dwh.dim.test_{{ macros.localtz.ds_nodash(ti) }}"
  update_realtime_cached:
    operator: BigqueryToCassandraOperator
    cassandra_dest_table: "dwh.dim_product"
    sql: |
      SELECT CAST(sku AS STRING) sku, product_fraud_score_new score
      FROM `p-dwh.dim.dim_product`
      WHERE sku IS NOT NULL
    dependencies: ["score_products"]
```
| Operator |
| --- |
| BigqueryToSheetOperator |
| BigQueryToGCSOperator |
| SFTPGetMultipleFilesOperator |
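As an illustration, a task using BigQueryToGCSOperator might be declared with the same task schema as the example above. This is only a sketch: the task name and field names below (`source_project_dataset_table`, `destination_cloud_storage_uris`, `export_format`) are assumed from the upstream operator's parameters, not confirmed keys of this project's config schema:

```yaml
# hypothetical task; field names are illustrative, not confirmed config keys
export_products:
  operator: BigQueryToGCSOperator
  source_project_dataset_table: "p-dwh.dim.dim_product"
  destination_cloud_storage_uris: ["gs://example-bucket/dim_product/*.csv"]  # assumed bucket
  export_format: CSV
  dependencies: ["score_products"]
```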
Install Docker and Docker Compose first.

Set the environment variable:

```bash
export AIRFLOW_UID=50000
```
Build the images with Docker Compose:

```bash
docker-compose build
```

Run the containers:

```bash
docker-compose up
```
View Home Page
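Assuming the compose file keeps the webserver on its default port 8080 (check `docker-compose.yml` if it does not respond), the UI is served at `http://localhost:8080`; a quick liveness check:

```bash
# assumes the default webserver port mapping (8080)
curl http://localhost:8080/health
```

Log in with the default credentials below.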
| Variable | Default value | Role |
| --- | --- | --- |
| AIRFLOW_USER | airflow | admin |
| AIRFLOW_PASS | airflow | admin |
To build and publish the image:

1. Edit the image name
2. Edit the image version
3. Edit the Docker username
4. Run the build script (a sketch of what it likely does follows this list):

```bash
bash build_docker.sh
```
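The script itself is not reproduced here; the following is a minimal sketch of what a build script like this typically does, where `IMAGE_NAME`, `VERSION`, and `DOCKER_USERNAME` stand in for the values edited above (all placeholders, not the real script):

```bash
#!/usr/bin/env bash
# Minimal sketch -- the real build_docker.sh may differ.
set -euo pipefail

IMAGE_NAME="airlake"          # placeholder: the image name edited above
VERSION="1.0.0"               # placeholder: the image version edited above
DOCKER_USERNAME="your-user"   # placeholder: the Docker username edited above

# build the image and push it to the registry
docker build -t "${DOCKER_USERNAME}/${IMAGE_NAME}:${VERSION}" .
docker push "${DOCKER_USERNAME}/${IMAGE_NAME}:${VERSION}"
```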
Install Python 3.8 and create a virtualenv first.
```bash
# in virtualenv
# install core airflow
python3 -m pip install -r requirements_airflow.txt --constraint ./constraints.txt --use-deprecated=legacy-resolver
# spark
python3 -m pip install -r ./requirements_nodeps.txt --constraint ./constraints.txt --no-deps --use-deprecated=legacy-resolver
# extra libs used in airlake
python3 -m pip install -r requirements.txt --constraint ./constraints.txt --use-deprecated=legacy-resolver
```
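To confirm the environment resolved consistently, a quick sanity check:

```bash
# verify the Airflow CLI imports and reports its version
airflow version
```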
Setup Local Airflow

Set up the database:

```bash
export AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@127.0.0.1:5432/airflow
airflow db init
```
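On Airflow 2.x the UI also requires a login user; a minimal sketch, with placeholder values:

```bash
# create an admin user for the UI (Airflow 2.x CLI); all values are placeholders
airflow users create \
  --username airflow \
  --password airflow \
  --firstname Admin \
  --lastname User \
  --role Admin \
  --email admin@example.com
```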
Start the scheduler:

```bash
airflow scheduler
```

Start the webserver:

```bash
airflow webserver
```
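Both commands run in the foreground; to detach them instead, the Airflow CLI accepts a daemon flag:

```bash
# run both processes as background daemons
airflow scheduler -D
airflow webserver -D
```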
This project has a separate contribution file; please follow the steps listed there.
This project is licensed under the Apache License, Version 2.0. See the LICENSE file for more details.
- Email: nguyenvantuan140397@gmail.com
- Telegram: Tuancamtbtx
Please DO NOT edit the code or data in this project.