Apache Spark & Trip Record Data Analysis [Spark Measure]
install homebrew (optional)
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
(echo; echo 'eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"') >> /home/romanzini/.bashrc
eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"
sudo apt-get install build-essential
brew install gcc
pip install virtualenv
virtualenv venv
source venv/bin/activate
sudo apt install openjdk-11-jre-headless
verify local spark installation
pyspark --version
spark-submit --help
http://localhost:4040/jobs/
install required packages
pip install -r requirements.txt
download jar files and move them to config/spark/jars
curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar
curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar
curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar
curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar
mv s3-2.18.41.jar config/spark/jars
mv aws-java-sdk-1.12.367.jar config/spark/jars
mv delta-core_2.12-2.2.0.jar config/spark/jars
mv delta-storage-2.2.0.jar config/spark/jars
create .env file for variables
.env
APP_SRC_PATH=/home/romanzini/projetos/tuning-spark-app/src
APP_STORAGE_PATH=/home/romanzini/projetos/tuning-spark-app/storage
APP_LOG_PATH=/home/romanzini/projetos/tuning-spark-app/logs
APP_METRICS_PATH=/home/romanzini/projetos/tuning-spark-app/metrics
APP_LOGSTASH_PATH=/home/romanzini/projetos/tuning-spark-app/events/
build spark docker images [spark & history server]
docker build -t owshq-spark:3.5 -f Dockerfile.spark .
docker build -t owshq-spark-history-server:3.5 -f Dockerfile.history .
run spark cluster & history server on docker
docker-compose up -d
docker ps
docker logs spark-master
docker logs spark-worker-1
docker logs spark-worker-2
docker logs spark-history-server
download files & save on minio [storage/fhvhv/2022]
https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
download files & save on minio [storage/yelp]
https://www.yelp.com/dataset/download
execute spark application
docker exec -it spark-master /opt/bitnami/spark/bin/spark-submit \
--master spark://spark-master:7077 \
--deploy-mode client \
/opt/bitnami/spark/jobs/elt-rides-fhvhv-py-strawberry.py
access spark master ui & spark history server
http://localhost:8080/
http://localhost:18080/