Apache Spark & Trip Record Data Analysis [Spark Measure]

Roadmap

install homebrew (optional)

/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"

(echo; echo 'eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"') >> ~/.bashrc
eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"

sudo apt-get install build-essential

brew install gcc

verify python

python3 --version

install & create env

pip install virtualenv

virtualenv venv

activate

source venv/bin/activate

deactivate

deactivate

install java

sudo apt install openjdk-11-jre-headless

verify local spark installation

pyspark --version

spark-submit --help
http://localhost:4040/jobs/

install required packages

pip install -r requirements.txt

download jar files and move them to config/spark/jars

curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar 
curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar 
curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar 
curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar 
mv s3-2.18.41.jar config/spark/jars 
mv aws-java-sdk-1.12.367.jar config/spark/jars 
mv delta-core_2.12-2.2.0.jar config/spark/jars 
mv delta-storage-2.2.0.jar config/spark/jars

create .env file for variables

.env

APP_SRC_PATH=/home/romanzini/projetos/tuning-spark-app/src
APP_STORAGE_PATH=/home/romanzini/projetos/tuning-spark-app/storage
APP_LOG_PATH=/home/romanzini/projetos/tuning-spark-app/logs
APP_METRICS_PATH=/home/romanzini/projetos/tuning-spark-app/metrics
APP_LOGSTASH_PATH=/home/romanzini/projetos/tuning-spark-app/events/

build spark docker images [spark & history server]

docker build -t owshq-spark:3.5 -f Dockerfile.spark . 
docker build -t owshq-spark-history-server:3.5 -f Dockerfile.history .

run spark cluster & history server on docker

docker-compose up -d
docker ps

docker logs spark-master
docker logs spark-worker-1
docker logs spark-worker-2
docker logs spark-history-server

download files & save on minio [storage/fhvhv/2022]

https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

download files & save on minio [storage/yelp]

https://www.yelp.com/dataset/download

execute spark application

docker exec -it spark-master /opt/bitnami/spark/bin/spark-submit \
  --master spark://spark-master:7077 \
  --deploy-mode client \
  /opt/bitnami/spark/jobs/elt-rides-fhvhv-py-strawberry.py

access spark history server

http://localhost:8080/
http://localhost:18080/

access MinIO UI

http://localhost:9001/

tear down resources

docker-compose down