$ python generator.py > input
# Generate a 700MB file with random words
$ cd $HADOOP_HOME
$ bin/hdfs namenode -format
$ sbin/start-all.sh
$ bin/hadoop fs -mkdir -p /user/$USUARIO/
$ bin/hadoop fs -put input
$ bin/mapred streaming \
-input input \
-output output1 \
-mapper mapper.py \
-reducer reducer.py \
-file reducer.py -file mapper.py \
&& \
bin/mapred streaming \
-input output1 \
-output output2 \
-mapper mapper2.py \
-reducer reducer2.py \
-file reducer2.py -file mapper2.py
$ cd $SPARK_HOME
$ sbin/start-master.sh
$ sbin/start-worker.sh spark://MASTER_HOST:MASTER_PORT
$ bin/spark-submit $PROJECT_PATH/sparkOtimized.py
** Antes de executar o spark-submit, altere a variável hdfs_path nos arquivos spark*.py para o caminho do HDFS