[BigDL 2.0] Orca examples on k8s
sgwhat opened this issue · 11 comments
sgwhat commented
Module | Example | Added | Client Mode | Cluster Mode |
---|---|---|---|---|
bigdl | learn/bigdl/attention/transformer.py | Y | Succeed | Succeed |
bigdl | learn/bigdl/imageInference/imageInference.py | Y | Succeed | |
horovod | learn/horovod/pytorch_estimator.py | Y | Succeed | Succeed |
horovod | simple_horovod_pytorch.py | Y | Succeed | |
mxnet | learn/mxnet/lenet_mnist.py | Y | Succeed | |
openvino | learn/openvino/predict.py | Y | Failed | |
ray_on_spark | ray_on_spark/parameter_server/async_parameter_server.py | Y | Succeed | |
ray_on_spark | ray_on_spark/parameter_server/sync_parameter_server.py | Y | Succeed | |
ray_on_spark | ray_on_spark/rl_pong/rl_pong.py | Y | Succeed | Failed |
ray_on_spark | ray_on_spark/rllib/multiagent_two_trainers.py | Y | Succeed | Succeed |
tfpark | tfpark/estimator/estimator_dataset.py | Y | Succeed | Failed |
tfpark | tfpark/estimator/estimator_inception.py | Y | Succeed | Failed |
tfpark | tfpark/gan/gan_train_and_evaluate.py | Y | Succeed | |
tfpark | tfpark/keras/keras_dataset.py | Y | Succeed | |
tfpark | tfpark/keras/keras_ndarray.py | Y | Succeed | |
tfpark | tfpark/tf_optimizer/evaluate.py | Y | Succeed | |
tfpark | tfpark/tf_optimizer/train.py | Y | Succeed | |
sgwhat commented
transformer.py
Client Command
echo "################## start transformer.py client "
starttime=`date +'%Y-%m-%d %H:%M:%S'`
${SPARK_HOME}/bin/spark-submit \
--master ${RUNTIME_SPARK_MASTER} \
--deploy-mode client \
--conf spark.kubernetes.authenticate.driver.serviceAccountName=${RUNTIME_K8S_SERVICE_ACCOUNT} \
--name bigdl2-transformer \
--conf spark.kubernetes.container.image=${RUNTIME_K8S_SPARK_IMAGE} \
--conf spark.executor.instances=${RUNTIME_EXECUTOR_INSTANCES} \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.${RUNTIME_PERSISTENT_VOLUME_CLAIM}.options.claimName=${RUNTIME_PERSISTENT_VOLUME_CLAIM} \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.${RUNTIME_PERSISTENT_VOLUME_CLAIM}.mount.path=/tmp \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.${RUNTIME_PERSISTENT_VOLUME_CLAIM}.options.claimName=${RUNTIME_PERSISTENT_VOLUME_CLAIM} \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.${RUNTIME_PERSISTENT_VOLUME_CLAIM}.mount.path=/tmp \
--conf spark.kubernetes.driver.label.az=true \
--conf spark.kubernetes.executor.label.az=true \
--conf spark.kubernetes.node.selector.spark=true \
--conf spark.kubernetes.driverEnv.http_proxy=${http_proxy} \
--conf spark.kubernetes.driverEnv.https_proxy=${https_proxy} \
--conf spark.kubernetes.executorEnv.http_proxy=${http_proxy} \
--conf spark.kubernetes.executorEnv.https_proxy=${https_proxy} \
--conf spark.kubernetes.container.image.pullPolicy=Always \
--executor-cores ${RUNTIME_EXECUTOR_CORES} \
--executor-memory ${RUNTIME_EXECUTOR_MEMORY} \
--total-executor-cores ${RUNTIME_TOTAL_EXECUTOR_CORES} \
--driver-cores ${RUNTIME_DRIVER_CORES} \
--driver-memory ${RUNTIME_DRIVER_MEMORY} \
--properties-file ${BIGDL_HOME}/conf/spark-bigdl.conf \
--py-files local://${BIGDL_HOME}/python/bigdl-friesian-spark_${SPARK_VERSION}-${BIGDL_VERSION}-python-api.zip,local://${BIGDL_HOME}/python/bigdl-serving-spark_${SPARK_VERSION}-${BIGDL_VERSION}-python-api.zip,local://${BIGDL_HOME}/python/bigdl-dllib-spark_${SPARK_VERSION}-${BIGDL_VERSION}-python-api.zip,local://${BIGDL_HOME}/python/bigdl-orca-spark_${SPARK_VERSION}-${BIGDL_VERSION}-python-api.zip,local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/learn/bigdl/attention/transformer.py \
--conf spark.driver.extraJavaOptions=-Dderby.stream.error.file=/tmp \
--conf spark.sql.catalogImplementation='in-memory' \
--conf spark.driver.extraClassPath=local://${BIGDL_HOME}/jars/bigdl-orca-spark_${SPARK_VERSION}-${BIGDL_VERSION}-jar-with-dependencies.jar:local://${BIGDL_HOME}/jars/bigdl-dllib-spark_${SPARK_VERSION}-${BIGDL_VERSION}-jar-with-dependencies.jar:local://${BIGDL_HOME}/jars/bigdl-friesian-spark_${SPARK_VERSION}-${BIGDL_VERSION}-jar-with-dependencies.jar \
--conf spark.executor.extraClassPath=local://${BIGDL_HOME}/jars/bigdl-orca-spark_${SPARK_VERSION}-${BIGDL_VERSION}-jar-with-dependencies.jar:local://${BIGDL_HOME}/jars/bigdl-dllib-spark_${SPARK_VERSION}-${BIGDL_VERSION}-jar-with-dependencies.jar:local://${BIGDL_HOME}/jars/bigdl-friesian-spark_${SPARK_VERSION}-${BIGDL_VERSION}-jar-with-dependencies.jar \
local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/learn/bigdl/attention/transformer.py
endtime=`date +'%Y-%m-%d %H:%M:%S'`
start_seconds=$(date --date="$starttime" +%s);
end_seconds=$(date --date="$endtime" +%s);
echo "################## end transformer.py client "
echo "run time is: "$((end_seconds-start_seconds))"s"
Cluster Command
echo "################## start transformer.py cluster"
starttime=`date +'%Y-%m-%d %H:%M:%S'`
${SPARK_HOME}/bin/spark-submit \
--master ${RUNTIME_SPARK_MASTER} \
--deploy-mode cluster \
--conf spark.kubernetes.authenticate.driver.serviceAccountName=${RUNTIME_K8S_SERVICE_ACCOUNT} \
--name bigdl2-transformer \
--conf spark.kubernetes.container.image=${RUNTIME_K8S_SPARK_IMAGE} \
--conf spark.executor.instances=${RUNTIME_EXECUTOR_INSTANCES} \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.${RUNTIME_PERSISTENT_VOLUME_CLAIM}.options.claimName=${RUNTIME_PERSISTENT_VOLUME_CLAIM} \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.${RUNTIME_PERSISTENT_VOLUME_CLAIM}.mount.path=/tmp \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.${RUNTIME_PERSISTENT_VOLUME_CLAIM}.options.claimName=${RUNTIME_PERSISTENT_VOLUME_CLAIM} \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.${RUNTIME_PERSISTENT_VOLUME_CLAIM}.mount.path=/tmp \
--conf spark.kubernetes.driver.label.az=true \
--conf spark.kubernetes.executor.label.az=true \
--conf spark.kubernetes.node.selector.spark=true \
--conf spark.kubernetes.driverEnv.http_proxy=${http_proxy} \
--conf spark.kubernetes.driverEnv.https_proxy=${https_proxy} \
--conf spark.kubernetes.executorEnv.http_proxy=${http_proxy} \
--conf spark.kubernetes.executorEnv.https_proxy=${https_proxy} \
--conf spark.kubernetes.container.image.pullPolicy=Always \
--executor-cores ${RUNTIME_EXECUTOR_CORES} \
--executor-memory ${RUNTIME_EXECUTOR_MEMORY} \
--total-executor-cores ${RUNTIME_TOTAL_EXECUTOR_CORES} \
--driver-cores ${RUNTIME_DRIVER_CORES} \
--driver-memory ${RUNTIME_DRIVER_MEMORY} \
--properties-file ${BIGDL_HOME}/conf/spark-bigdl.conf \
--py-files local://${BIGDL_HOME}/python/bigdl-friesian-spark_${SPARK_VERSION}-${BIGDL_VERSION}-python-api.zip,local://${BIGDL_HOME}/python/bigdl-serving-spark_${SPARK_VERSION}-${BIGDL_VERSION}-python-api.zip,local://${BIGDL_HOME}/python/bigdl-dllib-spark_${SPARK_VERSION}-${BIGDL_VERSION}-python-api.zip,local://${BIGDL_HOME}/python/bigdl-orca-spark_${SPARK_VERSION}-${BIGDL_VERSION}-python-api.zip,local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/learn/bigdl/attention/transformer.py \
--conf spark.driver.extraJavaOptions=-Dderby.stream.error.file=/tmp \
--conf spark.sql.catalogImplementation='in-memory' \
--conf spark.driver.extraClassPath=local://${BIGDL_HOME}/jars/bigdl-orca-spark_${SPARK_VERSION}-${BIGDL_VERSION}-jar-with-dependencies.jar:local://${BIGDL_HOME}/jars/bigdl-dllib-spark_${SPARK_VERSION}-${BIGDL_VERSION}-jar-with-dependencies.jar:local://${BIGDL_HOME}/jars/bigdl-friesian-spark_${SPARK_VERSION}-${BIGDL_VERSION}-jar-with-dependencies.jar \
--conf spark.executor.extraClassPath=local://${BIGDL_HOME}/jars/bigdl-orca-spark_${SPARK_VERSION}-${BIGDL_VERSION}-jar-with-dependencies.jar:local://${BIGDL_HOME}/jars/bigdl-dllib-spark_${SPARK_VERSION}-${BIGDL_VERSION}-jar-with-dependencies.jar:local://${BIGDL_HOME}/jars/bigdl-friesian-spark_${SPARK_VERSION}-${BIGDL_VERSION}-jar-with-dependencies.jar \
local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/learn/bigdl/attention/transformer.py
endtime=`date +'%Y-%m-%d %H:%M:%S'`
start_seconds=$(date --date="$starttime" +%s);
end_seconds=$(date --date="$endtime" +%s);
echo "################## end transformer.py cluster"
echo "run time is: "$((end_seconds-start_seconds))"s"
Client Exception
Traceback (most recent call last):
File "/opt/bigdl-0.14.0-SNAPSHOT/examples/orca/learn/bigdl/attention/transformer.py", line 98, in <module>
epochs=1)
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-friesian-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/orca/learn/bigdl/estimator.py", line 181, in fit
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-friesian-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/dllib/estimator/estimator.py", line 145, in train
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-friesian-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/dllib/utils/file_utils.py", line 164, in callZooFunc
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-friesian-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/dllib/utils/file_utils.py", line 158, in callZooFunc
File "/opt/work/spark-3.1.2/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__
File "/opt/work/spark-3.1.2/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o69.estimatorTrain.
: java.lang.StackOverflowError
Cluster Exception
Traceback (most recent call last):
File "/opt/bigdl-0.14.0-SNAPSHOT/examples/orca/learn/bigdl/attention/transformer.py", line 19, in <module>
import numpy as np
ModuleNotFoundError: No module named 'numpy'
sgwhat commented
ImageInference
Client Exception
Cluster Exception
Traceback (most recent call last):
File "/opt/bigdl-0.14.0-SNAPSHOT/examples/orca/learn/openvino/predict.py", line 19, in <module>
import numpy as np
ModuleNotFoundError: No module named 'numpy'
sgwhat commented
Openvino
Client Exception
from openvino.inference_engine import IECore
ModuleNotFoundError: No module named 'openvino'
sgwhat commented
Mxnet
Client Exception
OSError: libcudart.so.9.0: cannot open shared object file: No such file or directory
sgwhat commented
Transformer
Client Exception
Traceback (most recent call last):
File "/opt/bigdl-0.14.0-SNAPSHOT/examples/orca/learn/bigdl/attention/transformer.py", line 100, in <module>
epochs=1)
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/orca/learn/bigdl/estimator.py", line 181, in fit
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/dllib/estimator/estimator.py", line 145, in train
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/dllib/utils/file_utils.py", line 164, in callZooFunc
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/dllib/utils/file_utils.py", line 158, in callZooFunc
File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__
File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o74.estimatorTrain.
: java.lang.StackOverflowError
sgwhat commented
Tfpark Gan
Command
echo "################## start tfpark/tf_optimizer/evaluate.py cluster "
starttime=`date +'%Y-%m-%d %H:%M:%S'`
/opt/spark/bin/spark-submit \
--master k8s://https://127.0.0.1:8443 \
--deploy-mode cluster \
--conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
--name test-bigdl2-cluster-tfpark_est \
--conf spark.kubernetes.container.image=10.239.45.10/arda/intelanalytics/bigdl-k8s-spark-3.1.2:0.14.0-SNAPSHOT \
--conf spark.executor.instances=1 \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.nfsvolumeclaim.options.claimName=nfsvolumeclaim \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.nfsvolumeclaim.mount.path=/bigdl2.0/data \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.nfsvolumeclaim.options.claimName=nfsvolumeclaim \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.nfsvolumeclaim.mount.path=/bigdl2.0/data \
--conf spark.kubernetes.executor.deleteOnTermination=false \
--conf spark.kubernetes.driver.label.az=true \
--conf spark.kubernetes.executor.label.az=true \
--conf spark.kubernetes.node.selector.spark=true \
--conf spark.kubernetes.driverEnv.http_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.driverEnv.https_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.executorEnv.http_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.executorEnv.https_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.container.image.pullPolicy=Always \
--conf spark.pyspark.driver.python=/usr/local/envs/pytf1/bin/python \
--conf spark.pyspark.python=/usr/local/envs/pytf1/bin/python \
--conf spark.executorEnv.PYTHONHOME=/usr/local/envs/pytf1 \
--executor-cores 4 \
--executor-memory 50g \
--total-executor-cores 32 \
--driver-cores 4 \
--driver-memory 50g \
--properties-file /opt/bigdl-0.14.0-SNAPSHOT/conf/spark-bigdl.conf \
--py-files local:///opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-orca-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip,local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/tf_optimizer/evaluate.py \
--conf spark.driver.extraJavaOptions=-Dderby.stream.error.file=/tmp \
--conf spark.sql.catalogImplementation='in-memory' \
--conf spark.driver.extraClassPath=local:///opt/bigdl-0.14.0-SNAPSHOT/jars/* \
--conf spark.executor.extraClassPath=local:///opt/bigdl-0.14.0-SNAPSHOT/jars/* \
local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/tf_optimizer/evaluate.py 1000
Cluster Exception
ModuleNotFoundError: No module named 'nets'
sgwhat commented
rl_pong
Command
echo "################## start ray_on_spark/rl_pong/rl_pong.py cluster "
starttime=`date +'%Y-%m-%d %H:%M:%S'`
/opt/spark/bin/spark-submit \
--master k8s://https://127.0.0.1:8443 \
--deploy-mode cluster \
--conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
--name test-bigdl2-cluster-transformer \
--conf spark.kubernetes.container.image=10.239.45.10/arda/intelanalytics/bigdl-k8s-spark-3.1.2:0.14.0-SNAPSHOT \
--conf spark.executor.instances=1 \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.nfsvolumeclaim.options.claimName=nfsvolumeclaim \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.nfsvolumeclaim.mount.path=/bigdl2.0/data \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.nfsvolumeclaim.options.claimName=nfsvolumeclaim \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.nfsvolumeclaim.mount.path=/bigdl2.0/data \
--conf spark.kubernetes.executor.deleteOnTermination=false \
--conf spark.kubernetes.driver.label.az=true \
--conf spark.kubernetes.executor.label.az=true \
--conf spark.kubernetes.node.selector.spark=true \
--conf spark.kubernetes.driverEnv.http_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.driverEnv.https_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.executorEnv.http_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.executorEnv.https_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.container.image.pullPolicy=Always \
--conf spark.pyspark.driver.python=/usr/local/envs/pytf1/bin/python \
--conf spark.pyspark.python=/usr/local/envs/pytf1/bin/python \
--conf spark.executorEnv.PYTHONHOME=/usr/local/envs/pytf1 \
--executor-cores 4 \
--executor-memory 50g \
--total-executor-cores 32 \
--driver-cores 4 \
--driver-memory 50g \
--properties-file /opt/bigdl-0.14.0-SNAPSHOT/conf/spark-bigdl.conf \
--py-files local:///opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-orca-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip,local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/ray_on_spark/rl_pong/rl_pong.py \
--conf spark.executor.extraJavaOptions=-Xss512m \
--conf spark.driver.extraJavaOptions=-Xss512m \
--conf spark.sql.catalogImplementation='in-memory' \
--conf spark.driver.extraClassPath=local:///opt/bigdl-0.14.0-SNAPSHOT/jars/* \
--conf spark.executor.extraClassPath=local:///opt/bigdl-0.14.0-SNAPSHOT/jars/* \
local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/ray_on_spark/rl_pong/rl_pong.py
Cluster Exception
Exception: ROM is missing for pong
ManfeiBai commented
tfpark/tf_optimizer/evaluate.py cluster
Command
echo "################## start tfpark/tf_optimizer/evaluate.py cluster "
starttime=`date +'%Y-%m-%d %H:%M:%S'`
docker exec -i $CONTAINER_NAME bash -c "export PYTHONHOME=/usr/local/envs/pytf1 && \
/opt/spark/bin/spark-submit \
--master k8s://https://127.0.0.1:8443 \
--deploy-mode cluster \
--conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
--name bigdl2-basic_text_classification \
--conf spark.kubernetes.container.image=10.239.45.10/arda/intelanalytics/bigdl-k8s-spark-3.1.2:0.14.0-SNAPSHOT \
--conf spark.executor.instances=1 \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.nfsvolumeclaim.options.claimName=nfsvolumeclaim \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.nfsvolumeclaim.mount.path=/bigdl2.0/data \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.nfsvolumeclaim.options.claimName=nfsvolumeclaim \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.nfsvolumeclaim.mount.path=/bigdl2.0/data \
--conf spark.kubernetes.driver.label.az=true \
--conf spark.kubernetes.executor.label.az=true \
--conf spark.kubernetes.node.selector.spark=true \
--conf spark.kubernetes.driverEnv.http_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.driverEnv.https_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.executorEnv.http_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.executorEnv.https_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.container.image.pullPolicy=Always \
--conf spark.pyspark.driver.python=/usr/local/envs/pytf1/bin/python \
--conf spark.pyspark.python=/usr/local/envs/pytf1/bin/python \
--conf spark.executorEnv.PYTHONHOME=/usr/local/envs/pytf1 \
--executor-cores 16 \
--executor-memory 50g \
--total-executor-cores 64 \
--driver-cores 4 \
--driver-memory 50g \
--properties-file /opt/bigdl-0.14.0-SNAPSHOT/conf/spark-bigdl.conf \
--py-files local:///opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip,local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/tf_optimizer/evaluate.py \
--conf spark.driver.extraJavaOptions=-Dderby.stream.error.file=/tmp \
--conf spark.sql.catalogImplementation='in-memory' \
--conf spark.driver.extraClassPath=local:///opt/bigdl-0.14.0-SNAPSHOT/jars/* \
--conf spark.executor.extraClassPath=local:///opt/bigdl-0.14.0-SNAPSHOT/jars/* \
local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/tf_optimizer/evaluate.py \
--data_path /bigdl2.0/data/MNIST"
endtime=`date +'%Y-%m-%d %H:%M:%S'`
start_seconds=$(date --date="$starttime" +%s);
end_seconds=$(date --date="$endtime" +%s);
echo "################## end tfpark/tf_optimizer/evaluate.py cluster"
echo "run time is: "$((end_seconds-start_seconds))"s"
Exception
Traceback (most recent call last):
File "/opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/tf_optimizer/evaluate.py", line 26, in <module>
from nets import lenet
ModuleNotFoundError: No module named 'nets'
ManfeiBai commented
tfpark/tf_optimizer/train.py cluster
Command
echo "################## start tfpark/tf_optimizer/train.py cluster "
starttime=`date +'%Y-%m-%d %H:%M:%S'`
docker exec -i $CONTAINER_NAME bash -c "export PYTHONHOME=/usr/local/envs/pytf1 && \
/opt/spark/bin/spark-submit \
--master k8s://https://127.0.0.1:8443 \
--deploy-mode cluster \
--conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
--name bigdl2-basic_text_classification \
--conf spark.kubernetes.container.image=10.239.45.10/arda/intelanalytics/bigdl-k8s-spark-3.1.2:0.14.0-SNAPSHOT \
--conf spark.executor.instances=1 \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.nfsvolumeclaim.options.claimName=nfsvolumeclaim \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.nfsvolumeclaim.mount.path=/bigdl2.0/data \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.nfsvolumeclaim.options.claimName=nfsvolumeclaim \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.nfsvolumeclaim.mount.path=/bigdl2.0/data \
--conf spark.kubernetes.driver.label.az=true \
--conf spark.kubernetes.executor.label.az=true \
--conf spark.kubernetes.node.selector.spark=true \
--conf spark.kubernetes.driverEnv.http_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.driverEnv.https_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.executorEnv.http_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.executorEnv.https_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.container.image.pullPolicy=Always \
--conf spark.pyspark.driver.python=/usr/local/envs/pytf1/bin/python \
--conf spark.pyspark.python=/usr/local/envs/pytf1/bin/python \
--conf spark.executorEnv.PYTHONHOME=/usr/local/envs/pytf1 \
--executor-cores 16 \
--executor-memory 50g \
--total-executor-cores 64 \
--driver-cores 4 \
--driver-memory 50g \
--properties-file /opt/bigdl-0.14.0-SNAPSHOT/conf/spark-bigdl.conf \
--py-files local:///opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip,local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/tf_optimizer/train.py \
--conf spark.driver.extraJavaOptions=-Dderby.stream.error.file=/tmp \
--conf spark.sql.catalogImplementation='in-memory' \
--conf spark.driver.extraClassPath=local:///opt/bigdl-0.14.0-SNAPSHOT/jars/* \
--conf spark.executor.extraClassPath=local:///opt/bigdl-0.14.0-SNAPSHOT/jars/* \
local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/tf_optimizer/train.py"
endtime=`date +'%Y-%m-%d %H:%M:%S'`
start_seconds=$(date --date="$starttime" +%s);
end_seconds=$(date --date="$endtime" +%s);
echo "################## end tfpark/tf_optimizer/train.py cluster"
echo "run time is: "$((end_seconds-start_seconds))"s"
Exception
Traceback (most recent call last):
File "/opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/tf_optimizer/train.py", line 27, in <module>
from nets import lenet
ModuleNotFoundError: No module named 'nets'
ManfeiBai commented
estimator/estimator_dataset.py
Command
echo "################## start estimator/estimator_dataset.py cluster "
starttime=`date +'%Y-%m-%d %H:%M:%S'`
docker exec -i $CONTAINER_NAME bash -c "export PYTHONHOME=/usr/local/envs/pytf1 && \
/opt/spark/bin/spark-submit \
--master k8s://https://127.0.0.1:8443 \
--deploy-mode cluster \
--conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
--name bigdl2-basic_text_classification \
--conf spark.kubernetes.container.image=10.239.45.10/arda/intelanalytics/bigdl-k8s-spark-3.1.2:0.14.0-SNAPSHOT \
--conf spark.executor.instances=1 \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.nfsvolumeclaim.options.claimName=nfsvolumeclaim \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.nfsvolumeclaim.mount.path=/bigdl2.0/data \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.nfsvolumeclaim.options.claimName=nfsvolumeclaim \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.nfsvolumeclaim.mount.path=/bigdl2.0/data \
--conf spark.kubernetes.driver.label.az=true \
--conf spark.kubernetes.executor.label.az=true \
--conf spark.kubernetes.node.selector.spark=true \
--conf spark.kubernetes.driverEnv.http_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.driverEnv.https_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.executorEnv.http_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.executorEnv.https_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.container.image.pullPolicy=Always \
--conf spark.pyspark.driver.python=/usr/local/envs/pytf1/bin/python \
--conf spark.pyspark.python=/usr/local/envs/pytf1/bin/python \
--conf spark.executorEnv.PYTHONHOME=/usr/local/envs/pytf1 \
--executor-cores 16 \
--executor-memory 50g \
--total-executor-cores 64 \
--driver-cores 4 \
--driver-memory 50g \
--properties-file /opt/bigdl-0.14.0-SNAPSHOT/conf/spark-bigdl.conf \
--py-files local:///opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip,local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/estimator/estimator_dataset.py \
--conf spark.driver.extraJavaOptions=-Dderby.stream.error.file=/tmp \
--conf spark.sql.catalogImplementation='in-memory' \
--conf spark.driver.extraClassPath=local:///opt/bigdl-0.14.0-SNAPSHOT/jars/* \
--conf spark.executor.extraClassPath=local:///opt/bigdl-0.14.0-SNAPSHOT/jars/* \
local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/estimator/estimator_dataset.py"
endtime=`date +'%Y-%m-%d %H:%M:%S'`
start_seconds=$(date --date="$starttime" +%s);
end_seconds=$(date --date="$endtime" +%s);
echo "################## end estimator/estimator_dataset.py cluster"
echo "run time is: "$((end_seconds-start_seconds))"s"
Exception
Downloading data from http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
270336/9912422 [..............................] - ETA: 58974s
Traceback (most recent call last):
File "/opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/estimator/estimator_dataset.py", line 78, in <module>
main()
File "/opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/estimator/estimator_dataset.py", line 65, in main
estimator.train(input_fn, steps=10)
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/orca/tfpark/estimator.py", line 142, in train
File "/usr/local/envs/pytf1/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1116, in _call_input_fn
return input_fn(**kwargs)
File "/opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/estimator/estimator_dataset.py", line 53, in input_fn
training_data = get_data("train")
File "/opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/estimator/estimator_dataset.py", line 26, in get_data
(images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", dataset)
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/dllib/feature/dataset/mnist.py", line 101, in read_data_sets
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/dllib/feature/dataset/base.py", line 194, in maybe_download
File "/usr/local/envs/pytf1/lib/python3.7/urllib/request.py", line 288, in urlretrieve
% (read, size), result)
urllib.error.ContentTooShortError: <urlopen error retrieval incomplete: got only 262190 out of 9912422 bytes>
ManfeiBai commented
estimator/estimator_inception.py
Command
echo "################## start estimator/estimator_inception.py cluster "
starttime=`date +'%Y-%m-%d %H:%M:%S'`
docker exec -i $CONTAINER_NAME bash -c "export PYTHONHOME=/usr/local/envs/pytf1 && \
/opt/spark/bin/spark-submit \
--master k8s://https://127.0.0.1:8443 \
--deploy-mode cluster \
--conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
--name bigdl2-basic_text_classification \
--conf spark.kubernetes.container.image=10.239.45.10/arda/intelanalytics/bigdl-k8s-spark-3.1.2:0.14.0-SNAPSHOT \
--conf spark.executor.instances=1 \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.nfsvolumeclaim.options.claimName=nfsvolumeclaim \
--conf spark.kubernetes.driver.volumes.persistentVolumeClaim.nfsvolumeclaim.mount.path=/bigdl2.0/data \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.nfsvolumeclaim.options.claimName=nfsvolumeclaim \
--conf spark.kubernetes.executor.volumes.persistentVolumeClaim.nfsvolumeclaim.mount.path=/bigdl2.0/data \
--conf spark.kubernetes.driver.label.az=true \
--conf spark.kubernetes.executor.label.az=true \
--conf spark.kubernetes.node.selector.spark=true \
--conf spark.kubernetes.driverEnv.http_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.driverEnv.https_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.executorEnv.http_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.executorEnv.https_proxy=http://child-prc.intel.com:913 \
--conf spark.kubernetes.container.image.pullPolicy=Always \
--conf spark.pyspark.driver.python=/usr/local/envs/pytf1/bin/python \
--conf spark.pyspark.python=/usr/local/envs/pytf1/bin/python \
--conf spark.executorEnv.PYTHONHOME=/usr/local/envs/pytf1 \
--executor-cores 16 \
--executor-memory 50g \
--total-executor-cores 64 \
--driver-cores 4 \
--driver-memory 50g \
--properties-file /opt/bigdl-0.14.0-SNAPSHOT/conf/spark-bigdl.conf \
--py-files local:///opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip,local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/estimator/estimator_inception.py \
--conf spark.driver.extraJavaOptions=-Dderby.stream.error.file=/tmp \
--conf spark.sql.catalogImplementation='in-memory' \
--conf spark.driver.extraClassPath=local:///opt/bigdl-0.14.0-SNAPSHOT/jars/* \
--conf spark.executor.extraClassPath=local:///opt/bigdl-0.14.0-SNAPSHOT/jars/* \
local:///opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/estimator/estimator_inception.py \
--image-path /bigdl2.0/data/imagenet/train/n02085620 \
--num-classes 2 \
--batch_size 64"
endtime=`date +'%Y-%m-%d %H:%M:%S'`
start_seconds=$(date --date="$starttime" +%s);
end_seconds=$(date --date="$endtime" +%s);
echo "################## end estimator/estimator_inception.py cluster"
echo "run time is: "$((end_seconds-start_seconds))"s"
Exception
Traceback (most recent call last):
File "/opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/estimator/estimator_inception.py", line 93, in <module>
main(options)
File "/opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/estimator/estimator_inception.py", line 82, in main
estimator.train(input_fn, steps=100)
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/orca/tfpark/estimator.py", line 142, in train
File "/usr/local/envs/pytf1/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1116, in _call_input_fn
return input_fn(**kwargs)
File "/opt/bigdl-0.14.0-SNAPSHOT/examples/orca/tfpark/estimator/estimator_inception.py", line 36, in input_fn
sc=sc, with_label=True, one_based_label=False)
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/dllib/feature/image/imageset.py", line 84, in read
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/dllib/utils/file_utils.py", line 164, in callZooFunc
File "/opt/bigdl-0.14.0-SNAPSHOT/python/bigdl-spark_3.1.2-0.14.0-SNAPSHOT-python-api.zip/bigdl/dllib/utils/file_utils.py", line 158, in callZooFunc
File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__
File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o76.readImageSet.
: java.lang.IllegalArgumentException: Can not create a Path from an empty string
at org.apache.hadoop.fs.Path.checkPathArg(Path.java:126)
at org.apache.hadoop.fs.Path.<init>(Path.java:134)
at org.apache.hadoop.util.StringUtils.stringToPath(StringUtils.java:245)
at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(FileInputFormat.java:469)
at org.apache.spark.SparkContext.$anonfun$binaryFiles$1(SparkContext.scala:1017)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.SparkContext.withScope(SparkContext.scala:786)
at org.apache.spark.SparkContext.binaryFiles(SparkContext.scala:1012)
at com.intel.analytics.bigdl.dllib.feature.image.ImageSet$.readToDistributedImageSet(ImageSet.scala:267)
at com.intel.analytics.bigdl.dllib.feature.image.ImageSet$.read(ImageSet.scala:243)
at com.intel.analytics.bigdl.dllib.feature.python.PythonImageFeature.readImageSet(PythonImageFeature.scala:65)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)