Installation-of-Caffe-in-DGX

Installation instructions for Caffe

Docker container usage & Installation of CUDA 9.0

  1. Show the images on the DGX server
nvidia-docker images
  1. Create a container from the image you choose, mapping the local folder into it
nvidia-docker run -v /mnt/SA5_v1share/:/workspace/ -it --ipc host --name (your_name)-caffe (image_name)
  1. Start and reattach to the previously created container
nvidia-docker start (your_name)-caffe
nvidia-docker attach (your_name)-caffe
  1. Download and install CUDA 9.0 (sh cuda_9.0.176_384.81_linux.run). Do NOT install the NVIDIA Accelerated Graphics Driver (the first choice the installer offers)
  2. Add these lines to ~/.bashrc, then source it
export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
export PATH=/usr/local/cuda-9.0/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH

Installation of cuDNN for CUDA 9.0

  1. Download first
  2. Decompress the file
  3. Run the following commands in bash, starting from the directory where the archive was decompressed
cd cuda/include
cp cudnn.h /usr/local/cuda/include

cd ../lib64
cp lib* /usr/local/cuda/lib64/    
cd /usr/local/cuda/lib64/
rm -rf libcudnn.so libcudnn.so.7     
ln -s libcudnn.so.7.0.5 libcudnn.so.7  
ln -s libcudnn.so.7 libcudnn.so
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
  1. Install the runtime library
dpkg -i libcudnn7_7.0.5.15-1+cuda9.0_amd64.deb
  1. Install the developer library
dpkg -i libcudnn7-dev_7.0.5.15-1+cuda9.0_amd64.deb

Installation of NCCL

git clone https://github.com/NVIDIA/nccl.git
cd nccl
make install -j

Installation of Caffe

  1. Download it
git clone https://github.com/BVLC/caffe.git
cd caffe
  1. In Makefile.config
cp Makefile.config.example Makefile.config
vim Makefile.config

from

#USE_CUDNN := 1
# USE_LEVELDB := 0
# USE_LMDB := 0
#WITH_PYTHON_LAYER := 1
# USE_NCCL := 1
INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include
LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib 

CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
                -gencode arch=compute_20,code=sm_21 \
                -gencode arch=compute_30,code=sm_30 \
                -gencode arch=compute_35,code=sm_35 \
                -gencode arch=compute_50,code=sm_50 \
                -gencode arch=compute_52,code=sm_52 \
                -gencode arch=compute_60,code=sm_60 \
                -gencode arch=compute_61,code=sm_61 \
                -gencode arch=compute_61,code=compute_61

to

USE_CUDNN := 1
USE_LEVELDB := 1
USE_LMDB := 1
WITH_PYTHON_LAYER := 1
USE_NCCL := 1
INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include /usr/include/hdf5/serial
LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib /usr/lib/x86_64-linux-gnu /usr/lib/x86_64-linux-gnu/hdf5/serial

CUDA_ARCH :=    -gencode arch=compute_30,code=sm_30 \
                -gencode arch=compute_35,code=sm_35 \
                -gencode arch=compute_50,code=sm_50 \
                -gencode arch=compute_52,code=sm_52 \
                -gencode arch=compute_60,code=sm_60 \
                -gencode arch=compute_61,code=sm_61 \
                -gencode arch=compute_61,code=compute_61

  1. In Makefile

from

NVCCFLAGS += -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)

to

NVCCFLAGS += -D_FORCE_INLINES -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)
  1. Compile
sudo cp /usr/local/cuda-9.0/lib64/libcudart.so.9.0 /usr/local/lib/libcudart.so.9.0 && sudo ldconfig
sudo cp /usr/local/cuda-9.0/lib64/libcublas.so.9.0 /usr/local/lib/libcublas.so.9.0 && sudo ldconfig
sudo cp /usr/local/cuda-9.0/lib64/libcurand.so.9.0 /usr/local/lib/libcurand.so.9.0 && sudo ldconfig
make all -j
make test -j
make runtest -j