specifying the cuda compute capability
cwsmith opened this issue · 4 comments
Prior to 976a340 the `Omega_h_CUDA_ARCH` flag was needed by cmake to set the CUDA compute capability (e.g., `arch=sm_70`). How should the compute capability be specified now? I did some searching around the cmake documentation and did not see anything obvious that happens when `enable_language(cuda)` is called.
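For context, my pre-976a340 configure included something like the following (from memory, so the exact `Omega_h_CUDA_ARCH` value format is a guess):

```
cmake ../omega_h/ \
  -DOmega_h_USE_CUDA=on \
  -DOmega_h_CUDA_ARCH=sm_70 \
  ...
```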
My cmake command for v9.32.0 is

```
cmake ../omega_h/ \
  -DBUILD_SHARED_LIBS=OFF \
  -DOmega_h_USE_CUDA=on \
  -DOmega_h_USE_MPI=on \
  -DCMAKE_CXX_COMPILER=/opt/ibm/spectrum_mpi/bin/mpicxx \
  -DOmega_h_USE_Kokkos=ON \
  -DKokkos_PREFIX=../build-kokkos-dcs-gcc74-cuda/install/lib/CMake/
```
and `make VERBOSE=1` indicates `nvcc` is used for compilation and `mpicxx` (which runs `nvcc_wrapper`) is used for linking. My initial guess is that the cuda arch flags are expected to come from Kokkos, but how that info gets passed to `nvcc` is unclear.
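As a generic sanity check (not Omega_h-specific; a sketch), one can compile a trivial kernel with no explicit arch flag and ask `cuobjdump` which arch ended up embedded in the object:

```
# build a do-nothing kernel with no explicit -arch flag
echo '__global__ void k() {}' > /tmp/k.cu
/usr/local/cuda-10.2/bin/nvcc -c /tmp/k.cu -o /tmp/k.o
# list the embedded ELF images; each reports the sm_XX it was compiled for
/usr/local/cuda-10.2/bin/cuobjdump --list-elf /tmp/k.o
```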
Below is some selected output from `make VERBOSE=1` showing the compile and link lines.
```
[ 91%] Building CUDA object src/CMakeFiles/osh_part.dir/osh_part.cpp.o
cd /gpfs/u/home/MPFS/MPFSsmth/barn-shared/cws/software/build-omegah-dcs-gcc74-cuda/src && /usr/local/cuda-10.2/bin/nvcc -I/gpfs/u/home/MPFS/MPFSsmth/barn-shared/cws/software/omega_h/src -I/gpfs/u/home/MPFS/MPFSsmth/barn-shared/cws/software/build-omegah-dcs-gcc74-cuda/src -I/gpfs/u/home/MPFS/MPFSsmth/barn-shared/cws/software/build-kokkos-dcs-gcc74-cuda/install/include -isystem=/usr/local/cuda-10.2/include --compiler-options -W,-Wall,-Wextra,-Werror,-Wno-noexcept-type --Werror cross-execution-space-call,deprecated-declarations --expt-extended-lambda -x cu -c /gpfs/u/home/MPFS/MPFSsmth/barn-shared/cws/software/omega_h/src/osh_part.cpp -o CMakeFiles/osh_part.dir/osh_part.cpp.o
[ 92%] Linking CXX executable osh_part
cd /gpfs/u/home/MPFS/MPFSsmth/barn-shared/cws/software/build-omegah-dcs-gcc74-cuda/src && /gpfs/u/software/dcs-spack-install/v0133gccSpectrum/linux-rhel7-power9le/gcc-7.4.0-1/cmake-3.15.4-mnqjvz6b3h6nmhnotnteqmxautpfm5vv/bin/cmake -E cmake_link_script CMakeFiles/osh_part.dir/link.txt --verbose=1
/opt/ibm/spectrum_mpi/bin/mpicxx CMakeFiles/osh_part.dir/osh_part.cpp.o -o osh_part -Wl,-rpath,::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: libomega_h.a /gpfs/u/home/MPFS/MPFSsmth/barn-shared/cws/software/build-kokkos-dcs-gcc74-cuda/install/lib/libkokkos.a /usr/lib64/libdl.so /usr/lib64/librt.so /usr/local/cuda-10.2/lib64/libcudart.so -lcuda /usr/lib64/libz.so
#$ _NVVM_BRANCH_=nvvm
#$ _SPACE_=
#$ _CUDART_=cudart
#$ _HERE_=/usr/local/cuda-10.2/bin
#$ _THERE_=/usr/local/cuda-10.2/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_DIR_=targets/ppc64le-linux
#$ TOP=/usr/local/cuda-10.2/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda-10.2/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda-10.2/bin/../lib:/usr/local/cuda-10.2/lib64:/gpfs/u/software/ppc64le-rhel7/gcc/7.4.0/1/lib64:/opt/ibm/spectrum_mpi/lib
#$ PATH=/usr/local/cuda-10.2/bin/../nvvm/bin:/usr/local/cuda-10.2/bin:/usr/local/cuda-10.2/bin:/gpfs/u/software/dcs-spack-install/v0133gccSpectrum/linux-rhel7-power9le/gcc-7.4.0-1/cmake-3.15.4-mnqjvz6b3h6nmhnotnteqmxautpfm5vv/bin:/gpfs/u/software/ppc64le-rhel7/gcc/7.4.0/1/bin:/opt/ibm/spectrum_mpi/bin:/usr/lpp/mmfs/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/ibutils/bin:/gpfs/u/home/MPFS/MPFSsmth/.local/bin:/gpfs/u/home/MPFS/MPFSsmth/bin
#$ INCLUDES="-I/usr/local/cuda-10.2/bin/../targets/ppc64le-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda-10.2/bin/../targets/ppc64le-linux/lib/stubs" "-L/usr/local/cuda-10.2/bin/../targets/ppc64le-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ nvlink --arch=sm_35 --register-link-binaries="/tmp/tmpxft_00014144_00000000-2_osh_part_dlink.reg.c" -m64 -L"/opt/ibm/spectrum_mpi/lib" -lcuda -lmpiprofilesupport -lmpi_ibm "-L/usr/local/cuda-10.2/bin/../targets/ppc64le-linux/lib/stubs" "-L/usr/local/cuda-10.2/bin/../targets/ppc64le-linux/lib" -cpu-arch=PPC64LE "CMakeFiles/osh_part.dir/osh_part.cpp.o" "libomega_h.a" "/gpfs/u/home/MPFS/MPFSsmth/barn-shared/cws/software/build-kokkos-dcs-gcc74-cuda/install/lib/libkokkos.a" "/usr/lib64/libdl.so" "/usr/lib64/librt.so" "/usr/local/cuda-10.2/lib64/libcudart.so" "/usr/lib64/libz.so" -lcudadevrt -o "/tmp/tmpxft_00014144_00000000-4_osh_part_dlink.sm_35.cubin"
#$ fatbinary -64 -link "--image3=kind=elf,sm=35,file=/tmp/tmpxft_00014144_00000000-4_osh_part_dlink.sm_35.cubin" --embedded-fatbin="/tmp/tmpxft_00014144_00000000-3_osh_part_dlink.fatbin.c"
#$ rm /tmp/tmpxft_00014144_00000000-3_osh_part_dlink.fatbin
#$ g++ -c -x c++ -DFATBINFILE="\"/tmp/tmpxft_00014144_00000000-3_osh_part_dlink.fatbin.c\"" -DREGISTERLINKBINARYFILE="\"/tmp/tmpxft_00014144_00000000-2_osh_part_dlink.reg.c\"" -I. -D__NV_EXTRA_INITIALIZATION= -D__NV_EXTRA_FINALIZATION= -D__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ -I"/opt/ibm/spectrum_mpi/include" "-I/usr/local/cuda-10.2/bin/../targets/ppc64le-linux/include" -D__CUDACC_VER_MAJOR__=10 -D__CUDACC_VER_MINOR__=2 -D__CUDACC_VER_BUILD__=89 "/usr/local/cuda-10.2/bin/crt/link.stub" -o "/tmp/tmpxft_00014144_00000000-5_osh_part_dlink.o"
#$ g++ -Wl,--start-group "/tmp/tmpxft_00014144_00000000-5_osh_part_dlink.o" "CMakeFiles/osh_part.dir/osh_part.cpp.o" "libomega_h.a" "/gpfs/u/home/MPFS/MPFSsmth/barn-shared/cws/software/build-kokkos-dcs-gcc74-cuda/install/lib/libkokkos.a" "/usr/lib64/libdl.so" "/usr/lib64/librt.so" "/usr/local/cuda-10.2/lib64/libcudart.so" "/usr/lib64/libz.so" -Xlinker -rpath -Xlinker ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: -L"/opt/ibm/spectrum_mpi/lib" -lcuda -lmpiprofilesupport -lmpi_ibm "-L/usr/local/cuda-10.2/bin/../targets/ppc64le-linux/lib/stubs" "-L/usr/local/cuda-10.2/bin/../targets/ppc64le-linux/lib" -lcudadevrt -lcudart_static -lrt -lpthread -ldl -Wl,--end-group -o "osh_part"
```
I think we're transitioning to using CMake's native support for CUDA so you can do the following:

```
-DCMAKE_CUDA_FLAGS="-arch=sm_70"
```
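A quick way to confirm the flag took effect (a sketch; on CMake 3.18+ the native `CMAKE_CUDA_ARCHITECTURES=70` variable would be the equivalent, though that is newer than the 3.15.4 in use here):

```
# reconfigure with the arch flag, then check it reaches the nvcc compile line
cmake . -DCMAKE_CUDA_FLAGS="-arch=sm_70"
make VERBOSE=1 2>&1 | grep -e '-arch=sm_70'
```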
Thank you.
Using the https://github.com/cwsmith/omega_h/tree/missingHeader branch, the following environment settings and `cmake` command got me through the build without any obvious issues on the RPI AiMOS (mini summit) system.
```
module use /gpfs/u/software/dcs-spack-install/v0133gccSpectrum/lmod/linux-rhel7-ppc64le/gcc/7.4.0-1/
module load spectrum-mpi/10.3-doq6u5y
module load gcc/7.4.0/1
module load \
  cmake/3.15.4-mnqjvz6 \
  cuda/10.2
export OMPI_CXX=g++
cmake ../omega_h/ \
  -DCMAKE_INSTALL_PREFIX=/gpfs/u/home/MPFS/MPFSsmth/barn-shared/cws/software/build-omegah-dcs-gcc74-cuda/install \
  -DBUILD_SHARED_LIBS=OFF \
  -DOmega_h_USE_CUDA=on \
  -DOmega_h_USE_MPI=on \
  -DCMAKE_CXX_COMPILER=/opt/ibm/spectrum_mpi/bin/mpicxx \
  -DCMAKE_CUDA_FLAGS=-arch=sm_70 \
  -DOmega_h_USE_Kokkos=ON \
  -DKokkos_PREFIX=../build-kokkos-dcs-gcc74-cuda/install/lib/CMake/ \
  -DBUILD_TESTING=ON
```
`ctest` reports two (of 21) tests failing. Should a new issue be created for this?
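The verbose reports for the two failures are below; to rerun just those tests with full output, something like this sketch should work:

```
# rerun only the two failing tests (name regex) and print their full output
ctest -R 'face_flux_test|warp_test_parallel' --output-on-failure
```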
```
13/21 Testing: run_face_flux_test
13/21 Test: run_face_flux_test
Command: "/opt/ibm/spectrum_mpi/bin/mpirun" "-np" "1" "./face_flux_test"
Directory: /gpfs/u/home/MPFS/MPFSsmth/barn-shared/cws/software/build-omegah-dcs-gcc74-cuda/src
"run_face_flux_test" start time: Mar 27 09:01 EDT
Output:
----------------------------------------------------------
:0: : block: [0,0,0], thread: [27,0,0] Assertion `View bounds error of view ` failed.
:0: : block: [0,0,0], thread: [28,0,0] Assertion `View bounds error of view ` failed.
<snip>
:0: : block: [0,0,0], thread: [95,0,0] Assertion `View bounds error of view ` failed.
terminate called after throwing an instance of 'thrust::system::system_error'
what(): for_each: failed to synchronize: cudaErrorAssert: device-side assert triggered
[dcsfen02:142270] *** Process received signal ***
[dcsfen02:142270] Signal: Aborted (6)
[dcsfen02:142270] Signal code: (-6)
[dcsfen02:142270] [ 0] [0x7fff91b804d8]
[dcsfen02:142270] [ 1] /usr/lib64/libc.so.6(abort+0x2b4)[0x7fff90292094]
[dcsfen02:142270] [ 2] /gpfs/u/software/ppc64le-rhel7/gcc/7.4.0/1/lib64/libstdc++.so.6(_ZN9__gnu_cxx27__verbose_terminate_handlerEv+0x1c4)[0x7fff90660644]
[dcsfen02:142270] [ 3] /gpfs/u/software/ppc64le-rhel7/gcc/7.4.0/1/lib64/libstdc++.so.6(+0xab364)[0x7fff9065b364]
[dcsfen02:142270] [ 4] /gpfs/u/software/ppc64le-rhel7/gcc/7.4.0/1/lib64/libstdc++.so.6(_ZSt9terminatev+0x20)[0x7fff9065b420]
[dcsfen02:142270] [ 5] /gpfs/u/software/ppc64le-rhel7/gcc/7.4.0/1/lib64/libstdc++.so.6(__cxa_throw+0x80)[0x7fff9065b8e0]
[dcsfen02:142270] [ 6] ./face_flux_test[0x10014e50]
[dcsfen02:142270] [ 7] ./face_flux_test[0x1001e0f8]
[dcsfen02:142270] [ 8] ./face_flux_test[0x1001c2c0]
[dcsfen02:142270] [ 9] ./face_flux_test[0x100198b8]
[dcsfen02:142270] [10] ./face_flux_test[0x1000f7f4]
[dcsfen02:142270] [11] /usr/lib64/libc.so.6(+0x25200)[0x7fff90275200]
[dcsfen02:142270] [12] /usr/lib64/libc.so.6(__libc_start_main+0xc4)[0x7fff902753f4]
[dcsfen02:142270] *** End of error message ***
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node dcsfen02 exited on signal 6 (Aborted).
--------------------------------------------------------------------------
<end of output>
Test time = 2.95 sec
----------------------------------------------------------
Test Failed.
"run_face_flux_test" end time: Mar 27 09:01 EDT
"run_face_flux_test" time elapsed: 00:00:02
----------------------------------------------------------
17/21 Testing: warp_test_parallel
17/21 Test: warp_test_parallel
Command: "/opt/ibm/spectrum_mpi/bin/mpirun" "-np" "2" "./warp_test"
Directory: /gpfs/u/home/MPFS/MPFSsmth/barn-shared/cws/software/build-omegah-dcs-gcc74-cuda/src
"warp_test_parallel" start time: Mar 27 09:01 EDT
Output:
----------------------------------------------------------
warp_to_limit completed in one step
before adapting:
6000 tets, quality [0.62,0.85], 6000 >0.30
7930 edges, length [0.67,1.55], 132 <0.71, 7458 in [0.71,1.41], 340 >1.41
quality histogram:
0.00-0.10: 0
0.10-0.20: 0
0.20-0.30: 0
<snip>
test took 54.2491 seconds
vertex tag "metric" values are different
max diff at vertex 1008, comp 0, values 6.327799822371828e+01 vs 6.327799822371829e+01
edge tag "length" values are different
max diff at edge 6073, comp 0, values 1.124971094950606e+00 vs 1.124971094950607e+00
tet tag "quality" values are different
max diff at region 3483, comp 0, values 7.559526299369257e-01 vs 7.559526299369259e-01
This run, stored at "gold_warp_bad.osh",
does not match the gold at "gold_warp.osh"
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
Process name: [[40972,1],0]
Exit code: 2
--------------------------------------------------------------------------
<end of output>
Test time = 56.95 sec
----------------------------------------------------------
Test Failed.
"warp_test_parallel" end time: Mar 27 09:02 EDT
"warp_test_parallel" time elapsed: 00:00:56
----------------------------------------------------------
```
- I thought I disabled the face flux test... there is definitely something wrong with it and its motivation has died down.
- The warp test is just saying that its answers are off by a fraction of machine epsilon from what they were the first time the test was run. I would delete this `gold_warp.osh` thing and try to run the tests twice after that, as sketched below. If that doesn't work, there is some tiny non-determinism or serial-parallel inconsistency to figure out that needs a separate issue filed.
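Concretely, something like this from the build directory (a sketch; the assumption is that the first run regenerates the gold file and the second compares against it):

```
rm -rf src/gold_warp.osh   # drop the stored gold results
ctest -R warp_test         # first run: presumably regenerates gold_warp.osh
ctest -R warp_test         # second run: compares against the new gold
```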
Thank you.
- OK. `rm -rf src/gold_warp.osh; ctest; ctest` still resulted in a failure for `warp_test_parallel`. I'll create a new issue for this.