CUDA Problem: Invalid Device function
When building Caffe for Windows with CUDA, I ran into this problem:
F0907 15:41:09.264920 202420 im2col.cu:61] Check failed: error == cudaSuccess (8 vs. 0) invalid device function
This sample project helped me find the solution; I hope it helps you too.
My PC environment
- Windows 11
- Visual Studio 2013
- CMake 3.18.0
- Ninja 1.11.1
- CUDA 7.5
- NVIDIA GeForce MX250
Normal build and exec
CMake has supported CUDA C++ since 3.9, so I made this sample project. To my surprise, it runs with no errors. So what is the difference between this sample project and Caffe for Windows?
build.bat
build\main.exe
Last error: 0
Last error: 0
0 means the program executed successfully.
Error build and exec
ninja -t commands
prints all the commands used to build main.exe. We can see that the GPU code option here is `--generate-code=arch=compute_20,code=[compute_20,sm_20]`, whereas the Caffe for Windows build uses `--generate-code=arch=compute_20,code=sm_20`. Is this the cause of the problem?
C:\Users\hw\Desktop\src\InvalidDeviceFunction\build>ninja -t commands main.exe
C:\PROGRA~2\MICROS~3.0\VC\bin\amd64\cl.exe /nologo /TP /DWIN32 /D_WINDOWS /W3 /GR /EHsc /MDd /Zi /Ob0 /Od /RTC1 /showIncludes /FoCMakeFiles\main.dir\main.cc.obj /FdCMakeFiles\main.dir\ /FS -c ..\main.cc
cmd.exe /C "C:\PROGRA~1\NVIDIA~2\CUDA\v7.5\bin\nvcc.exe -D_WINDOWS -Xcompiler="/W3 /GR /EHsc" -Xcompiler="-MDd -Zi -Ob0 -Od /RTC1" --generate-code=arch=compute_20,code=[compute_20,sm_20] -x cu -c ..\im2col.cu -o CMakeFiles\main.dir\im2col.cu.obj -Xcompiler=-FdCMakeFiles\main.dir\,-FS && C:\PROGRA~1\NVIDIA~2\CUDA\v7.5\bin\nvcc.exe -D_WINDOWS -Xcompiler="/W3 /GR /EHsc" -Xcompiler="-MDd -Zi -Ob0 -Od /RTC1" --generate-code=arch=compute_20,code=[compute_20,sm_20] -x cu -M ..\im2col.cu -MT CMakeFiles\main.dir\im2col.cu.obj -o CMakeFiles\main.dir\im2col.cu.obj.d"
cmd.exe /C "cd . && "C:\Program Files\CMake\bin\cmake.exe" -E vs_link_exe --intdir=CMakeFiles\main.dir --rc=C:\PROGRA~2\WI3CF2~1\8.1\bin\x64\rc.exe --mt=C:\PROGRA~2\WI3CF2~1\8.1\bin\x64\mt.exe --manifests -- C:\PROGRA~2\MICROS~3.0\VC\bin\amd64\link.exe /nologo CMakeFiles\main.dir\main.cc.obj CMakeFiles\main.dir\im2col.cu.obj /out:main.exe /implib:main.lib /pdb:main.pdb /version:0.0 /machine:x64 /debug /INCREMENTAL /subsystem:console -LIBPATH:C:\PROGRA~1\NVIDIA~2\CUDA\v7.5\lib\x64 cudadevrt.lib cudart_static.lib kernel32.lib user32.lib gdi32.lib winspool.lib shell32.lib ole32.lib oleaut32.lib uuid.lib comdlg32.lib advapi32.lib && cd ."
So I changed `--generate-code=arch=compute_20,code=[compute_20,sm_20]` to `--generate-code=arch=compute_20,code=sm_20` and used the following commands to build again.
C:\PROGRA~2\MICROS~3.0\VC\bin\amd64\cl.exe /nologo /TP /DWIN32 /D_WINDOWS /W3 /GR /EHsc /MDd /Zi /Ob0 /Od /RTC1 /showIncludes /FoCMakeFiles\main.dir\main.cc.obj /FdCMakeFiles\main.dir\ /FS -c ..\main.cc
cmd.exe /C "C:\PROGRA~1\NVIDIA~2\CUDA\v7.5\bin\nvcc.exe -D_WINDOWS -Xcompiler="/W3 /GR /EHsc" -Xcompiler="-MDd -Zi -Ob0 -Od /RTC1" --generate-code=arch=compute_20,code=sm_20 -x cu -c ..\im2col.cu -o CMakeFiles\main.dir\im2col.cu.obj -Xcompiler=-FdCMakeFiles\main.dir\,-FS && C:\PROGRA~1\NVIDIA~2\CUDA\v7.5\bin\nvcc.exe -D_WINDOWS -Xcompiler="/W3 /GR /EHsc" -Xcompiler="-MDd -Zi -Ob0 -Od /RTC1" --generate-code=arch=compute_20,code=sm_20 -x cu -M ..\im2col.cu -MT CMakeFiles\main.dir\im2col.cu.obj -o CMakeFiles\main.dir\im2col.cu.obj.d"
cmd.exe /C "cd . && "C:\Program Files\CMake\bin\cmake.exe" -E vs_link_exe --intdir=CMakeFiles\main.dir --rc=C:\PROGRA~2\WI3CF2~1\8.1\bin\x64\rc.exe --mt=C:\PROGRA~2\WI3CF2~1\8.1\bin\x64\mt.exe --manifests -- C:\PROGRA~2\MICROS~3.0\VC\bin\amd64\link.exe /nologo CMakeFiles\main.dir\main.cc.obj CMakeFiles\main.dir\im2col.cu.obj /out:main.exe /implib:main.lib /pdb:main.pdb /version:0.0 /machine:x64 /debug /INCREMENTAL /subsystem:console -LIBPATH:C:\PROGRA~1\NVIDIA~2\CUDA\v7.5\lib\x64 cudadevrt.lib cudart_static.lib kernel32.lib user32.lib gdi32.lib winspool.lib shell32.lib ole32.lib oleaut32.lib uuid.lib comdlg32.lib advapi32.lib && cd ."
And the result is:
C:\Users\hw\Desktop\src\InvalidDeviceFunction\build>main.exe
Last error: 8
Last error: 8
In CUDA 7.5, error code 8 means `invalid device function`.
Conclusion
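So one solution to the `invalid device function` error is to change `--generate-code=arch=compute_20,code=sm_20` to `--generate-code=arch=compute_20,code=[compute_20,sm_20]`. With `code=sm_20`, only sm_20 machine code is embedded, which a newer GPU such as the GeForce MX250 cannot run; adding `compute_20` also embeds PTX, which the driver can JIT-compile for the installed GPU. For the Caffe for Windows project, modify .\cmake\Cuda.cmake as follows: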
# Emit one -gencode entry for every requested binary architecture.
# Each entry uses the form code=[compute_XX,sm_YY], so the PTX for compute_XX
# is embedded next to the sm_YY machine code. The earlier form, code=sm_YY,
# embedded the machine code only.
foreach(__arch ${__cuda_arch_bin})
  if(NOT __arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
    # Plain "BIN" entry: no PTX version was given, so PTX is assumed equal to BIN.
    list(APPEND __nvcc_flags
         -gencode "arch=compute_${__arch},code=[compute_${__arch},sm_${__arch}]")
    list(APPEND __nvcc_archs_readable "sm_${__arch}")
  else()
    # "BIN(PTX)" entry: regex group 1 is the binary arch, group 2 the PTX arch.
    list(APPEND __nvcc_flags
         -gencode "arch=compute_${CMAKE_MATCH_2},code=[compute_${CMAKE_MATCH_2},sm_${CMAKE_MATCH_1}]")
    list(APPEND __nvcc_archs_readable "sm_${CMAKE_MATCH_1}")
  endif()
endforeach()
# For every architecture requested as PTX, add a -gencode entry whose code
# target is the virtual architecture itself (code=compute_XX), and record it
# in the human-readable architecture list.
foreach(__arch ${__cuda_arch_ptx})
  list(APPEND __nvcc_archs_readable "compute_${__arch}")
  list(APPEND __nvcc_flags
       -gencode "arch=compute_${__arch},code=compute_${__arch}")
endforeach()