kblomdahl/dream-go

Poor GPU utilization observed during play

kblomdahl opened this issue · 2 comments

With modern drivers we are observing very poor GPU utilization during self-play and normal play. nvidia-smi shows about 40% utilization when running on dual GPUs; on a single GPU it reaches about 60% utilization (since it does not have to wait for the slower of the two):

Thu Apr 30 19:20:20 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 440.82       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce RTX 2070    Off  | 00000000:65:00.0  On |                  N/A |
|  0%   34C    P2    63W / 185W |   1484MiB /  7979MiB |     39%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:B3:00.0 Off |                  N/A |
|  0%   34C    P2    97W / 260W |    640MiB / 11019MiB |     37%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0      1253      G   /usr/lib/xorg/Xorg                           416MiB |
|    0      2058      G   /usr/bin/kwin_x11                            112MiB |
|    0      2062      G   /usr/bin/krunner                              20MiB |
|    0      2064      G   /usr/bin/plasmashell                          99MiB |
|    0      2892      G   ...AAAAAAAAAAAACAAAAAAAAAA= --shared-files   114MiB |
|    0      3478      G   ...quest-channel-token=3743066830833839882    40MiB |
|    0      4017      C   ./target/release/dream_go                    556MiB |
|    0     18141      G   ...quest-channel-token=6788427025990345763   112MiB |
|    1      4017      C   ./target/release/dream_go                    628MiB |
+-----------------------------------------------------------------------------+
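
For reference, the numbers above are a single snapshot. A small helper along these lines (illustrative only, it is not part of dream_go and simply shells out to nvidia-smi once per second) can be used to average the utilization over a whole game and confirm the snapshot is not just a momentary dip:

```rust
// Illustrative helper, not part of dream_go: polls `nvidia-smi` once per
// second and collects the reported utilization of every GPU, so the figures
// above can be averaged over a whole game rather than a single snapshot.
use std::process::Command;
use std::{thread, time::Duration};

fn sample_gpu_utilization(seconds: usize) -> Vec<f64> {
    let mut samples = Vec::new();

    for _ in 0..seconds {
        // `--query-gpu=utilization.gpu` prints one integer percentage per GPU.
        let output = Command::new("nvidia-smi")
            .args(&["--query-gpu=utilization.gpu", "--format=csv,noheader,nounits"])
            .output()
            .expect("failed to run nvidia-smi");

        for line in String::from_utf8_lossy(&output.stdout).lines() {
            if let Ok(util) = line.trim().parse::<f64>() {
                samples.push(util);
            }
        }

        thread::sleep(Duration::from_secs(1));
    }

    samples
}
```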

Output from nvprof for a single game of self-play.

==30905== NVPROF is profiling process 30905, command: ./target/release/dream_go --self-play 1
==30905== Profiling application: ./target/release/dream_go --self-play 1
==30905== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   43.84%  14.9625s    535192  27.957us  15.392us  117.02us  turing_h1688cudnn_256x64_sliced1x2_ldg8_relu_exp_small_nhwc_tn_v1
                   25.56%  8.72299s    258343  33.765us  15.168us  201.54us  turing_h1688cudnn_128x128_ldg8_relu_exp_small_nhwc_tn_v1
                   21.09%  7.20001s     83530  86.196us  52.576us  306.17us  void conv2d_grouped_direct_kernel<__half, __half, __half, __half, float, bool=1, bool=1, int=1, int=0, int=0>(cudnnTensorStruct, __half const *, cudnnFilterStruct, __half const *, cudnnConvolutionStruct, cudnnTensorStruct, __half*, float, float, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, int, __half const *, float const *, cudnnActivationStruct)
                    2.69%  916.78ms    793535  1.1550us     832ns  14.976us  cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
                    1.75%  596.58ms     41987  14.208us     608ns  225.31us  [CUDA memcpy HtoD]
                    0.61%  207.69ms     10478  19.821us  13.952us  40.479us  volta_fp16_sgemm_fp16_32x32_sliced1x4_nn
                    0.58%  196.49ms     41765  4.7040us  3.3920us  11.648us  void cudnn::detail::softmax_fw_kernel<int=2, __half, float, int=256, int=1, int=0, int=1>(cudnnTensorStruct, __half const *, cudnn::detail::softmax_fw_kernel<int=2, __half, float, int=256, int=1, int=0, int=1>, cudnnTensorStruct*, int, float, cudnnTensorStruct*, int, int)
                    0.55%  189.21ms     11226  16.854us  7.9040us  117.25us  void gemmSN_NN_kernel<float, int=256, int=4, int=2, int=8, int=4, int=4, cublasGemvTensorStridedBatched<__half const >, cublasGemvTensorStridedBatched<__half>>(cublasGemmSmallNParams<__half const , cublasGemvTensorStridedBatched<__half const >, float>)
                    0.54%  183.81ms     42020  4.3740us  2.2080us  73.024us  void gemv2N_kernel<int, int, __half, float, float, int=128, int=4, int=4, int=4, int=1, cublasGemvParams<cublasGemvTensorStridedBatched<__half const >, cublasGemvTensorStridedBatched<float>, float>>(__half const )
                    0.40%  136.30ms     83530  1.6310us  1.1520us  34.400us  void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, cudnnDimOrder_t=1, int=0>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, dimArray, reducedDivisorArray, bool)
                    0.30%  103.13ms      6859  15.035us  7.5200us  39.296us  void gemmSN_NN_kernel<float, int=256, int=4, int=2, int=8, int=2, int=4, cublasGemvTensorStridedBatched<__half const >, cublasGemvTensorStridedBatched<__half>>(cublasGemmSmallNParams<__half const , cublasGemvTensorStridedBatched<__half const >, float>)
                    0.28%  95.007ms      5898  16.108us  9.0560us  67.904us  void gemmSN_NN_kernel<float, int=256, int=4, int=2, int=8, int=3, int=4, cublasGemvTensorStridedBatched<__half const >, cublasGemvTensorStridedBatched<__half>>(cublasGemmSmallNParams<__half const , cublasGemvTensorStridedBatched<__half const >, float>)
                    0.24%  81.598ms     83530     976ns     384ns  38.944us  [CUDA memcpy DtoH]
                    0.23%  79.633ms      4091  19.465us  12.000us  89.344us  void gemmSN_NN_kernel<float, int=256, int=4, int=2, int=8, int=5, int=4, cublasGemvTensorStridedBatched<__half const >, cublasGemvTensorStridedBatched<__half>>(cublasGemmSmallNParams<__half const , cublasGemvTensorStridedBatched<__half const >, float>)
                    0.23%  76.823ms     42020  1.8280us  1.0560us  8.9280us  void splitKreduce_kernel<float, __half, float>(cublasSplitKParams<float>, float const *, __half const *, __half*, float const *, float const *)
                    0.21%  71.655ms     41765  1.7150us  1.1200us  31.200us  void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=0>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, dimArray, reducedDivisorArray, bool)
                    0.19%  65.043ms     21010  3.0950us  1.4720us  68.192us  void gemv2N_kernel<int, int, __half, __half, float, int=128, int=32, int=4, int=4, int=1, cublasGemvParams<cublasGemvTensorStridedBatched<__half const >, cublasGemvTensorStridedBatched<__half>, float>>(__half const )
                    0.18%  61.758ms     41765  1.4780us     992ns  15.264us  void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, cudnnGenericOp_t=7, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=1>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, dimArray, reducedDivisorArray, bool)
                    0.18%  60.326ms     41765  1.4440us     992ns  29.984us  void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, cudnnGenericOp_t=8, cudnnNanPropagation_t=0, cudnnDimOrder_t=1, int=1>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, dimArray, reducedDivisorArray, bool)
                    0.10%  34.804ms     13753  2.5300us  1.4080us  38.368us  void gemv2T_kernel_val<int, int, __half, __half, float, int=128, int=16, int=2, int=4, bool=0, cublasGemvParams<cublasGemvTensorStridedBatched<__half const >, cublasGemvTensorStridedBatched<__half>, float>>(__half const , float, float)
                    0.10%  33.967ms      1390  24.436us  15.007us  151.20us  void gemmSN_NN_kernel<float, int=256, int=4, int=2, int=8, int=7, int=4, cublasGemvTensorStridedBatched<__half const >, cublasGemvTensorStridedBatched<__half>>(cublasGemmSmallNParams<__half const , cublasGemvTensorStridedBatched<__half const >, float>)
                    0.08%  28.820ms      1568  18.380us  9.7920us  67.200us  void gemmSN_NN_kernel<float, int=256, int=4, int=2, int=8, int=6, int=4, cublasGemvTensorStridedBatched<__half const >, cublasGemvTensorStridedBatched<__half>>(cublasGemmSmallNParams<__half const , cublasGemvTensorStridedBatched<__half const >, float>)
                    0.03%  10.617ms      3549  2.9910us  1.5360us  8.6070us  void gemv2T_kernel_val<int, int, __half, __half, float, int=128, int=16, int=4, int=4, bool=0, cublasGemvParams<cublasGemvTensorStridedBatched<__half const >, cublasGemvTensorStridedBatched<__half>, float>>(__half const , float, float)
                    0.03%  9.6898ms      3453  2.8060us  1.5680us  67.232us  void gemv2T_kernel_val<int, int, __half, __half, float, int=128, int=16, int=2, int=2, bool=0, cublasGemvParams<cublasGemvTensorStridedBatched<__half const >, cublasGemvTensorStridedBatched<__half>, float>>(__half const , float, float)
                    0.02%  8.2278ms     10930     752ns     576ns  10.464us  [CUDA memset]
                    0.00%  29.600us        18  1.6440us  1.1520us  4.2880us  void scalePackedTensor_kernel<__half, float>(cudnnTensor4dStruct, __half*, float)
      API calls:   64.33%  47.8098s   2088523  22.891us  3.8250us  10.163ms  cudaLaunchKernel
                    8.39%  6.23677s    637208  9.7870us     445ns  7.1923ms  cudaEventRecord
                    7.53%  5.59741s    125391  44.639us  2.7200us  49.115ms  cudaMemcpyAsync
                    5.07%  3.76672s    208825  18.037us  1.9040us  4.1048ms  cudaFuncGetAttributes
                    4.26%  3.16411s       693  4.5658ms  1.2110us  1.59355s  cudaStreamCreateWithFlags
                    2.53%  1.88114s    334120  5.6300us     367ns  3.8560ms  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
                    1.90%  1.41458s   2882058     490ns      88ns  1.2191ms  cudaGetLastError
                    1.57%  1.16758s    793724  1.4710us     240ns  1.7647ms  cudaGetDevice
                    1.14%  847.80ms    839336  1.0100us     220ns  1.7189ms  cudaDeviceGetAttribute
                    0.99%  732.23ms      1323  553.46us     261ns  213.56ms  cudaFree
                    0.58%  431.43ms     83530  5.1650us     992ns  713.91us  cudaStreamSynchronize
                    0.42%  309.91ms     94460  3.2800us     180ns  1.0465ms  cudaMemsetAsync
                    0.41%  304.99ms     53130  5.7400us     828ns  831.98us  cudaEventQuery
                    0.33%  241.95ms     83530  2.8960us     463ns  698.18us  cudaStreamWaitEvent
                    0.25%  186.20ms        65  2.8647ms  3.6480us  184.12ms  cudaMallocHost
                    0.19%  142.49ms     84456  1.6870us     258ns  545.61us  cudaMalloc
                    0.06%  42.287ms     21357  1.9800us     280ns  865.75us  cudaFuncSetAttribute
                    0.01%  8.3086ms      2835  2.9300us     339ns  174.73us  cudaEventCreateWithFlags
                    0.01%  8.2401ms       252  32.698us  1.1750us  841.57us  cudaStreamCreateWithPriority
                    0.01%  7.3936ms        63  117.36us  26.856us  613.39us  cudaHostAlloc
                    0.01%  4.1561ms       126  32.985us  11.277us  125.13us  cudaMemcpy
                    0.01%  4.0971ms       945  4.3350us  2.0160us  31.280us  cudaStreamDestroy
                    0.01%  3.9994ms       128  31.245us  5.7650us  393.20us  cudaFreeHost
                    0.00%  2.3575ms         8  294.69us  138.77us  638.69us  cuDeviceTotalMem
                    0.00%  2.2734ms      2835     801ns     390ns  16.045us  cudaEventDestroy
                    0.00%  1.9259ms       758  2.5400us     102ns  278.28us  cuDeviceGetAttribute
                    0.00%  999.59us        64  15.618us     183ns  954.07us  cudaGetDeviceCount
                    0.00%  948.39us       378  2.5080us  1.3240us  12.137us  cudaDeviceSynchronize
                    0.00%  198.14us         8  24.767us  12.046us  53.910us  cuDeviceGetName
                    0.00%  165.92us        63  2.6330us     783ns  4.7200us  cudaHostGetDevicePointer
                    0.00%  131.22us        63  2.0820us     703ns  3.9480us  cudaDeviceGetStreamPriorityRange
                    0.00%  43.497us         4  10.874us  7.5000us  12.475us  cudaSetDevice
                    0.00%  8.1900us         2  4.0950us  3.3720us  4.8180us  cuDeviceGetPCIBusId
                    0.00%  5.0560us         3  1.6850us  1.1680us  2.6610us  cuInit
                    0.00%  3.1610us        10     316ns     147ns     742ns  cuDeviceGet
                    0.00%  2.3110us         8     288ns     134ns     606ns  cuDeviceGetUuid
                    0.00%  2.2910us         6     381ns     150ns  1.0180us  cuDeviceGetCount
                    0.00%  1.8660us         3     622ns     403ns  1.0260us  cuDriverGetVersion
                    0.00%     277ns         2     138ns      85ns     192ns  cudaRuntimeGetVersion
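
Reading the profile: the individual kernels only run for tens of microseconds each, and the host spends 47.8s in roughly 2.1 million cudaLaunchKernel calls (64% of API time), which is more than the ~34s the GPU spends executing the kernels themselves. That looks like launch overhead dominating because each forward pass covers too few positions. As a rough sketch of the direction this points in, one could drain the request channel into a larger batch before every forward pass; the names below (`PredictRequest`, `evaluate_batch`, `max_batch_size`) are hypothetical and not the actual dream_go API:

```rust
// Illustrative sketch only: `PredictRequest`, `evaluate_batch` and the batch
// size are hypothetical, not the actual dream_go API. The point is to drain
// the request channel into one larger batch so that every kernel launch
// covers more positions.
use crossbeam_channel::{Receiver, RecvTimeoutError, Sender};
use std::time::Duration;

struct PredictRequest {
    features: Vec<f32>,
    reply: Sender<Vec<f32>>,
}

// Stand-in for the real forward pass through cuDNN.
fn evaluate_batch(batch: &[PredictRequest]) -> Vec<Vec<f32>> {
    batch.iter().map(|req| req.features.clone()).collect()
}

fn predict_worker(rx: Receiver<PredictRequest>, max_batch_size: usize) {
    loop {
        // Block (briefly) for the first request of the next batch.
        let mut batch = match rx.recv_timeout(Duration::from_millis(1)) {
            Ok(req) => vec![req],
            Err(RecvTimeoutError::Timeout) => continue,
            Err(RecvTimeoutError::Disconnected) => return,
        };

        // Then opportunistically drain whatever else is already queued,
        // up to `max_batch_size` positions.
        while batch.len() < max_batch_size {
            match rx.try_recv() {
                Ok(req) => batch.push(req),
                Err(_) => break,
            }
        }

        // One forward pass for the whole batch instead of one per position.
        let outputs = evaluate_batch(&batch);

        for (req, output) in batch.into_iter().zip(outputs) {
            let _ = req.reply.send(output);
        }
    }
}
```

How large the batches can realistically get depends on how many search threads have outstanding requests at any one time, so this is only a direction to investigate.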

Output from perf for a single game of self-play. Most of the time is spent polling for a response from the channels and updating the board state:

  12,06%  predict_worker  dream_go                   [.] crossbeam_channel::flav
   9,35%  predict_worker  dream_go                   [.] dg_go::board_fast::Boar
   8,80%  predict_worker  dream_go                   [.] crossbeam_channel::cont
   3,86%  predict_worker  dream_go                   [.] dg_mcts::tree::PUCT::ge
   3,20%  predict_worker  [kernel]                   [k] 0xffffffffbc203c23
   2,93%  predict_worker  libc-2.27.so               [.] __memmove_avx_unaligned
   2,40%  predict_worker  dream_go                   [.] dg_mcts::tree::PUCT::ge
   2,27%  predict_worker  dream_go                   [.] <dg_go::board::Board as
   1,85%  predict_worker  dream_go                   [.] dg_mcts::asm::argmax::_
   1,55%  predict_worker  dream_go                   [.] dg_go::board_fast::Boar
   1,47%  predict_worker  dream_go                   [.] dg_mcts::tree::FPU::app
   1,15%  predict_worker  dream_go                   [.] dg_mcts::tree::FPU::app
   0,98%  predict_worker  libc-2.27.so               [.] _int_malloc
   0,94%  predict_worker  dream_go                   [.] dg_mcts::predict_worker
   0,90%  service_worker  libpthread-2.27.so         [.] __pthread_mutex_lock

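The crossbeam_channel entries at the top presumably come from the request/response round trip between the search threads and the GPU worker, roughly the pattern sketched below (names are illustrative, not the actual dream_go code): every expanded leaf sends one position and then blocks until its evaluation comes back, so the search threads spend much of their time waiting on recv() rather than doing useful work.

```rust
// Illustrative only: a rough sketch of the per-leaf round trip the profile
// suggests. `Request` and the payload types are hypothetical, not the actual
// dream_go code.
use crossbeam_channel::{bounded, Sender};

struct Request {
    features: Vec<f32>,
    reply: Sender<(f32, Vec<f32>)>, // (value, policy)
}

fn evaluate_leaf(to_gpu: &Sender<Request>, features: Vec<f32>) -> (f32, Vec<f32>) {
    let (reply_tx, reply_rx) = bounded(1);

    to_gpu
        .send(Request { features, reply: reply_tx })
        .expect("GPU worker disconnected");

    // This is where the search thread sits "polling for a response": every
    // expanded node pays one send/recv round trip before the tree statistics
    // can be updated.
    reply_rx.recv().expect("GPU worker disconnected")
}
```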