sonots/cumo

Slow because of cudaMemcpyAsync at ndloop.c

sonots opened this issue

$ bundle exec nvprof ruby examples/mnist.rb
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.04%  4.42518s   3094192  1.4300us  1.1520us  11.616us  [CUDA memcpy DtoD]
                     0.39%  17.498ms      2010  8.7050us  7.9040us  19.904us  void dfloat_sum_kernel<thrust::permutation_iterator<thrust::detail::normal_iterator<thrust::device_ptr<double>>, thrust::transform_iterator<thrust_strided_range<thrust::detail::normal_iterator<thrust::device_ptr<double>>>::stride_functor, thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>>>(double, double, double*)
                    0.32%  14.102ms         8  1.7627ms  1.1336ms  2.2644ms  maxwell_dgemm_64x64_nn
                    0.09%  3.8597ms       128  30.153us  1.5360us  2.8224ms  iter_dfloat_mul_stride_kernel
                    0.04%  1.9669ms         2  983.47us  282.53us  1.6844ms  iter_dfloat_store_uint8_stride_stride_kernel
                    0.02%  1.0490ms       412  2.5460us  1.5680us  92.736us  iter_dfloat_add_contiguous_kernel
                    0.01%  511.77us       224  2.2840us  1.2480us  43.968us  iter_dfloat_store_dfloat_stride_stride_kernel
                    0.01%  506.27us        18  28.126us  1.5680us  90.463us  iter_dfloat_sub_contiguous_kernel
                    0.01%  470.08us       100  4.7000us  4.6400us  5.5360us  void dfloat_max_index_int32_kernel<thrust::device_ptr<double>>(double, double, char*, long, char*)
                    0.01%  372.32us       200  1.8610us  1.8240us  2.8800us  iter_dfloat_sub_stride_kernel
                    0.01%  314.85us       110  2.8620us  1.7600us  63.136us  iter_dfloat_add_stride_kernel
                    0.01%  313.18us       100  3.1310us  3.1030us  3.6480us  iter_dfloat_math_s_log_stride_stride_kernel
                    0.01%  291.87us       101  2.8890us  2.7520us  12.896us  void dfloat_sum_kernel<thrust::device_ptr<double>>(double, double, double*)
                    0.01%  289.02us         6  48.170us  2.6240us  151.14us  iter_dfloat_div_contiguous_kernel
                    0.01%  282.53us       100  2.8250us  2.7840us  3.4240us  void dfloat_max_kernel<thrust::device_ptr<double>>(double, double, double*)
                    0.01%  233.57us        12  19.463us  2.4960us  106.27us  iter_dfloat_math_s_sqrt_stride_stride_kernel
                    0.00%  185.92us       101  1.8400us  1.8240us  2.0800us  iter_dfloat_store_int32_stride_stride_kernel
                    0.00%  150.46us       100  1.5040us  1.4720us  2.1440us  iter_int64_store_int32_stride_stride_kernel
                    0.00%  132.38us         8  16.548us  1.8560us  62.656us  iter_dfloat_mul_contiguous_kernel
                    0.00%  121.15us        13  9.3190us  1.2160us  30.336us  iter_dfloat_fill_stride_kernel
                    0.00%  69.183us         3  23.061us  22.111us  24.480us  iter_dfloat_le_stride_kernel
                    0.00%  50.400us         2  25.200us  24.000us  26.400us  iter_dfloat_gt_stride_kernel
                     0.00%  35.808us         1  35.808us  35.808us  35.808us  void magma_lds128_dgemm_kernel<bool=0, bool=0, int=5, int=5, int=3, int=3, int=3>(int, int, int, double const *, int, double const *, int, double*, int, int, int, double const *, double const *, double, double, int)
                     0.00%  27.264us         1  27.264us  27.264us  27.264us  void gemmSN_NN_kernel<double, double, double, int=128, int=2, int=4, int=8, int=5, int=4>(cublasGemmSmallNParams<double, double, double>, double const *, double const *, double, double, int)
                    0.00%  19.008us         9  2.1120us  1.7600us  2.3680us  iter_int64_store_int64_stride_stride_kernel
                    0.00%  16.704us         3  5.5680us  5.1840us  5.8560us  iter_dfloat_store_dfloat_index_stride_kernel
                    0.00%  15.200us         6  2.5330us  2.4640us  2.5920us  [CUDA memset]
                    0.00%  14.528us         1  14.528us  14.528us  14.528us  void dfloat_mean_kernel<thrust::device_ptr<double>>(double, double, double*, unsigned long)
                    0.00%  13.792us        10  1.3790us  1.2160us  1.5040us  [CUDA memcpy HtoD]
                    0.00%  13.088us         3  4.3620us  2.3680us  5.3760us  iter_dfloat_store_bit_stride_stride_kernel
                    0.00%  8.0960us         3  2.6980us  2.4000us  3.2640us  iter_dfloat_div_stride_kernel
                    0.00%  7.3920us         4  1.8480us  1.6320us  2.2400us  iter_int32_store_int32_stride_stride_kernel
                    0.00%  6.4000us         2  3.2000us  2.3680us  4.0320us  iter_dfloat_math_s_exp_stride_stride_kernel
                    0.00%  6.4000us         2  3.2000us  2.1760us  4.2240us  iter_int32_store_uint8_stride_stride_kernel
                    0.00%  5.8240us         3  1.9410us  1.6000us  2.3360us  iter_int32_ne_stride_kernel
                    0.00%  5.2480us         3  1.7490us  1.5360us  2.0480us  iter_int32_fill_stride_kernel
                    0.00%  4.8960us         2  2.4480us  2.1120us  2.7840us  iter_int32_lt_stride_kernel
                    0.00%  4.4480us         2  2.2240us  2.0480us  2.4000us  iter_dfloat_eq_stride_kernel
                    0.00%  4.0640us         2  2.0320us  1.8880us  2.1760us  iter_int32_seq_stride_kernel
                    0.00%  2.0800us         1  2.0800us  2.0800us  2.0800us  iter_int32_store_int32_index_stride_kernel
      API calls:   78.19%  24.0410s   3094192  7.7690us  6.4570us  5.9914ms  cudaMemcpyAsync
                   14.11%  4.33879s   6188509     701ns     469ns  2.1052ms  cudaPointerGetAttributes
                    3.34%  1.02553s   6188519     165ns     130ns  478.59us  cudaGetLastError
                    1.51%  463.43ms       364  1.2732ms  1.2710us  405.14ms  cudaFree
                    1.19%  366.86ms       612  599.44us  12.044us  240.82ms  cudaMallocManaged
                    1.18%  363.35ms         1  363.35ms  363.35ms  363.35ms  cuCtxCreate
                    0.36%  111.85ms      3804  29.404us     509ns  15.718ms  cudaLaunch
                    0.05%  16.697ms      1760  9.4860us  1.5250us  2.9453ms  cudaDeviceSynchronize
                    0.02%  7.0239ms       740  9.4910us     123ns  688.73us  cuDeviceGetAttribute
                    0.02%  5.5111ms        30  183.70us  10.692us  459.61us  cudaMalloc
                    0.01%  2.2434ms     15224     147ns      97ns  146.51us  cudaSetupArgument
                    0.01%  2.0317ms         8  253.97us  186.36us  317.24us  cuDeviceTotalMem
                    0.00%  1.3773ms      3804     362ns     164ns  459.81us  cudaConfigureCall
                    0.00%  561.82us         8  70.227us  53.471us  102.74us  cuDeviceGetName
                    0.00%  237.91us        10  23.790us  15.603us  33.382us  cudaMemcpy
                    0.00%  139.14us       160     869ns     408ns  4.0640us  cudaEventCreateWithFlags
                    0.00%  138.29us        20  6.9140us  4.3650us  12.981us  cudaThreadSynchronize
                    0.00%  96.125us       160     600ns     372ns  7.1420us  cudaEventDestroy
                    0.00%  81.367us         6  13.561us  10.848us  17.787us  cudaMemsetAsync
                    0.00%  57.892us       110     526ns     248ns  3.0910us  cudaDeviceGetAttribute
                    0.00%  30.417us        10  3.0410us  1.0000us  9.5560us  cudaGetDevice
                    0.00%  25.392us         6  4.2320us  3.5180us  5.3750us  cudaEventQuery
                    0.00%  21.408us         6  3.5680us  2.5130us  4.3610us  cudaEventRecord
                    0.00%  13.084us         2  6.5420us  3.1640us  9.9200us  cudaBindTexture
                    0.00%  7.1060us        13     546ns     199ns  1.2110us  cuDeviceGet
                    0.00%  3.2800us         4     820ns     285ns  2.0430us  cuDeviceGetCount
                    0.00%  2.7760us         2  1.3880us     758ns  2.0180us  cuDriverGetVersion
                    0.00%  2.5980us         2  1.2990us  1.1120us  1.4860us  cuInit
                    0.00%  2.4180us         2  1.2090us     628ns  1.7900us  cudaUnbindTexture
     API calls:   78.19%  24.0410s   3094192  7.7690us  6.4570us  5.9914ms  cudaMemcpyAsync
                   14.11%  4.33879s   6188509     701ns     469ns  2.1052ms  cudaPointerGetAttributes
                    3.34%  1.02553s   6188519     165ns     130ns  478.59us  cudaGetLastError

are very slow (surprisingly, even cudaGetLastError is slow).

They all come from the buffering mechanism of the Numo framework in ndloop.c:

cumo_cuda_runtime_check_status(cudaMemcpyAsync(buf,src,elmsz,cudaMemcpyDeviceToDevice,0));

#1  0x00007ffff37eca2d in ndloop_copy_to_buffer (lp=0x555556aa86c0) at narray/ndloop.c:1138
#2  0x00007ffff37ed391 in loop_narray (nf=0x7fffffffad20, lp=0x7fffffffaa70) at narray/ndloop.c:1361
#3  0x00007ffff37ed28a in ndloop_run (vlp=140737488333424) at narray/ndloop.c:1325
#4  0x000055555557aea8 in rb_ensure ()
#5  0x00007ffff37ed964 in na_ndloop_main (nf=0x7fffffffad20, args=93824999531120, opt_ptr=0x7fffffffad50) at narray/ndloop.c:1437
#6  0x00007ffff37edcfb in na_ndloop3 (nf=0x7fffffffad20, ptr=0x7fffffffad50, argc=3) at narray/ndloop.c:1504
#7  0x00007ffff38aae26 in dfloat_gemm (argc=1, argv=0x7ffff7ed83b8, self=93824999518880) at narray/gen/tmpl/gemm.c:151

The copy happens during the gemm computation.
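For reference, here is a minimal sketch (not the actual ndloop.c code; the names and loop state are illustrative) of what the buffering path effectively does when an operand is non-contiguous, e.g. transposed. Each element gets its own tiny device-to-device cudaMemcpyAsync, which is what produces the ~3 million 1.4us DtoD copies in the profile above.

#include <cuda_runtime.h>
#include <stddef.h>

/* Illustrative sketch only: copy a strided device array into a contiguous
 * device buffer one element at a time. "n", "step", and "elmsz" stand in
 * for the loop state that ndloop carries. */
static void copy_to_contiguous_buffer_sketch(char *buf, const char *src,
                                             size_t n, size_t step, size_t elmsz)
{
    for (size_t i = 0; i < n; i++) {
        /* one cudaMemcpyAsync per element => millions of tiny DtoD copies */
        cudaMemcpyAsync(buf + i * elmsz, src + i * step, elmsz,
                        cudaMemcpyDeviceToDevice, 0);
    }
}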

To avoid these copies, we should stop going through ndloop for gemm and instead use cuBLAS strided batched GEMM (https://devblogs.nvidia.com/cublas-strided-batched-matrix-multiply/); a rough sketch is given after the reproducer below.

a = Cumo::SFloat.new(3, 4).seq(0)
b = Cumo::SFloat.new(3, 4).seq(0)
a.gemm(b.transpose)

This performs 12 memcpy calls. If the second array is not transposed, the copies do not happen.
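A rough sketch of the proposed direction, assuming we hand the strided operands directly to cuBLAS instead of buffering them through ndloop. It uses the standard cublasDgemmStridedBatched API from the article linked above; the dimensions and strides are placeholders, not Cumo's actual gemm plumbing.

#include <cublas_v2.h>

/* Sketch only: multiply a batch of strided matrices without first copying
 * them into contiguous buffers. All sizes/strides are placeholders for
 * whatever Cumo's gemm would compute from the narrays. */
static cublasStatus_t gemm_strided_batched_sketch(cublasHandle_t handle,
                                                  const double *a, const double *b,
                                                  double *c,
                                                  int m, int n, int k,
                                                  long long stride_a,
                                                  long long stride_b,
                                                  long long stride_c,
                                                  int batch_count)
{
    const double alpha = 1.0, beta = 0.0;
    /* CUBLAS_OP_T lets cuBLAS read the transposed operand in place,
     * so no element-by-element DtoD buffering is needed. */
    return cublasDgemmStridedBatched(handle,
                                     CUBLAS_OP_N, CUBLAS_OP_T,
                                     m, n, k,
                                     &alpha,
                                     a, m, stride_a,
                                     b, n, stride_b,
                                     &beta,
                                     c, m, stride_c,
                                     batch_count);
}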

Filed against Numo as well: ruby-numo/numo-narray#95