FourierFlows/GeophysicalFlows.jl

Use CUDA.randn(nx, ny) instead of cu(randn(nx, ny))

navidcy opened this issue · 4 comments

This way of creating a stochastic forcing realization is not very efficient.

function calcF!(Fh, sol, t, clock, vars, params, grid)
  ξ = exp.(2π * im * rand(eltype(grid), size(sol))) / sqrt(clock.dt)
  ξ[1, 1] = 0
  Fh .= ArrayType(dev)(ξ) .* sqrt.(forcing_spectrum)
  return nothing
end

What we do here is create ξ on the CPU and then transfer it to the GPU when dev = GPU(). What we should do instead is generate the random numbers directly on the GPU with CUDA.rand (or CUDA.randn) when dev = GPU(), e.g.,

random_normal = dev==CPU() ? rand : CUDA.rand

function calcF!(Fh, sol, t, clock, vars, params, grid)
  ξ = exp.(2π * im * random_normal(eltype(grid), size(sol))) / sqrt(clock.dt)
  @CUDA.allowscalar ξ[1, 1] = 0

  @. Fh = ξ * sqrt(forcing_spectrum)

  return nothing
end

@glwagner do you think it can be done even more elegantly?

I think you want an algorithm that doesn't allocate memory --- but I don't think broadcasting will work (you may have to write a kernel). I believe the function above allocates a new array ξ every time calcF! is called?
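For reference, one way to avoid the per-call allocation without writing a custom kernel is to preallocate a real buffer for the random phases (say ξbuf, sized like grid.Krsq and stored e.g. in vars), refill it in place with Random.rand! (which CUDA.jl also supports for CuArrays), and fuse everything else into a single broadcast into Fh. The following is only a sketch; ξbuf, the argument list, and the name calcF_buffered! are illustrative, not part of the package.

using CUDA, Random

function calcF_buffered!(Fh, ξbuf, forcing_spectrum, dt)
  rand!(ξbuf)  # refill uniform phases in place, on CPU or GPU
  @. Fh = exp(2π * im * ξbuf) * sqrt(forcing_spectrum) / sqrt(dt)
  @CUDA.allowscalar Fh[1, 1] = 0  # zero the mean (k = l = 0) mode
  return nothing
end

The broadcast fuses into Fh, so no new array is allocated on each call.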

Yeap!
Have a look at this. The last version has far fewer allocations, but still some... ideas?

julia> using CUDA, FourierFlows, BenchmarkTools
[ Info: FourierFlows will use 48 threads

julia> dev = CPU();

julia> n, L = 512, 2π;

julia> dt = 0.1
0.1

julia> grid = TwoDGrid(dev, n, L);

julia> Fh = zeros(ComplexF64, size(grid.Krsq));

julia> forcing_spectrum = grid.Krsq;

julia> random_normal = dev==CPU() ? rand : CUDA.rand
rand (generic function with 77 methods)

julia> function calcF!(Fh, grid, dt)
         ξ = exp.(2π * im * rand(eltype(grid), size(grid.Krsq))) / sqrt(dt)
         ξ[1, 1] = 0

         Fh .= ArrayType(dev)(ξ) .* sqrt.(forcing_spectrum)

         return nothing
       end
calcF! (generic function with 1 method)

julia> function calcF_new!(Fh, grid, dt)
         ξ = exp.(2π * im * random_normal(eltype(grid), size(grid.Krsq))) / sqrt(dt)
         @CUDA.allowscalar ξ[1, 1] = 0

         @. Fh = ξ * sqrt(forcing_spectrum)

         return nothing
       end
calcF_new! (generic function with 1 method)

julia> function calcF_noallocation!(Fh, grid, dt)
         Fh .= sqrt.(forcing_spectrum) .* exp(2π * im * random_normal(eltype(grid))) ./ sqrt(dt)

         @CUDA.allowscalar Fh[1, 1] = 0

         return nothing
       end
calcF_noallocation! (generic function with 1 method)

julia> @btime calcF!(Fh, grid, dt)
  4.366 ms (14 allocations: 9.04 MiB)

julia> @btime calcF_new!(Fh, grid, dt)
  3.922 ms (19 allocations: 7.03 MiB)

julia> @btime calcF_noallocation!(Fh, grid, dt)
  341.880 μs (12 allocations: 336 bytes)

julia> versioninfo()
Julia Version 1.6.0
Commit f9720dc2eb (2021-03-24 12:55 UTC)
Platform Info:
  OS: Linux (x86_64-redhat-linux)
  CPU: Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-11.0.1 (ORCJIT, cascadelake)
Environment:
  JULIA_DEPOT_PATH = /g/data/v45/nc3020/.julia:/share/julia/site/
  JULIA_CUDA_USE_BINARYBUILDER = false
  JULIA_LOAD_PATH = @:@v#.#:@stdlib:@site
  JULIA_NUM_THREADS = 48

I guess the allocations in the last version could be considered ≈ 0?
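A side note on the residual 12 allocations: they most likely come from the non-const globals (forcing_spectrum, random_normal) referenced inside the function and from the un-interpolated @btime arguments, rather than from the array operations themselves. A hypothetical check (output not shown) is to mark those globals const, or pass them as arguments, and interpolate the benchmark arguments with $:

julia> @btime calcF_noallocation!($Fh, $grid, $dt);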