JuliaLinearAlgebra/MultiThreadedLU.jl

Memory profiling

ViralBShah opened this issue · 1 comments

a = rand(1000,1000);
b = rand(1000);

x=hpl_shared(a, b, 64);
@time x=hpl_shared(a, b, 64);

norm(a*x-b)
        - # This version is written for a shared memory implementation.
        - # The matrix A is local to the first Worker, which allocates work to other Workers
        - # All updates to A are carried out by the first Worker. Thus A is not distributed
        - 
        - using LinearAlgebra
        - 
        - # Blocked, right-looking LU factorization with partial pivoting (HPL-style)
        - # on shared memory, followed by a triangular solve of A x = b.
        - # The left column is bytes allocated per line (from `julia
        - # --track-allocation`); code lines are kept verbatim so the measurements
        - # still line up.
        - #
        - # Arguments:
        - #   A            n-by-n coefficient matrix (rebound locally to [A b];
        - #                the caller's array is not mutated)
        - #   b            length-n right-hand side
        - #   blocksize    panel width, must be >= 1
        - #   run_parallel if true, trailing updates run as Threads.@spawn tasks
        - # Returns the solution vector x. Throws ArgumentError for blocksize < 1.
        - function hpl_shared(A::Matrix, b::Vector, blocksize::Integer, run_parallel::Bool)
        - 
        0     n = size(A,1)
        -     # Build the augmented matrix [A b]; this copy is the ~16 MB below.
 16017312     A = [A b]
        - 
        0     if blocksize < 1
        0        throw(ArgumentError("hpl_shared: invalid blocksize: $blocksize < 1"))
        -     end
        - 
        -     # Block boundaries: block i spans rows (B_rows[i]+1):B_rows[i+1];
        -     # B_cols gets one extra boundary so column blocks also cover the
        -     # appended b column (column n+1).
      416     B_rows = collect(range(0, n, step=blocksize))
        0     B_rows[end] = n
      640     B_cols = [B_rows; [n+1]]
        0     nB = length(B_rows)
        -     # depend[i,j] doubles as a dependency flag (Bool) and, transiently,
        -     # as the Task / result tuple of the trailing update for block (i,j).
     4352     depend =  Array{Any}(undef, nB, nB)
        - 
        -     ## Small matrix case
        0     if nB <= 1
        0         x = A[1:n, 1:n] \ A[:,n+1]
        0         return x
        -     end
        - 
        -     ## Add a ghost row of dependencies to bootstrap the computation
        0     for j=1:nB; depend[1,j] = true; end
        0     for i=2:nB, j=1:nB; depend[i,j] = false; end
        - 
      480     for i=1:(nB-1)
        -         #println("A=$A") #####
        -         ## Threads for panel factorizations
        0         I = (B_rows[i]+1):B_rows[i+1]
        0         K = I[1]:n
        -         ## In-place LU of the panel; returns the factored panel (a view
        -         ## into A) and the pivot vector local to rows K.
     1920         (A_KI, panel_p) = panel_factor_par((@view A[K,I]), depend[i,i])
        - 
        -         ## Write the factorized panel back to A
        -         ## NOTE(review): A_KI is a view into A, so lu! already mutated A
        -         ## in place and this ~18 MB copy looks redundant — confirm before
        -         ## removing.
 17956141         A[K,I] = A_KI
        - 
        -         ## Panel permutation: translate panel-local pivot rows to global rows
   137024         panel_p = K[panel_p]
        0         depend[i+1,i] = true
        - 
        -         ## Apply permutation from pivoting
        0         J = (B_cols[i+1]+1):B_cols[nB+1]
        -         ## Dominant allocation (~83 MB): source and destination rows
        -         ## overlap, so the view on the right is presumably materialized
        -         ## (unaliased) before the assignment — TODO confirm.
 83211136         A[K, J] = @view A[panel_p, J]
        -         ## Threads for trailing updates
        -         #L_II = tril(A[I,I], -1) + LinearAlgebra.I
        -         ## Lazy unit-lower-triangular wrapper over the diagonal block.
     2400         L_II = UnitLowerTriangular(@view A[I,I])
        0         K = (I[length(I)]+1):n
     1920         A_KI = @view A[K,I]
        - 
       16         for j=(i+1):nB
       16             J = (B_cols[j]+1):B_cols[j+1]
        - 
        -             ## Do the trailing update (Compute U, and DGEMM - all flops are here)
    32768             if run_parallel
   233024                 A_IJ = @view A[I,J]
        -                 #A_KI = A[K,I]
   298208                 A_KJ = @view A[K,J]
        -                 ## Parallel path passes views; depend[i+1,j] temporarily
        -                 ## holds the Task, fetched in the write-back loop below.
   356480                 depend[i+1,j] = Threads.@spawn trailing_update_par(L_II, A_IJ, A_KI, A_KJ, depend[i+1,i], depend[i,j])
        -             else
        -                 ## Serial path passes copies (plain indexing), not views.
   118864                 depend[i+1,j] = trailing_update_par(L_II, A[I,J], A[K,I], A[K,J], depend[i+1,i], depend[i,j])
        -             end
        -         end
        - 
        -         # Wait for all trailing updates to complete, and write back to A
       80         for j=(i+1):nB
        0             J = (B_cols[j]+1):B_cols[j+1]
       80             if run_parallel
   144723                 (A_IJ, A_KJ) = fetch(depend[i+1,j])
        -             else
        0                 (A_IJ, A_KJ) = depend[i+1,j]
        -             end
        -             # ~62 MB total here: the fetched blocks are copied back into A.
  1444914             A[I,J] = A_IJ
 62122800             A[K,J] = A_KJ
        0             depend[i+1,j] = true
        -         end
        - 
        -     end
        - 
        -     ## Completion of the last diagonal block signals termination
        0     @assert depend[nB, nB]
        - 
        -     ## Solve the triangular system
        -     ## triu materializes the upper triangle (the ~32 MB below).
 32016576     x = triu(A[1:n,1:n]) \ A[:,n+1]
        - 
        0     return x
        - 
        - end ## hpl()
        - 
        - 
        - ### Panel factorization ###
        - 
        - # Factorize one panel in place.
        - #
        - #   A_KI    the panel (a view into A); overwritten with its LU factors
        - #   col_dep dependency flag; must be `true` (the column it depends on
        - #           is finished)
        - #
        - # Returns (A_KI, panel_p), where panel_p is the pivot permutation local
        - # to the panel's rows.
        - function panel_factor_par(A_KI, col_dep)
        - 
        -     # Internal invariant only; the caller always passes `true` here.
        -     @assert col_dep
        - 
        -     ## Factorize a panel (lu! works in place on the view)
        -     panel_p = lu!(A_KI).p
        - 
        -     return (A_KI, panel_p)
        - 
        - end ## panel_factor_par()
        - 
        - 
        - ### Trailing update ###
        - 
        - # Trailing update for one block column: compute the U block and apply
        - # the GEMM update to the trailing submatrix.
        - #
        - #   L_II    unit-lower-triangular diagonal block
        - #   A_IJ    block row to turn into a U block (rebound below, so the
        - #           caller's array/view is NOT updated in place)
        - #   A_KI    panel below the diagonal block (read only)
        - #   A_KJ    trailing block; updated in place by gemm! when non-empty
        - #   row_dep, col_dep  dependency flags; both must be `true`
        - #
        - # Returns (A_IJ, A_KJ) for the caller to write back into A.
        - function trailing_update_par(L_II, A_IJ, A_KI, A_KJ, row_dep, col_dep)
        -     #println(Threads.threadid())
        -     # NOTE(review): these asserts show up prominently in the allocation
        -     # profile (~10 MB attributed to the first one) — see issue comment.
 10031298     @assert row_dep
    98304     @assert col_dep
        - 
        -     ## Compute blocks of U. The `\` allocates a fresh matrix and rebinds
        -     ## the local A_IJ; the result is returned, not written through.
  6095888     A_IJ = L_II \ A_IJ
        - 
        -     ## Trailing submatrix update - All flops are here
      624     if !isempty(A_KJ)
        -         # NOTE(review): m, k and n are computed but never used — gemm!
        -         # takes its dimensions from the arrays themselves.
        -         m, k = size(A_KI)
       16         n = size(A_IJ,2)
        -         ## A_KJ <- A_KJ - A_KI*A_IJ, in place
    53248         BLAS.gemm!('N','N',-1.0,A_KI,A_IJ,1.0,A_KJ)
        -         #A_KJ = A_KJ - A_KI*A_IJ
        -     end
        - 
     8016     return (A_IJ, A_KJ)
        - 
        - end ## trailing_update_par()
        - 
        - # Convenience method: default blocksize = max(1, n/4), parallel enabled.
        - hpl_shared(A::Matrix, b::Vector) = hpl_shared(A, b, max(1, div(maximum(size(A)),4)), true)
        - 
        - # Convenience method: explicit blocksize, parallel enabled. The large
        - # count on this line aggregates the allocations of the whole call.
229724738 hpl_shared(A::Matrix, b::Vector, bsize::Integer) = hpl_shared(A, b, bsize, true)
        - 
        - # Driver: random 1000x1000 system, blocksize 64.
        - a = rand(1000,1000);
        - b = rand(1000);
        - 
        - # First call compiles; only the second, @time'd call is measured.
        - x=hpl_shared(a, b, 64);
        - @time x=hpl_shared(a, b, 64);
        - 
        - # Residual check; should be tiny if the factorization is correct.
        - norm(a*x-b)
        - 

@ph-com

Too many allocating views and asserts. The allocation profile is not 100% accurate, but removing the asserts doesn't change the timings much.