Memory profiling
ViralBShah opened this issue · 1 comments
ViralBShah commented
a = rand(1000,1000);
b = rand(1000);
x=hpl_shared(a, b, 64);
@time x=hpl_shared(a, b, 64);
norm(a*x-b)
- # This version is written for a shared memory implementation.
- # The matrix A is local to the first Worker, which allocates work to other Workers
- # All updates to A are carried out by the first Worker. Thus A is not distributed
-
- using LinearAlgebra
-
- function hpl_shared(A::Matrix, b::Vector, blocksize::Integer, run_parallel::Bool)
-
0 n = size(A,1)
16017312 A = [A b]
-
0 if blocksize < 1
0 throw(ArgumentError("hpl_shared: invalid blocksize: $blocksize < 1"))
- end
-
416 B_rows = collect(range(0, n, step=blocksize))
0 B_rows[end] = n
640 B_cols = [B_rows; [n+1]]
0 nB = length(B_rows)
4352 depend = Array{Any}(undef, nB, nB)
-
- ## Small matrix case
0 if nB <= 1
0 x = A[1:n, 1:n] \ A[:,n+1]
0 return x
- end
-
- ## Add a ghost row of dependencies to boostrap the computation
0 for j=1:nB; depend[1,j] = true; end
0 for i=2:nB, j=1:nB; depend[i,j] = false; end
-
480 for i=1:(nB-1)
- #println("A=$A") #####
- ## Threads for panel factorizations
0 I = (B_rows[i]+1):B_rows[i+1]
0 K = I[1]:n
1920 (A_KI, panel_p) = panel_factor_par((@view A[K,I]), depend[i,i])
-
- ## Write the factorized panel back to A
17956141 A[K,I] = A_KI
-
- ## Panel permutation
137024 panel_p = K[panel_p]
0 depend[i+1,i] = true
-
- ## Apply permutation from pivoting
0 J = (B_cols[i+1]+1):B_cols[nB+1]
83211136 A[K, J] = @view A[panel_p, J]
- ## Threads for trailing updates
- #L_II = tril(A[I,I], -1) + LinearAlgebra.I
2400 L_II = UnitLowerTriangular(@view A[I,I])
0 K = (I[length(I)]+1):n
1920 A_KI = @view A[K,I]
-
16 for j=(i+1):nB
16 J = (B_cols[j]+1):B_cols[j+1]
-
- ## Do the trailing update (Compute U, and DGEMM - all flops are here)
32768 if run_parallel
233024 A_IJ = @view A[I,J]
- #A_KI = A[K,I]
298208 A_KJ = @view A[K,J]
356480 depend[i+1,j] = Threads.@spawn trailing_update_par(L_II, A_IJ, A_KI, A_KJ, depend[i+1,i], depend[i,j])
- else
118864 depend[i+1,j] = trailing_update_par(L_II, A[I,J], A[K,I], A[K,J], depend[i+1,i], depend[i,j])
- end
- end
-
- # Wait for all trailing updates to complete, and write back to A
80 for j=(i+1):nB
0 J = (B_cols[j]+1):B_cols[j+1]
80 if run_parallel
144723 (A_IJ, A_KJ) = fetch(depend[i+1,j])
- else
0 (A_IJ, A_KJ) = depend[i+1,j]
- end
1444914 A[I,J] = A_IJ
62122800 A[K,J] = A_KJ
0 depend[i+1,j] = true
- end
-
- end
-
- ## Completion of the last diagonal block signals termination
0 @assert depend[nB, nB]
-
- ## Solve the triangular system
32016576 x = triu(A[1:n,1:n]) \ A[:,n+1]
-
0 return x
-
- end ## hpl()
-
-
- ### Panel factorization ###
-
- function panel_factor_par(A_KI, col_dep)
-
- @assert col_dep
-
- ## Factorize a panel
- panel_p = lu!(A_KI).p
-
- return (A_KI, panel_p)
-
- end ## panel_factor_par()
-
-
- ### Trailing update ###
-
- function trailing_update_par(L_II, A_IJ, A_KI, A_KJ, row_dep, col_dep)
- #println(Threads.threadid())
10031298 @assert row_dep
98304 @assert col_dep
-
- ## Compute blocks of U
6095888 A_IJ = L_II \ A_IJ
-
- ## Trailing submatrix update - All flops are here
624 if !isempty(A_KJ)
- m, k = size(A_KI)
16 n = size(A_IJ,2)
53248 BLAS.gemm!('N','N',-1.0,A_KI,A_IJ,1.0,A_KJ)
- #A_KJ = A_KJ - A_KI*A_IJ
- end
-
8016 return (A_IJ, A_KJ)
-
- end ## trailing_update_par()
-
- hpl_shared(A::Matrix, b::Vector) = hpl_shared(A, b, max(1, div(maximum(size(A)),4)), true)
-
229724738 hpl_shared(A::Matrix, b::Vector, bsize::Integer) = hpl_shared(A, b, bsize, true)
-
- a = rand(1000,1000);
- b = rand(1000);
-
- x=hpl_shared(a, b, 64);
- @time x=hpl_shared(a, b, 64);
-
- norm(a*x-b)
-
ViralBShah commented
Too many allocating views and asserts. This is not 100% accurate, but removing the asserts doesn't change much.