Suboptimal Choice of the Vectorization Level for Image Convolution
RoyiAvital opened this issue · 1 comments
RoyiAvital commented
The following image convolution code shows a case where the choice of the vectorization level is not optimal:
using BenchmarkTools;
using LoopVectorization;
"""
    _Conv2DValidA!( mO, mI, mK )

2D "valid" convolution of image `mI` with kernel `mK`, written into the
preallocated output `mO` (each axis of `mO` is the input size minus the
kernel size plus 1 — the caller is responsible for sizing it).
Variant A: `@turbo` is applied to the *inner* (row / first-index) loop only,
while the outer column loop is a plain `for`. Result is stored in `mO`.
"""
function _Conv2DValidA!( mO :: Matrix{T}, mI :: Matrix{T}, mK :: Matrix{T} ) where {T <: AbstractFloat}
numRowsI, numColsI = size(mI);
numRowsK, numColsK = size(mK);
for jj ∈ 1:(numColsI - numColsK + 1)
# Vectorize only over the row index `ii` (column-major fast axis);
# the column loop above runs serially.
@turbo for ii in 1:(numRowsI - numRowsK + 1)
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
# True convolution indexing (kernel flipped): the input row index
# ii - mm + numRowsK ranges over ii:(ii + numRowsK - 1), so it stays
# within 1:numRowsI for all valid ii — `@inbounds` is safe here.
@inbounds sumVal += mK[mm, nn] * mI[ii - mm + numRowsK, jj - nn + numColsK];
end
mO[ii, jj] = sumVal;
end
end
end
"""
    _Conv2DValidB!( mO, mI, mK )

2D "valid" convolution of image `mI` with kernel `mK`, written into the
preallocated output `mO` (each axis of `mO` is the input size minus the
kernel size plus 1 — the caller is responsible for sizing it).
Variant B: `@turbo` wraps the *outer* (column) loop, handing the entire
loop nest to LoopVectorization. Result is stored in `mO`.
"""
function _Conv2DValidB!( mO :: Matrix{T}, mI :: Matrix{T}, mK :: Matrix{T} ) where {T <: AbstractFloat}
numRowsI, numColsI = size(mI);
numRowsK, numColsK = size(mK);
# `@turbo` on the outermost loop lets LoopVectorization choose the
# vectorization strategy for the whole nest — the only difference from A.
@turbo for jj ∈ 1:(numColsI - numColsK + 1)
for ii in 1:(numRowsI - numRowsK + 1)
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
# True convolution indexing (kernel flipped): the input row index
# ii - mm + numRowsK ranges over ii:(ii + numRowsK - 1), so it stays
# within 1:numRowsI for all valid ii — `@inbounds` is safe here.
@inbounds sumVal += mK[mm, nn] * mI[ii - mm + numRowsK, jj - nn + numColsK];
end
mO[ii, jj] = sumVal;
end
end
end
# Reproduction: 1000×1000 input with a 5×5 kernel — the sizes for which the
# ~10% timing difference between the two @turbo placements was observed.
numRowsA = 1000;
numColsA = 1000;
numRowsK = 5;
numColsK = 5;
mA = rand(numRowsA, numColsA);
mK = rand(numRowsK, numColsK);
# "Valid" convolution output shrinks by (kernel size - 1) along each axis.
mO = zeros((numRowsA, numColsA) .- (numRowsK, numColsK) .+ 1);
# Interpolate the non-const globals with `$` so BenchmarkTools measures the
# kernels themselves rather than dynamic dispatch on untyped globals.
@benchmark _Conv2DValidA!($mO, $mA, $mK) #<! @turbo inside
@benchmark _Conv2DValidB!($mO, $mA, $mK) #<! @turbo outside
It seems that manually applying @turbo
to the inner loop (over ii
) improves performance.
On my computer, _Conv2DValidA!()
is ~10% faster.
RoyiAvital commented