JuliaSIMD/LoopVectorization.jl

Suboptimal Choice of the Vectorization Level for Image Convolution

RoyiAvital opened this issue · 1 comments

The following image convolution code shows a case where the choice of the vectorization level is not optimal:

using BenchmarkTools;
using LoopVectorization;

"""
    _Conv2DValidA!(mO, mI, mK)

Compute the "valid" 2D convolution of the image `mI` with the kernel `mK`,
writing the result into the pre-allocated output `mO` and returning `mO`.

Here `@turbo` is applied only to the inner (row, `ii`) loop; the outer column
loop `jj` runs as a plain Julia loop. Compare with `_Conv2DValidB!`, which
places `@turbo` on the outer loop instead.

Assumes `size(mO) == size(mI) .- size(mK) .+ 1` — not checked here
(NOTE(review): callers must guarantee this, since indexing is `@inbounds`).
"""
function _Conv2DValidA!( mO :: Matrix{T}, mI :: Matrix{T}, mK :: Matrix{T} ) where {T <: AbstractFloat}

    numRowsI, numColsI = size(mI);
    numRowsK, numColsK = size(mK);

    for jj in 1:(numColsI - numColsK + 1)
        @turbo for ii in 1:(numRowsI - numRowsK + 1)
            sumVal = zero(T);
            # True convolution (kernel flipped): the image index runs from
            # (ii + numRowsK - 1, jj + numColsK - 1) down to (ii, jj), which
            # stays within bounds for the "valid" output range above.
            for nn in 1:numColsK, mm in 1:numRowsK
                @inbounds sumVal += mK[mm, nn] * mI[ii - mm + numRowsK, jj - nn + numColsK];
            end
            mO[ii, jj] = sumVal;
        end
    end

    return mO;

end

"""
    _Conv2DValidB!(mO, mI, mK)

Compute the "valid" 2D convolution of the image `mI` with the kernel `mK`,
writing the result into the pre-allocated output `mO` and returning `mO`.

Here `@turbo` is applied to the outermost (column, `jj`) loop, letting
LoopVectorization choose the vectorization strategy for the whole nest.
Compare with `_Conv2DValidA!`, which applies `@turbo` to the inner loop only.

Assumes `size(mO) == size(mI) .- size(mK) .+ 1` — not checked here
(NOTE(review): callers must guarantee this, since indexing is `@inbounds`).
"""
function _Conv2DValidB!( mO :: Matrix{T}, mI :: Matrix{T}, mK :: Matrix{T} ) where {T <: AbstractFloat}

    numRowsI, numColsI = size(mI);
    numRowsK, numColsK = size(mK);

    @turbo for jj in 1:(numColsI - numColsK + 1)
        for ii in 1:(numRowsI - numRowsK + 1)
            sumVal = zero(T);
            # True convolution (kernel flipped): the image index runs from
            # (ii + numRowsK - 1, jj + numColsK - 1) down to (ii, jj), which
            # stays within bounds for the "valid" output range above.
            for nn in 1:numColsK, mm in 1:numRowsK
                @inbounds sumVal += mK[mm, nn] * mI[ii - mm + numRowsK, jj - nn + numColsK];
            end
            mO[ii, jj] = sumVal;
        end
    end

    return mO;

end

# Benchmark setup: a 1000×1000 input image convolved with a 5×5 kernel.
numRowsA, numColsA = 1000, 1000;
numRowsK, numColsK = 5, 5;

mA = rand(numRowsA, numColsA);
mK = rand(numRowsK, numColsK);
# "Valid" convolution shrinks each dimension by (kernel size - 1).
mO = zeros(numRowsA - numRowsK + 1, numColsA - numColsK + 1);

@benchmark _Conv2DValidA!(mO, mA, mK) #<! @turbo inside
@benchmark _Conv2DValidB!(mO, mA, mK) #<! @turbo outside

It seems like the manual choice of applying @turbo to the inner loop (over ii) improves performance.

On my computer _Conv2DValidA!() is ~10% faster.

@chriselrod 's analysis yield this:

image

His findings were:

image