JuliaGPU/Metal.jl

Control flow-related miscompilation:

maleadt opened this issue · 1 comments

MWE from JuliaGPU/GPUArrays.jl#525:

using Metal, KernelAbstractions

# Copy `src` into `dest` by evaluating an `identity` broadcast over `src` inside a
# KernelAbstractions kernel built for `dest`'s backend, then return `dest`.
# NOTE(review): this is a minimal reproducer for a miscompilation, so the exact shape
# of the statements (including seemingly redundant variables) is presumably
# intentional -- confirm before simplifying anything.
function doit!(dest, src)
    # Wrap `src` in an `identity` broadcast and instantiate it.
    bc = Broadcast.instantiate(Broadcast.broadcasted(identity, src))
    if bc isa Broadcast.Broadcasted
        # Preprocess the broadcast against `dest` before passing it to the kernel.
        bc = Broadcast.preprocess(dest, bc)
    end

    # grid-stride kernel
    @kernel function map_kernel(dest, bc, nelem)
        # `j` counts the iterations of the loop below.
        j = 0
        # Global linear index as reported by `@index`.
        J = @index(Global, Linear)
        # This `i` is not read anywhere in the visible body; presumably the `for`
        # loop below introduces its own `i` -- TODO confirm.
        i = 1
        for i in 1:nelem
            j += 1
            # Only perform the copy while the counter is within `length(dest)`.
            if j <= length(dest)
                # The linear offset `(J-1)*nelem + j` is converted into a Cartesian
                # index over the axes of `bc`, then used for both the read and write.
                # XXX: adding @inbounds here works around the miscompilation
                J_c = CartesianIndices(axes(bc))[(J-1)*nelem + j]
                @inbounds dest[J_c] = bc[J_c]
            end
        end
    end
    # Build the kernel for the backend of `dest`.
    kernel = map_kernel(get_backend(dest))
    # Launch with `nelem = 1` and `ndrange` set to `length(dest)`.
    kernel(dest, bc, 1; ndrange = length(dest))
    return dest
end

# Driver for the reproducer: run `doit!` once on plain arrays and once on Metal
# arrays of length `n`, printing the destination after each run.
function main(n=2)
    # Run on arrays created with `zeros`/`ones`; `doit!` returns its destination.
    dst = doit!(zeros(Float32, n), ones(Float32, n))
    @show dst

    # Same call on arrays created with `Metal.zeros`/`Metal.ones`.
    dst = doit!(Metal.zeros(Float32, n), Metal.ones(Float32, n))
    @show dst

    return nothing
end

This produces invalid results on my M3 Pro, as well as on my M1 Pro (the second line, from the Metal run, should also be all ones):

julia> main(2)
dst = Float32[1.0, 1.0]
dst = Float32[0.0, 0.0]

May also be the cause of hard compilation failures seen in https://buildkite.com/julialang/gpuarrays-dot-jl/builds/985#0190ded6-c40b-4d43-80f6-4785d8a6a8bf, which I cannot reproduce.

The problem doesn't happen with inputs of size 1. It also doesn't happen when @inbounds is added to the only remaining exception block (the indexing expression marked `XXX` in the kernel).

Seems fixed indeed, thanks for confirming @christiangnrd!