Control flow-related miscompilation:
maleadt opened this issue · 1 comments
maleadt commented
MWE from JuliaGPU/GPUArrays.jl#525:
using Metal, KernelAbstractions
# Reproducer: copy `src` into `dest` through an identity broadcast evaluated by a
# KernelAbstractions kernel, then return `dest`.
#
# NOTE: this function is a minimal working example for a miscompilation. Keep the
# statement order and control flow exactly as written; the issue depends on them.
function doit!(dest, src)
# Build and instantiate a lazy `identity` broadcast over `src`.
bc = Broadcast.instantiate(Broadcast.broadcasted(identity, src))
# Only a `Broadcasted` object is passed through `Broadcast.preprocess` for `dest`.
if bc isa Broadcast.Broadcasted
bc = Broadcast.preprocess(dest, bc)
end
# grid-stride kernel
# NOTE(review): as written, each work-item handles `nelem` consecutive linear
# indices starting at (J-1)*nelem + 1, i.e. a contiguous chunk per work-item —
# confirm whether "grid-stride" is the intended description.
@kernel function map_kernel(dest, bc, nelem)
# `j` is a manually maintained counter, incremented once per loop iteration.
j = 0
# Global linear index of this work-item.
J = @index(Global, Linear)
# This `i` and the loop variable `i` below are never read in the kernel body;
# they appear to be leftovers from reducing the original code.
i = 1
for i in 1:nelem
j += 1
# Guard so that only counters within `length(dest)` perform a copy.
if j <= length(dest)
# XXX: adding @inbounds here works around the miscompilation
# The indexing below is deliberately left bounds-checked: it is the one
# expression in the kernel that is not marked `@inbounds`.
J_c = CartesianIndices(axes(bc))[(J-1)*nelem + j]
# Copy one element from the broadcast into `dest`, without bounds checks.
@inbounds dest[J_c] = bc[J_c]
end
end
end
# Instantiate the kernel for the backend that owns `dest`.
kernel = map_kernel(get_backend(dest))
# Launch with nelem = 1 and one work-item per element of `dest`.
kernel(dest, bc, 1; ndrange = length(dest))
return dest
end
# Driver for the reproducer: run `doit!` on length-`n` Float32 arrays, first with
# plain CPU arrays and then with Metal arrays, printing the destination each time.
# The two printed vectors are meant to be compared against each other.
function main(n=2)
    # CPU run: zero-filled destination, one-filled source.
    let dst = zeros(Float32, n)
        doit!(dst, ones(Float32, n))
        @show dst
    end

    # Metal run: same shapes and element type, allocated through Metal.
    let dst = Metal.zeros(Float32, n)
        doit!(dst, Metal.ones(Float32, n))
        @show dst
    end

    return nothing
end
This produces invalid results on my M3 Pro, as well as on my M1 Pro:
julia> main(2)
dst = Float32[1.0, 1.0]
dst = Float32[0.0, 0.0]
May also be the cause of hard compilation failures seen in https://buildkite.com/julialang/gpuarrays-dot-jl/builds/985#0190ded6-c40b-4d43-80f6-4785d8a6a8bf, which I cannot reproduce.
Doesn't happen with single-element inputs. It also doesn't happen when adding @inbounds
to the only remaining exception block (the bounds-checked CartesianIndices indexing marked XXX in the kernel).
maleadt commented
Seems fixed indeed, thanks for confirming @christiangnrd!