Error with Julia 1.10
Closed this issue · 1 comment
eschnett commented
I am following the example in the README. I am using an Intel MacBook Pro (unsupported, I know). The example works with Julia 1.9 but breaks with Julia 1.10. The backtrace is:
julia> using Metal
julia> function vadd(a, b, c)
i = thread_position_in_grid_1d()
c[i] = a[i] + b[i]
return
end
vadd (generic function with 1 method)
julia> a = MtlArray([1,1,1,1]); b = MtlArray([2,2,2,2]); c = similar(a);
┌ Warning: Metal.jl is only supported on M-series Macs, you may run into issues.
│ See https://github.com/JuliaGPU/Metal.jl/issues/22 for more details.
└ @ Metal ~/.julia/packages/Metal/lnkVP/src/state.jl:14
julia> @metal threads=2 groups=2 vadd(a, b, c)
ERROR: Compilation to native code failed; see below for details.
If you think this is a bug, please file an issue and attach /var/folders/gb/x5lhfpj15ln66g6br549tz5m0000gs/T/jl_uPobCnqvOh.metallib.
Stacktrace:
[1] error(s::String)
@ Base ./error.jl:35
[2] link(job::GPUCompiler.CompilerJob, compiled::@NamedTuple{image::Vector{UInt8}, entry::String}; return_function::Bool)
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:78
[3] link(job::GPUCompiler.CompilerJob, compiled::@NamedTuple{image::Vector{UInt8}, entry::String})
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:65
[4] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/2mJjc/src/execution.jl:132
[5] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/2mJjc/src/execution.jl:103
[6] macro expansion
@ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:162 [inlined]
[7] macro expansion
@ ./lock.jl:267 [inlined]
[8] mtlfunction(f::typeof(vadd), tt::Type{Tuple{…}}; name::Nothing, kwargs::@Kwargs{})
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:157
[9] mtlfunction(f::typeof(vadd), tt::Type{Tuple{MtlDeviceVector{Int64, 1}, MtlDeviceVector{Int64, 1}, MtlDeviceVector{Int64, 1}}})
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:155
[10] top-level scope
@ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:77
[11] top-level scope
@ ~/.julia/packages/Metal/lnkVP/src/initialization.jl:57
caused by: NSError: SC compilation failure
There is a call to an undefined label (CompilerError, code 2)
Stacktrace:
[1] MTLComputePipelineState(dev::Metal.MTL.MTLDeviceInstance, fun::Metal.MTL.MTLFunctionInstance)
@ Metal.MTL ~/.julia/packages/Metal/lnkVP/lib/mtl/compute_pipeline.jl:60
[2] link(job::GPUCompiler.CompilerJob, compiled::@NamedTuple{image::Vector{UInt8}, entry::String}; return_function::Bool)
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:70
[3] link(job::GPUCompiler.CompilerJob, compiled::@NamedTuple{image::Vector{UInt8}, entry::String})
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:65
[4] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/2mJjc/src/execution.jl:132
[5] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/2mJjc/src/execution.jl:103
[6] macro expansion
@ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:162 [inlined]
[7] macro expansion
@ ./lock.jl:267 [inlined]
[8] mtlfunction(f::typeof(vadd), tt::Type{Tuple{…}}; name::Nothing, kwargs::@Kwargs{})
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:157
[9] mtlfunction(f::typeof(vadd), tt::Type{Tuple{MtlDeviceVector{Int64, 1}, MtlDeviceVector{Int64, 1}, MtlDeviceVector{Int64, 1}}})
@ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:155
[10] top-level scope
@ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:77
[11] top-level scope
@ ~/.julia/packages/Metal/lnkVP/src/initialization.jl:57
Some type information was truncated. Use `show(err)` to see complete types.
I'd like to help out. Do you have a pointer to where I could start looking? Could it e.g. be an incompatibility between LLVM versions?
tgymnich commented
Julia 1.10:
source_filename = "start"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
target triple = "air64-apple-macosx14.2.1"
; Function Attrs: cold noreturn nounwind
declare void @llvm.trap() #0
define internal void @gpu_report_exception() unnamed_addr {
top:
ret void
}
define internal void @gpu_signal_exception() unnamed_addr {
top:
ret void
}
define void @_Z4vadd14MtlDeviceArrayI5Int64Li1ELi1EES_IS0_Li1ELi1EES_IS0_Li1ELi1EE({ i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %2, i32 %thread_position_in_grid) local_unnamed_addr {
conversion:
%3 = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0, i64 0, i32 1, i64 0
%.unpack10.unpack = load i64, i64 addrspace(1)* %3, align 8
%4 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1 to i64 addrspace(1)* addrspace(1)*
%.unpack16 = load i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %4, align 8
%5 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %2 to i64 addrspace(1)* addrspace(1)*
%.unpack20 = load i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %5, align 8
%6 = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %2, i64 0, i32 1, i64 0
%.unpack18.unpack = load i64, i64 addrspace(1)* %6, align 8
%7 = add i32 %thread_position_in_grid, 1
%8 = call i64 @air.max.s64(i64 %.unpack10.unpack, i64 0)
%9 = icmp eq i32 %7, 0
%10 = zext i32 %7 to i64
%11 = icmp ult i64 %8, %10
%12 = or i1 %9, %11
br i1 %12, label %L20, label %L23
L20: ; preds = %conversion
call void @gpu_report_exception()
call void @gpu_signal_exception()
call void @llvm.trap()
unreachable
L23: ; preds = %conversion
%13 = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1, i64 0, i32 1, i64 0
%.unpack14.unpack = load i64, i64 addrspace(1)* %13, align 8
%14 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0 to i64 addrspace(1)* addrspace(1)*
%.unpack12 = load i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %14, align 8
%15 = sext i32 %thread_position_in_grid to i64
%16 = getelementptr inbounds i64, i64 addrspace(1)* %.unpack12, i64 %15
%17 = load i64, i64 addrspace(1)* %16, align 8, !tbaa !20
%18 = call i64 @air.max.s64(i64 %.unpack14.unpack, i64 0)
%.not = icmp ult i64 %18, %10
br i1 %.not, label %L46, label %L49
L46: ; preds = %L23
call void @gpu_report_exception()
call void @gpu_signal_exception()
call void @llvm.trap()
unreachable
L49: ; preds = %L23
%19 = call i64 @air.max.s64(i64 %.unpack18.unpack, i64 0)
%20 = icmp ult i64 %19, %10
br i1 %20, label %L73, label %L76
L73: ; preds = %L49
call void @gpu_report_exception()
call void @gpu_signal_exception()
call void @llvm.trap()
unreachable
L76: ; preds = %L49
%21 = getelementptr inbounds i64, i64 addrspace(1)* %.unpack16, i64 %15
%22 = load i64, i64 addrspace(1)* %21, align 8, !tbaa !20
%23 = add i64 %22, %17
%24 = getelementptr inbounds i64, i64 addrspace(1)* %.unpack20, i64 %15
store i64 %23, i64 addrspace(1)* %24, align 8, !tbaa !20
ret void
}
declare i64 @air.max.s64(i64, i64) local_unnamed_addr
attributes #0 = { cold noreturn nounwind }
!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
!julia.kernel = !{!9}
!air.kernel = !{!10}
!llvm.ident = !{!17}
!air.version = !{!18}
!air.language_version = !{!19}
!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i32 7, !"air.max_device_buffers", i32 31}
!3 = !{i32 7, !"air.max_constant_buffers", i32 31}
!4 = !{i32 7, !"air.max_threadgroup_buffers", i32 31}
!5 = !{i32 7, !"air.max_textures", i32 128}
!6 = !{i32 7, !"air.max_read_write_textures", i32 8}
!7 = !{i32 7, !"air.max_samplers", i32 16}
!8 = !{i32 2, !"SDK Version", [3 x i32] [i32 14, i32 2, i32 1]}
!9 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, i32)* @_Z4vadd14MtlDeviceArrayI5Int64Li1ELi1EES_IS0_Li1ELi1EES_IS0_Li1ELi1EE}
!10 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, i32)* @_Z4vadd14MtlDeviceArrayI5Int64Li1ELi1EES_IS0_Li1ELi1EES_IS0_Li1ELi1EE, !11, !12}
!11 = !{}
!12 = !{!13, !14, !15, !16}
!13 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Int64, 1}", !"air.arg_name", !"a"}
!14 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Int64, 1}", !"air.arg_name", !"b"}
!15 = !{i32 2, !"air.buffer", !"air.location_index", i32 2, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Int64, 1}", !"air.arg_name", !"c"}
!16 = !{i32 3, !"air.thread_position_in_grid", !"air.arg_type_name", !"uint"}
!17 = !{!"Julia 1.10.0 with Metal.jl"}
!18 = !{i32 3, i32 0, i32 0}
!19 = !{!"Metal", i32 3, i32 0, i32 0}
!20 = !{!21, !21, i64 0, i64 0}
!21 = !{!"custom_tbaa_addrspace(1)", !22, i64 0}
!22 = !{!"custom_tbaa"}
Julia 1.9:
source_filename = "start"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
target triple = "air64-apple-macosx14.2.1"
; Function Attrs: cold noreturn nounwind
declare void @llvm.trap() #0
define internal void @gpu_report_exception() unnamed_addr {
top:
ret void
}
define internal void @gpu_signal_exception() unnamed_addr {
top:
ret void
}
define void @_Z4vadd14MtlDeviceArrayI5Int64Li1ELi1EES_IS0_Li1ELi1EES_IS0_Li1ELi1EE({ i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %2, i32 %thread_position_in_grid) local_unnamed_addr {
conversion:
%3 = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0, i64 0, i32 1, i64 0
%.unpack13.unpack = load i64, i64 addrspace(1)* %3, align 8
%4 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1 to i64 addrspace(1)* addrspace(1)*
%.unpack19 = load i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %4, align 8
%5 = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1, i64 0, i32 1, i64 0
%.unpack17.unpack = load i64, i64 addrspace(1)* %5, align 8
%6 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %2 to i64 addrspace(1)* addrspace(1)*
%.unpack23 = load i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %6, align 8
%7 = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %2, i64 0, i32 1, i64 0
%.unpack21.unpack = load i64, i64 addrspace(1)* %7, align 8
%8 = add i32 %thread_position_in_grid, 1
%9 = icmp sgt i64 %.unpack13.unpack, 0
%10 = select i1 %9, i64 %.unpack13.unpack, i64 0
%11 = icmp eq i32 %8, 0
%12 = zext i32 %8 to i64
%13 = icmp ult i64 %10, %12
%14 = or i1 %11, %13
br i1 %14, label %L20, label %L23
L20: ; preds = %conversion
call void @gpu_report_exception()
call void @gpu_signal_exception()
call void @llvm.trap()
unreachable
L23: ; preds = %conversion
%15 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0 to i64 addrspace(1)* addrspace(1)*
%.unpack15 = load i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %15, align 8
%16 = sext i32 %thread_position_in_grid to i64
%17 = getelementptr inbounds i64, i64 addrspace(1)* %.unpack15, i64 %16
%18 = load i64, i64 addrspace(1)* %17, align 8, !tbaa !20
%19 = icmp sgt i64 %.unpack17.unpack, 0
%20 = select i1 %19, i64 %.unpack17.unpack, i64 0
%.not = icmp ult i64 %20, %12
br i1 %.not, label %L46, label %L49
L46: ; preds = %L23
call void @gpu_report_exception()
call void @gpu_signal_exception()
call void @llvm.trap()
unreachable
L49: ; preds = %L23
%21 = icmp sgt i64 %.unpack21.unpack, 0
%22 = select i1 %21, i64 %.unpack21.unpack, i64 0
%23 = icmp ult i64 %22, %12
br i1 %23, label %L73, label %L76
L73: ; preds = %L49
call void @gpu_report_exception()
call void @gpu_signal_exception()
call void @llvm.trap()
unreachable
L76: ; preds = %L49
%24 = getelementptr inbounds i64, i64 addrspace(1)* %.unpack19, i64 %16
%25 = load i64, i64 addrspace(1)* %24, align 8, !tbaa !20
%26 = add i64 %25, %18
%27 = getelementptr inbounds i64, i64 addrspace(1)* %.unpack23, i64 %16
store i64 %26, i64 addrspace(1)* %27, align 8, !tbaa !20
ret void
}
attributes #0 = { cold noreturn nounwind }
!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
!julia.kernel = !{!9}
!air.kernel = !{!10}
!llvm.ident = !{!17}
!air.version = !{!18}
!air.language_version = !{!19}
!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i32 7, !"air.max_device_buffers", i32 31}
!3 = !{i32 7, !"air.max_constant_buffers", i32 31}
!4 = !{i32 7, !"air.max_threadgroup_buffers", i32 31}
!5 = !{i32 7, !"air.max_textures", i32 128}
!6 = !{i32 7, !"air.max_read_write_textures", i32 8}
!7 = !{i32 7, !"air.max_samplers", i32 16}
!8 = !{i32 2, !"SDK Version", [3 x i32] [i32 14, i32 2, i32 1]}
!9 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, i32)* @_Z4vadd14MtlDeviceArrayI5Int64Li1ELi1EES_IS0_Li1ELi1EES_IS0_Li1ELi1EE}
!10 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, i32)* @_Z4vadd14MtlDeviceArrayI5Int64Li1ELi1EES_IS0_Li1ELi1EES_IS0_Li1ELi1EE, !11, !12}
!11 = !{}
!12 = !{!13, !14, !15, !16}
!13 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Int64, 1}", !"air.arg_name", !"a"}
!14 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Int64, 1}", !"air.arg_name", !"b"}
!15 = !{i32 2, !"air.buffer", !"air.location_index", i32 2, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Int64, 1}", !"air.arg_name", !"c"}
!16 = !{i32 3, !"air.thread_position_in_grid", !"air.arg_type_name", !"uint"}
!17 = !{!"Julia 1.9.4 with Metal.jl"}
!18 = !{i32 3, i32 0, i32 0}
!19 = !{!"Metal", i32 3, i32 0, i32 0}
!20 = !{!21, !21, i64 0, i64 0}
!21 = !{!"custom_tbaa_addrspace(1)", !22, i64 0}
!22 = !{!"custom_tbaa"}
The main difference is a call to `@air.max.s64`. Shouldn't the correct intrinsic be `air.max.s.i64`?
=> The bug seems to be in the intrinsic lowering logic in GPUCompiler.jl:
https://github.com/JuliaGPU/GPUCompiler.jl/blob/111685fc4fe692c2d632e9f9e3be938a8c1ff768/src/metal.jl#L897
Most interestingly, the `@air.max.s64` intrinsic still works on M-series chips. The final AGX code is the same, though.