JuliaGPU/Metal.jl

Error with Julia 1.10

Closed this issue · 1 comment

I am following the example in the README. I am using an Intel MacBook Pro (unsupported, I know). The example works with Julia 1.9 but breaks with Julia 1.10. The backtrace is:

julia> using Metal

julia> function vadd(a, b, c)
           i = thread_position_in_grid_1d()
           c[i] = a[i] + b[i]
           return
       end
vadd (generic function with 1 method)

julia> a = MtlArray([1,1,1,1]); b = MtlArray([2,2,2,2]); c = similar(a);
┌ Warning: Metal.jl is only supported on M-series Macs, you may run into issues.
│ See https://github.com/JuliaGPU/Metal.jl/issues/22 for more details.
└ @ Metal ~/.julia/packages/Metal/lnkVP/src/state.jl:14

julia> @metal threads=2 groups=2 vadd(a, b, c)
ERROR: Compilation to native code failed; see below for details.
If you think this is a bug, please file an issue and attach /var/folders/gb/x5lhfpj15ln66g6br549tz5m0000gs/T/jl_uPobCnqvOh.metallib.
Stacktrace:
  [1] error(s::String)
    @ Base ./error.jl:35
  [2] link(job::GPUCompiler.CompilerJob, compiled::@NamedTuple{image::Vector{UInt8}, entry::String}; return_function::Bool)
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:78
  [3] link(job::GPUCompiler.CompilerJob, compiled::@NamedTuple{image::Vector{UInt8}, entry::String})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:65
  [4] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2mJjc/src/execution.jl:132
  [5] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2mJjc/src/execution.jl:103
  [6] macro expansion
    @ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:162 [inlined]
  [7] macro expansion
    @ ./lock.jl:267 [inlined]
  [8] mtlfunction(f::typeof(vadd), tt::Type{Tuple{…}}; name::Nothing, kwargs::@Kwargs{})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:157
  [9] mtlfunction(f::typeof(vadd), tt::Type{Tuple{MtlDeviceVector{Int64, 1}, MtlDeviceVector{Int64, 1}, MtlDeviceVector{Int64, 1}}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:155
 [10] top-level scope
    @ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:77
 [11] top-level scope
    @ ~/.julia/packages/Metal/lnkVP/src/initialization.jl:57

caused by: NSError: SC compilation failure
There is a call to an undefined label (CompilerError, code 2)
Stacktrace:
  [1] MTLComputePipelineState(dev::Metal.MTL.MTLDeviceInstance, fun::Metal.MTL.MTLFunctionInstance)
    @ Metal.MTL ~/.julia/packages/Metal/lnkVP/lib/mtl/compute_pipeline.jl:60
  [2] link(job::GPUCompiler.CompilerJob, compiled::@NamedTuple{image::Vector{UInt8}, entry::String}; return_function::Bool)
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:70
  [3] link(job::GPUCompiler.CompilerJob, compiled::@NamedTuple{image::Vector{UInt8}, entry::String})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:65
  [4] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2mJjc/src/execution.jl:132
  [5] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2mJjc/src/execution.jl:103
  [6] macro expansion
    @ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:162 [inlined]
  [7] macro expansion
    @ ./lock.jl:267 [inlined]
  [8] mtlfunction(f::typeof(vadd), tt::Type{Tuple{…}}; name::Nothing, kwargs::@Kwargs{})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:157
  [9] mtlfunction(f::typeof(vadd), tt::Type{Tuple{MtlDeviceVector{Int64, 1}, MtlDeviceVector{Int64, 1}, MtlDeviceVector{Int64, 1}}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:155
 [10] top-level scope
    @ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:77
 [11] top-level scope
    @ ~/.julia/packages/Metal/lnkVP/src/initialization.jl:57
Some type information was truncated. Use `show(err)` to see complete types.

jl_uPobCnqvOh.metallib.gz

I'd like to help out. Do you have a pointer to where I could start looking? Could it e.g. be an incompatibility between LLVM versions?

Julia 1.10:

source_filename = "start"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
target triple = "air64-apple-macosx14.2.1"

; Function Attrs: cold noreturn nounwind
declare void @llvm.trap() #0

define internal void @gpu_report_exception() unnamed_addr {
top:
  ret void
}

define internal void @gpu_signal_exception() unnamed_addr {
top:
  ret void
}

define void @_Z4vadd14MtlDeviceArrayI5Int64Li1ELi1EES_IS0_Li1ELi1EES_IS0_Li1ELi1EE({ i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %2, i32 %thread_position_in_grid) local_unnamed_addr {
conversion:
  %3 = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0, i64 0, i32 1, i64 0
  %.unpack10.unpack = load i64, i64 addrspace(1)* %3, align 8
  %4 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1 to i64 addrspace(1)* addrspace(1)*
  %.unpack16 = load i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %4, align 8
  %5 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %2 to i64 addrspace(1)* addrspace(1)*
  %.unpack20 = load i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %5, align 8
  %6 = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %2, i64 0, i32 1, i64 0
  %.unpack18.unpack = load i64, i64 addrspace(1)* %6, align 8
  %7 = add i32 %thread_position_in_grid, 1
  %8 = call i64 @air.max.s64(i64 %.unpack10.unpack, i64 0)
  %9 = icmp eq i32 %7, 0
  %10 = zext i32 %7 to i64
  %11 = icmp ult i64 %8, %10
  %12 = or i1 %9, %11
  br i1 %12, label %L20, label %L23

L20:                                              ; preds = %conversion
  call void @gpu_report_exception()
  call void @gpu_signal_exception()
  call void @llvm.trap()
  unreachable

L23:                                              ; preds = %conversion
  %13 = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1, i64 0, i32 1, i64 0
  %.unpack14.unpack = load i64, i64 addrspace(1)* %13, align 8
  %14 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0 to i64 addrspace(1)* addrspace(1)*
  %.unpack12 = load i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %14, align 8
  %15 = sext i32 %thread_position_in_grid to i64
  %16 = getelementptr inbounds i64, i64 addrspace(1)* %.unpack12, i64 %15
  %17 = load i64, i64 addrspace(1)* %16, align 8, !tbaa !20
  %18 = call i64 @air.max.s64(i64 %.unpack14.unpack, i64 0)
  %.not = icmp ult i64 %18, %10
  br i1 %.not, label %L46, label %L49

L46:                                              ; preds = %L23
  call void @gpu_report_exception()
  call void @gpu_signal_exception()
  call void @llvm.trap()
  unreachable

L49:                                              ; preds = %L23
  %19 = call i64 @air.max.s64(i64 %.unpack18.unpack, i64 0)
  %20 = icmp ult i64 %19, %10
  br i1 %20, label %L73, label %L76

L73:                                              ; preds = %L49
  call void @gpu_report_exception()
  call void @gpu_signal_exception()
  call void @llvm.trap()
  unreachable

L76:                                              ; preds = %L49
  %21 = getelementptr inbounds i64, i64 addrspace(1)* %.unpack16, i64 %15
  %22 = load i64, i64 addrspace(1)* %21, align 8, !tbaa !20
  %23 = add i64 %22, %17
  %24 = getelementptr inbounds i64, i64 addrspace(1)* %.unpack20, i64 %15
  store i64 %23, i64 addrspace(1)* %24, align 8, !tbaa !20
  ret void
}

declare i64 @air.max.s64(i64, i64) local_unnamed_addr

attributes #0 = { cold noreturn nounwind }

!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
!julia.kernel = !{!9}
!air.kernel = !{!10}
!llvm.ident = !{!17}
!air.version = !{!18}
!air.language_version = !{!19}

!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i32 7, !"air.max_device_buffers", i32 31}
!3 = !{i32 7, !"air.max_constant_buffers", i32 31}
!4 = !{i32 7, !"air.max_threadgroup_buffers", i32 31}
!5 = !{i32 7, !"air.max_textures", i32 128}
!6 = !{i32 7, !"air.max_read_write_textures", i32 8}
!7 = !{i32 7, !"air.max_samplers", i32 16}
!8 = !{i32 2, !"SDK Version", [3 x i32] [i32 14, i32 2, i32 1]}
!9 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, i32)* @_Z4vadd14MtlDeviceArrayI5Int64Li1ELi1EES_IS0_Li1ELi1EES_IS0_Li1ELi1EE}
!10 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, i32)* @_Z4vadd14MtlDeviceArrayI5Int64Li1ELi1EES_IS0_Li1ELi1EES_IS0_Li1ELi1EE, !11, !12}
!11 = !{}
!12 = !{!13, !14, !15, !16}
!13 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Int64, 1}", !"air.arg_name", !"a"}
!14 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Int64, 1}", !"air.arg_name", !"b"}
!15 = !{i32 2, !"air.buffer", !"air.location_index", i32 2, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Int64, 1}", !"air.arg_name", !"c"}
!16 = !{i32 3, !"air.thread_position_in_grid", !"air.arg_type_name", !"uint"}
!17 = !{!"Julia 1.10.0 with Metal.jl"}
!18 = !{i32 3, i32 0, i32 0}
!19 = !{!"Metal", i32 3, i32 0, i32 0}
!20 = !{!21, !21, i64 0, i64 0}
!21 = !{!"custom_tbaa_addrspace(1)", !22, i64 0}
!22 = !{!"custom_tbaa"}

Julia 1.9:

source_filename = "start"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
target triple = "air64-apple-macosx14.2.1"

; Function Attrs: cold noreturn nounwind
declare void @llvm.trap() #0

define internal void @gpu_report_exception() unnamed_addr {
top:
  ret void
}

define internal void @gpu_signal_exception() unnamed_addr {
top:
  ret void
}

define void @_Z4vadd14MtlDeviceArrayI5Int64Li1ELi1EES_IS0_Li1ELi1EES_IS0_Li1ELi1EE({ i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %2, i32 %thread_position_in_grid) local_unnamed_addr {
conversion:
  %3 = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0, i64 0, i32 1, i64 0
  %.unpack13.unpack = load i64, i64 addrspace(1)* %3, align 8
  %4 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1 to i64 addrspace(1)* addrspace(1)*
  %.unpack19 = load i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %4, align 8
  %5 = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1, i64 0, i32 1, i64 0
  %.unpack17.unpack = load i64, i64 addrspace(1)* %5, align 8
  %6 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %2 to i64 addrspace(1)* addrspace(1)*
  %.unpack23 = load i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %6, align 8
  %7 = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %2, i64 0, i32 1, i64 0
  %.unpack21.unpack = load i64, i64 addrspace(1)* %7, align 8
  %8 = add i32 %thread_position_in_grid, 1
  %9 = icmp sgt i64 %.unpack13.unpack, 0
  %10 = select i1 %9, i64 %.unpack13.unpack, i64 0
  %11 = icmp eq i32 %8, 0
  %12 = zext i32 %8 to i64
  %13 = icmp ult i64 %10, %12
  %14 = or i1 %11, %13
  br i1 %14, label %L20, label %L23

L20:                                              ; preds = %conversion
  call void @gpu_report_exception()
  call void @gpu_signal_exception()
  call void @llvm.trap()
  unreachable

L23:                                              ; preds = %conversion
  %15 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0 to i64 addrspace(1)* addrspace(1)*
  %.unpack15 = load i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %15, align 8
  %16 = sext i32 %thread_position_in_grid to i64
  %17 = getelementptr inbounds i64, i64 addrspace(1)* %.unpack15, i64 %16
  %18 = load i64, i64 addrspace(1)* %17, align 8, !tbaa !20
  %19 = icmp sgt i64 %.unpack17.unpack, 0
  %20 = select i1 %19, i64 %.unpack17.unpack, i64 0
  %.not = icmp ult i64 %20, %12
  br i1 %.not, label %L46, label %L49

L46:                                              ; preds = %L23
  call void @gpu_report_exception()
  call void @gpu_signal_exception()
  call void @llvm.trap()
  unreachable

L49:                                              ; preds = %L23
  %21 = icmp sgt i64 %.unpack21.unpack, 0
  %22 = select i1 %21, i64 %.unpack21.unpack, i64 0
  %23 = icmp ult i64 %22, %12
  br i1 %23, label %L73, label %L76

L73:                                              ; preds = %L49
  call void @gpu_report_exception()
  call void @gpu_signal_exception()
  call void @llvm.trap()
  unreachable

L76:                                              ; preds = %L49
  %24 = getelementptr inbounds i64, i64 addrspace(1)* %.unpack19, i64 %16
  %25 = load i64, i64 addrspace(1)* %24, align 8, !tbaa !20
  %26 = add i64 %25, %18
  %27 = getelementptr inbounds i64, i64 addrspace(1)* %.unpack23, i64 %16
  store i64 %26, i64 addrspace(1)* %27, align 8, !tbaa !20
  ret void
}

attributes #0 = { cold noreturn nounwind }

!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
!julia.kernel = !{!9}
!air.kernel = !{!10}
!llvm.ident = !{!17}
!air.version = !{!18}
!air.language_version = !{!19}

!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i32 7, !"air.max_device_buffers", i32 31}
!3 = !{i32 7, !"air.max_constant_buffers", i32 31}
!4 = !{i32 7, !"air.max_threadgroup_buffers", i32 31}
!5 = !{i32 7, !"air.max_textures", i32 128}
!6 = !{i32 7, !"air.max_read_write_textures", i32 8}
!7 = !{i32 7, !"air.max_samplers", i32 16}
!8 = !{i32 2, !"SDK Version", [3 x i32] [i32 14, i32 2, i32 1]}
!9 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, i32)* @_Z4vadd14MtlDeviceArrayI5Int64Li1ELi1EES_IS0_Li1ELi1EES_IS0_Li1ELi1EE}
!10 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, i32)* @_Z4vadd14MtlDeviceArrayI5Int64Li1ELi1EES_IS0_Li1ELi1EES_IS0_Li1ELi1EE, !11, !12}
!11 = !{}
!12 = !{!13, !14, !15, !16}
!13 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Int64, 1}", !"air.arg_name", !"a"}
!14 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Int64, 1}", !"air.arg_name", !"b"}
!15 = !{i32 2, !"air.buffer", !"air.location_index", i32 2, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Int64, 1}", !"air.arg_name", !"c"}
!16 = !{i32 3, !"air.thread_position_in_grid", !"air.arg_type_name", !"uint"}
!17 = !{!"Julia 1.9.4 with Metal.jl"}
!18 = !{i32 3, i32 0, i32 0}
!19 = !{!"Metal", i32 3, i32 0, i32 0}
!20 = !{!21, !21, i64 0, i64 0}
!21 = !{!"custom_tbaa_addrspace(1)", !22, i64 0}
!22 = !{!"custom_tbaa"}

The main difference is a call to @air.max.s64. Shouldn't the correct intrinsic be air.max.s.i64?

=> The bug seems to be in the intrinsic lowering logic in GPUCompiler.jl:
https://github.com/JuliaGPU/GPUCompiler.jl/blob/111685fc4fe692c2d632e9f9e3be938a8c1ff768/src/metal.jl#L897

Most interestingly, the @air.max.s64 intrinsic does still work on M-series chips. The final AGX output is the same, though.