cornell-zhang/heterocl

Module not executed with option `--lower-composite`

sqPoseidon opened this issue · 0 comments

In some cases, the kernels are not executed when called from the main module. I am not sure which part of the lowering pipeline causes this.
I have two kernels, bn1.mlir and relu.mlir, which are called from main.mlir. To narrow the problem down, I removed the actual computation from both kernels so that each one only prints an array.

bn1.mlir:

// Reproducer kernel `bn1`: the function body below only prints the global
// @gv0 and returns a freshly allocated buffer.
module {
  // 4x4 f32 table (every row is 1.0, 2.0, 3.0, 4.0); it is the value passed to the print call in @bn1.
  memref.global "private" @gv0 : memref<4x4xf32> = dense<[[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]]>
  // Declaration only (no body in this module).
  // NOTE(review): presumably resolved at link time from the runner-utils libraries — confirm.
  func.func private @printMemrefF32(%ptr : memref<*xf32>)
  // Constant 64-element i8 globals (all ones / all zeros). They are fetched in @bn1 but the fetched values have no uses.
  memref.global "private" constant @weight : memref<64xi8> = dense<1>
  memref.global "private" constant @bias : memref<64xi8> = dense<0>
  // Entry point of this module: takes a 1x64x112x112 f32 memref and returns a 1x64x112x112 i8 memref.
  // %arg0 is not read anywhere in the body.
  func.func @bn1(%arg0: memref<1x64x112x112xf32>) -> memref<1x64x112x112xi8> attributes {itypes = "_", llvm.emit_c_interface, otypes = "s", top} {
    // %c0, %0 and %1 are defined here but have no uses in this function.
    %c0 = arith.constant 0 : index
    %0 = memref.get_global @weight : memref<64xi8>
    %1 = memref.get_global @bias : memref<64xi8>
    // Result buffer; nothing in this function writes to it before it is returned.
    %2 = memref.alloc() {name = "BatchNorm2d"} : memref<1x64x112x112xi8>
     // Fetch @gv0, cast it to an unranked memref to match @printMemrefF32's signature, and call the print function.
     // The `hcl.print` line below is commented out and is not part of the IR.
     %temp = memref.get_global @gv0 : memref<4x4xf32>
     // hcl.print(%0) : memref<4x4xf32>
     %casted = memref.cast %temp : memref<4x4xf32> to memref<*xf32>
     func.call @printMemrefF32(%casted) : (memref<*xf32>) -> ()
    return %2 : memref<1x64x112x112xi8>
  }
}

relu.mlir

// Reproducer kernel `relu`: the function body below only prints the global
// @gv0 and returns a freshly allocated buffer.
module {
  // 4x4 f32 table (every row is 1.0, 2.0, 3.0, 4.0); it is the value passed to the print call in @relu.
  memref.global "private" @gv0 : memref<4x4xf32> = dense<[[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]]>
  // Declaration only (no body in this module).
  // NOTE(review): presumably resolved at link time from the runner-utils libraries — confirm.
  func.func private @printMemrefF32(%ptr : memref<*xf32>)
  // Entry point of this module: takes and returns a 1x64x112x112 i8 memref.
  // %arg0 is not read anywhere in the body.
  func.func @relu(%arg0: memref<1x64x112x112xi8>) -> memref<1x64x112x112xi8> attributes {itypes = "s", llvm.emit_c_interface, otypes = "s", top} {
    // %c0 is defined but has no uses in this function.
    %c0 = arith.constant 0 : index
    // Result buffer; nothing in this function writes to it before it is returned.
    %0 = memref.alloc() {name = "ReLU"} : memref<1x64x112x112xi8>
     // Fetch @gv0, cast it to an unranked memref to match @printMemrefF32's signature, and call the print function.
     // The `hcl.print` line below is commented out and is not part of the IR.
     %temp = memref.get_global @gv0 : memref<4x4xf32>
     // hcl.print(%0) : memref<4x4xf32>
     %casted = memref.cast %temp : memref<4x4xf32> to memref<*xf32>
     func.call @printMemrefF32(%casted) : (memref<*xf32>) -> ()
    return %0 : memref<1x64x112x112xi8>
  }
}

main.mlir

// Driver module: declares the two kernels and calls them once each from @main.
module {
  // Body-less declarations of the kernels; their definitions are in bn1.mlir and relu.mlir.
  func.func private @bn1(memref<1x64x112x112xf32>) -> memref<1x64x112x112xi8>
  func.func private @relu(memref<1x64x112x112xi8>) -> memref<1x64x112x112xi8>
  // Takes no arguments and returns no value.
  func.func @main() {
    // Input buffer for @bn1; nothing in this function stores to it before the call.
    %0 = memref.alloc() : memref<1x64x112x112xf32>
    // The surrounding `affine.for` is commented out, so the two calls below run once.
    // affine.for %arg = 0 to 20 {
        // Call @bn1 and feed its result to @relu. The result of @relu (%2) has no uses.
        %1 = func.call @bn1(%0) : (memref<1x64x112x112xf32>) -> memref<1x64x112x112xi8>
        %2 = func.call @relu(%1) : (memref<1x64x112x112xi8>) -> memref<1x64x112x112xi8>
    // }
    return
  }
}

compile.sh

# Lower bn1.mlir, relu.mlir and main.mlir to object files, link them into
# ./x86-app, run the binary and report its wall-clock time in milliseconds.
set -x

# Pass pipeline handed to hcl-opt. The commented-out variant is the same
# pipeline without --lower-composite, kept for comparison.
# export HCL_OPT_FLAGS=" --fixed-to-integer --verify-each --verify-diagnostics --allow-unregistered-dialect --preload-dialects-in-context --lower-bitops --lower-to-llvm"
export HCL_OPT_FLAGS=" --fixed-to-integer --verify-each --verify-diagnostics --allow-unregistered-dialect --preload-dialects-in-context --lower-composite --lower-bitops --lower-to-llvm"

# Directory searched for the mlir_c_runner_utils / mlir_runner_utils libraries.
# Defaults to the original hard-coded location; set MLIR_RUNNER_LIB_DIR in the
# environment to build on another machine.
MLIR_RUNNER_LIB_DIR="${MLIR_RUNNER_LIB_DIR:-/work/shared/users/staff/qs228/MLIR/heterocl-mlir/hcl-dialect/llvm-project/build/lib/}"

# Same pipeline for every module: hcl-opt -> LLVM IR -> opt -O3 -> object file.
# $HCL_OPT_FLAGS is intentionally unquoted so it splits into separate options.
for module in bn1 relu main; do
    hcl-opt "$module.mlir" $HCL_OPT_FLAGS | mlir-translate -mlir-to-llvmir | opt -O3 -S | llc -O3 -filetype=obj -o "$module.o"
done

# Stop if linking fails; otherwise the steps below would time a missing or
# stale ./x86-app from an earlier build.
clang main.o bn1.o relu.o --target=x86-64 -fPIE \
    -L"$MLIR_RUNNER_LIB_DIR" \
    -lmlir_c_runner_utils \
    -lmlir_runner_utils \
    -o x86-app || { echo "Link failed." >&2; exit 1; }

echo "Compilation Done."
# Timestamps in microseconds since the epoch (%N is a GNU date extension).
start=$(date +%s%6N)
./x86-app # 2>&1 | tee temp.log
end=$(date +%s%6N)
# Microseconds -> milliseconds, three decimal places.
runtime=$(echo "scale=3; ($end - $start) / 1000" | bc)
echo "X86 Execution Done."
echo "Simulation Time: $runtime ms."

If the two kernels (bn1 and relu) are actually executed, each of them prints the array. With the `--lower-composite` option enabled, nothing is printed at all.