Unrecognised hardware primitive
Closed this issue · 2 comments
Hi there,
I am trying to lower MLIR code using CIRCT and Calyx with the lowering pipeline below:
$mlir-opt --lower-affine <input> -o <output>
$mlir-opt --scf-for-to-while <input> -o <output>
$circt-opt --lower-scf-to-calyx --canonicalize --verify-diagnostics <input> -o <output>
$circt-translate -export-calyx --verify-diagnostics <input> -o $2/$3_calyx_naive.mlir
$calyx <input> --log info -p validate -p discover-external -p validate -x discover-external:default=DEPTH -x discover-external:strip-suffix=_ -o <output>
$calyx -l $Calyx_PATH -b verilog --synthesis --disable-verify --nested <input> -o <output>
$calyx -l $Calyx_PATH -b xilinx --synthesis --disable-verify --nested <input> -o <output>
$calyx -l $Calyx_PATH -b xilinx-xml --synthesis --disable-verify --nested <input> -o <output>
The MLIR I am trying to lower is
// Reproducer: four consecutive affine loop nests. Each nest accumulates a sum
// of scaled products over its inner loops, adds a per-channel value, clamps the
// result at zero through arith.select, and stores it.
// NOTE(review): buffer sizes (784 = 28*28, 363 = 11*11*3, 245 = 7*7*5,
// 63 = 3*3*7) look like flattened feature maps of a small conv net — this is an
// assumption; confirm against the generator that produced this IR.
func.func @main(%arg0: memref<784xi32, 1>, %arg1: memref<147xi32, 1>, %arg2: memref<3xi32, 1>, %arg3: memref<363xi32, 1>, %arg4: memref<375xi32, 1>, %arg5: memref<5xi32, 1>, %arg6: memref<245xi32, 1>, %arg7: memref<875xi32, 1>, %arg8: memref<7xi32, 1>, %arg9: memref<63xi32, 1>, %arg10: memref<630xi32, 1>, %arg11: memref<10xi32, 1>) {
// Constants shared by every nest: divisors/modulus for scaling the 64-bit
// product, and the i32 zero used as accumulator seed and clamp floor.
%c65536_i64 = arith.constant 65536 : i64
%c2_i64 = arith.constant 2 : i64
%c32768_i64 = arith.constant 32768 : i64
%c0_i32 = arith.constant 0 : i32
// Eight single-element scratch buffers, used in pairs (one pair per nest) to
// hold the two candidate values fed to arith.select.
%alloca = memref.alloca() : memref<1xi32>
%alloca_0 = memref.alloca() : memref<1xi32>
%alloca_1 = memref.alloca() : memref<1xi32>
%alloca_2 = memref.alloca() : memref<1xi32>
%alloca_3 = memref.alloca() : memref<1xi32>
%alloca_4 = memref.alloca() : memref<1xi32>
%alloca_5 = memref.alloca() : memref<1xi32>
%alloca_6 = memref.alloca() : memref<1xi32>
// ---- Nest 1: reads %arg1 and %arg0, adds %arg2[%arg12], writes %arg3. ----
affine.for %arg12 = 0 to 3 {
affine.for %arg13 = 0 to 11 {
affine.for %arg14 = 0 to 11 {
// 7x7 reduction; the i32 accumulator starts at 0 and is threaded through
// both loops via iter_args.
%0 = affine.for %arg15 = 0 to 7 iter_args(%arg16 = %c0_i32) -> (i32) {
%7 = affine.for %arg17 = 0 to 7 iter_args(%arg18 = %arg16) -> (i32) {
%8 = affine.load %arg1[%arg15 * 21 + %arg12 + %arg17 * 3] : memref<147xi32, 1>
%9 = arith.extsi %8 : i32 to i64
// %arg13 and %arg14 are scaled by 2 in this index, rows are 28 elements apart.
%10 = affine.load %arg0[(%arg15 + %arg13 * 2) * 28 + %arg14 * 2 + %arg17] : memref<784xi32, 1>
%11 = arith.extsi %10 : i32 to i64
// Widened product p, then (p / 32768) rem 2 + p / 65536, truncated to i32.
// NOTE(review): presumably a Q16 fixed-point multiply with rounding — confirm.
%12 = arith.muli %9, %11 : i64
%13 = arith.divsi %12, %c32768_i64 : i64
%14 = arith.remsi %13, %c2_i64 : i64
%15 = arith.divsi %12, %c65536_i64 : i64
%16 = arith.addi %14, %15 : i64
%17 = arith.trunci %16 : i64 to i32
%18 = arith.addi %arg18, %17 : i32
affine.yield %18 : i32
}
affine.yield %7 : i32
}
%1 = affine.load %arg2[%arg12] : memref<3xi32, 1>
%2 = arith.addi %1, %0 : i32
// Stash the sum and a zero, then select between them on (sum > 0): the stored
// value is the sum when positive, otherwise 0.
affine.store %2, %alloca[0] : memref<1xi32>
affine.store %c0_i32, %alloca_0[0] : memref<1xi32>
%3 = arith.cmpi sgt, %2, %c0_i32 : i32
%4 = affine.load %alloca[0] : memref<1xi32>
%5 = affine.load %alloca_0[0] : memref<1xi32>
// arith.select is the op the issue text ties to the std_mux `sel` port.
%6 = arith.select %3, %4, %5 : i32
affine.store %6, %arg3[%arg13 * 33 + %arg12 + %arg14 * 3] : memref<363xi32, 1>
}
}
}
// ---- Nest 2: reads %arg4 and %arg3, adds %arg5[%arg12], writes %arg3. ----
affine.for %arg12 = 0 to 5 {
affine.for %arg13 = 0 to 7 {
affine.for %arg14 = 0 to 7 {
// 5x5x3 reduction with the same accumulate-and-scale body as nest 1.
%0 = affine.for %arg15 = 0 to 5 iter_args(%arg16 = %c0_i32) -> (i32) {
%7 = affine.for %arg17 = 0 to 5 iter_args(%arg18 = %arg16) -> (i32) {
%8 = affine.for %arg19 = 0 to 3 iter_args(%arg20 = %arg18) -> (i32) {
%9 = affine.load %arg4[%arg15 * 75 + %arg12 + %arg17 * 15 + %arg19 * 5] : memref<375xi32, 1>
%10 = arith.extsi %9 : i32 to i64
%11 = affine.load %arg3[(%arg17 + %arg14) * 3 + (%arg15 + %arg13) * 33 + %arg19] : memref<363xi32, 1>
%12 = arith.extsi %11 : i32 to i64
%13 = arith.muli %10, %12 : i64
%14 = arith.divsi %13, %c32768_i64 : i64
%15 = arith.remsi %14, %c2_i64 : i64
%16 = arith.divsi %13, %c65536_i64 : i64
%17 = arith.addi %15, %16 : i64
%18 = arith.trunci %17 : i64 to i32
%19 = arith.addi %arg20, %18 : i32
affine.yield %19 : i32
}
affine.yield %8 : i32
}
affine.yield %7 : i32
}
%1 = affine.load %arg5[%arg12] : memref<5xi32, 1>
%2 = arith.addi %1, %0 : i32
affine.store %2, %alloca_1[0] : memref<1xi32>
affine.store %c0_i32, %alloca_2[0] : memref<1xi32>
%3 = arith.cmpi sgt, %2, %c0_i32 : i32
%4 = affine.load %alloca_1[0] : memref<1xi32>
%5 = affine.load %alloca_2[0] : memref<1xi32>
%6 = arith.select %3, %4, %5 : i32
// NOTE(review): this stores into %arg3, the same buffer this nest reads, using
// nest 1's index expression; %arg6 (read by nest 3) is never written in this
// function. Possibly the destination was meant to be %arg6 — confirm upstream.
affine.store %6, %arg3[%arg13 * 33 + %arg12 + %arg14 * 3] : memref<363xi32, 1>
}
}
}
// ---- Nest 3: reads %arg7 and %arg6, adds %arg8[%arg12], writes %arg3. ----
affine.for %arg12 = 0 to 7 {
affine.for %arg13 = 0 to 3 {
affine.for %arg14 = 0 to 3 {
// 5x5x5 reduction with the same accumulate-and-scale body as nest 1.
%0 = affine.for %arg15 = 0 to 5 iter_args(%arg16 = %c0_i32) -> (i32) {
%7 = affine.for %arg17 = 0 to 5 iter_args(%arg18 = %arg16) -> (i32) {
%8 = affine.for %arg19 = 0 to 5 iter_args(%arg20 = %arg18) -> (i32) {
%9 = affine.load %arg7[%arg15 * 175 + %arg12 + %arg17 * 35 + %arg19 * 7] : memref<875xi32, 1>
%10 = arith.extsi %9 : i32 to i64
%11 = affine.load %arg6[(%arg17 + %arg14) * 5 + (%arg15 + %arg13) * 35 + %arg19] : memref<245xi32, 1>
%12 = arith.extsi %11 : i32 to i64
%13 = arith.muli %10, %12 : i64
%14 = arith.divsi %13, %c32768_i64 : i64
%15 = arith.remsi %14, %c2_i64 : i64
%16 = arith.divsi %13, %c65536_i64 : i64
%17 = arith.addi %15, %16 : i64
%18 = arith.trunci %17 : i64 to i32
%19 = arith.addi %arg20, %18 : i32
affine.yield %19 : i32
}
affine.yield %8 : i32
}
affine.yield %7 : i32
}
%1 = affine.load %arg8[%arg12] : memref<7xi32, 1>
%2 = arith.addi %1, %0 : i32
affine.store %2, %alloca_3[0] : memref<1xi32>
affine.store %c0_i32, %alloca_4[0] : memref<1xi32>
%3 = arith.cmpi sgt, %2, %c0_i32 : i32
%4 = affine.load %alloca_3[0] : memref<1xi32>
%5 = affine.load %alloca_4[0] : memref<1xi32>
%6 = arith.select %3, %4, %5 : i32
// NOTE(review): again stored to %arg3 with nest 1's index expression; %arg9
// (read by nest 4) is never written in this function — confirm intent.
affine.store %6, %arg3[%arg13 * 33 + %arg12 + %arg14 * 3] : memref<363xi32, 1>
}
}
}
// ---- Nest 4: reads %arg10 and %arg9, adds %arg11[%arg12], writes %arg3. ----
affine.for %arg12 = 0 to 10 {
// 3x3x7 reduction with the same accumulate-and-scale body as nest 1.
%0 = affine.for %arg13 = 0 to 3 iter_args(%arg14 = %c0_i32) -> (i32) {
%7 = affine.for %arg15 = 0 to 3 iter_args(%arg16 = %arg14) -> (i32) {
%8 = affine.for %arg17 = 0 to 7 iter_args(%arg18 = %arg16) -> (i32) {
%9 = affine.load %arg10[%arg13 * 210 + %arg12 + %arg15 * 70 + %arg17 * 10] : memref<630xi32, 1>
%10 = arith.extsi %9 : i32 to i64
%11 = affine.load %arg9[%arg15 * 7 + %arg13 * 21 + %arg17] : memref<63xi32, 1>
%12 = arith.extsi %11 : i32 to i64
%13 = arith.muli %10, %12 : i64
%14 = arith.divsi %13, %c32768_i64 : i64
%15 = arith.remsi %14, %c2_i64 : i64
%16 = arith.divsi %13, %c65536_i64 : i64
%17 = arith.addi %15, %16 : i64
%18 = arith.trunci %17 : i64 to i32
%19 = arith.addi %arg18, %18 : i32
affine.yield %19 : i32
}
affine.yield %8 : i32
}
affine.yield %7 : i32
}
%1 = affine.load %arg11[%arg12] : memref<10xi32, 1>
%2 = arith.addi %1, %0 : i32
affine.store %2, %alloca_5[0] : memref<1xi32>
affine.store %c0_i32, %alloca_6[0] : memref<1xi32>
%3 = arith.cmpi sgt, %2, %c0_i32 : i32
%4 = affine.load %alloca_5[0] : memref<1xi32>
%5 = affine.load %alloca_6[0] : memref<1xi32>
%6 = arith.select %3, %4, %5 : i32
// The ten results land in the first ten elements of %arg3.
affine.store %6, %arg3[%arg12] : memref<363xi32, 1>
}
return
}
The error complains that std_mux.sel
is not accepted when running the command
$calyx <input> --log info -p validate -p discover-external -p validate -x discover-external:default=DEPTH -x discover-external:strip-suffix=_ -o <output>
While looking at the hardware primitive files, std_mux
does not contain a sel
port, but has a cond
port. I am not sure if they are equivalent. I know sel
port is from ... = arith.select ...
in the MLIR. And I notice CIRCT has an example converting arith.select
to std_mux_0.sel
. But it cannot be lowered in Calyx?
Looking forward to your reply. Many thanks!
Looks like the primitive definition in CIRCT uses sel
while the calyx primitive uses the cond
port. Should be an easy fix in the CIRCT repo.
@zzy666666zzy should be fixed on the CIRCT side now!