calyxir/calyx

Unrecognised hardware primitive

Closed this issue · 2 comments

Hi there,

I am trying to lower some MLIR code using CIRCT and Calyx, with the lowering pipeline below:

$mlir-opt --lower-affine <input> -o <output>

$mlir-opt --scf-for-to-while <input> -o <output>

$circt-opt --lower-scf-to-calyx --canonicalize --verify-diagnostics <input> -o <output> 

$circt-translate -export-calyx --verify-diagnostics <input> -o $2/$3_calyx_naive.mlir

$calyx <input> --log info -p validate -p discover-external -p validate -x discover-external:default=DEPTH -x discover-external:strip-suffix=_ -o <output> 

$calyx -l $Calyx_PATH -b verilog --synthesis --disable-verify --nested <input> -o <output> 

$calyx -l $Calyx_PATH -b xilinx --synthesis --disable-verify --nested <input> -o <output> 

$calyx -l $Calyx_PATH -b xilinx-xml --synthesis --disable-verify --nested <input> -o <output>

The MLIR I am trying to lower is:

  // Reproducer for the issue. @main takes twelve i32 memrefs (all in memory
  // space 1) and runs four affine loop nests one after another. Each nest
  // accumulates a sum of rescaled products of two loaded values, adds one more
  // loaded value, and stores that sum if it is greater than zero, otherwise 0.
  func.func @main(%arg0: memref<784xi32, 1>, %arg1: memref<147xi32, 1>, %arg2: memref<3xi32, 1>, %arg3: memref<363xi32, 1>, %arg4: memref<375xi32, 1>, %arg5: memref<5xi32, 1>, %arg6: memref<245xi32, 1>, %arg7: memref<875xi32, 1>, %arg8: memref<7xi32, 1>, %arg9: memref<63xi32, 1>, %arg10: memref<630xi32, 1>, %arg11: memref<10xi32, 1>) {
    // Constants used to rescale each 64-bit product (32768 = 2^15, 65536 = 2^16).
    %c65536_i64 = arith.constant 65536 : i64
    %c2_i64 = arith.constant 2 : i64
    %c32768_i64 = arith.constant 32768 : i64
    %c0_i32 = arith.constant 0 : i32
    // Eight single-element scratch buffers; each nest uses one pair to hold the
    // two candidate values that feed its arith.select.
    %alloca = memref.alloca() : memref<1xi32>
    %alloca_0 = memref.alloca() : memref<1xi32>
    %alloca_1 = memref.alloca() : memref<1xi32>
    %alloca_2 = memref.alloca() : memref<1xi32>
    %alloca_3 = memref.alloca() : memref<1xi32>
    %alloca_4 = memref.alloca() : memref<1xi32>
    %alloca_5 = memref.alloca() : memref<1xi32>
    %alloca_6 = memref.alloca() : memref<1xi32>
    // Nest 1: reads %arg0, %arg1 and %arg2; writes %arg3.
    affine.for %arg12 = 0 to 3 {
      affine.for %arg13 = 0 to 11 {
        affine.for %arg14 = 0 to 11 {
          // 7x7 reduction carried through iter_args, starting from 0.
          %0 = affine.for %arg15 = 0 to 7 iter_args(%arg16 = %c0_i32) -> (i32) {
            %7 = affine.for %arg17 = 0 to 7 iter_args(%arg18 = %arg16) -> (i32) {
              %8 = affine.load %arg1[%arg15 * 21 + %arg12 + %arg17 * 3] : memref<147xi32, 1>
              %9 = arith.extsi %8 : i32 to i64
              %10 = affine.load %arg0[(%arg15 + %arg13 * 2) * 28 + %arg14 * 2 + %arg17] : memref<784xi32, 1>
              %11 = arith.extsi %10 : i32 to i64
              // Widen both operands to i64, multiply, then compute
              // (p / 32768) rem 2 + p / 65536 and truncate back to i32.
              // NOTE(review): presumably a fixed-point rescale by 2^16 with
              // rounding; confirm with whatever generated this IR.
              %12 = arith.muli %9, %11 : i64
              %13 = arith.divsi %12, %c32768_i64 : i64
              %14 = arith.remsi %13, %c2_i64 : i64
              %15 = arith.divsi %12, %c65536_i64 : i64
              %16 = arith.addi %14, %15 : i64
              %17 = arith.trunci %16 : i64 to i32
              %18 = arith.addi %arg18, %17 : i32
              affine.yield %18 : i32
            }
            affine.yield %7 : i32
          }
          // Add the value loaded from %arg2 to the accumulated sum.
          %1 = affine.load %arg2[%arg12] : memref<3xi32, 1>
          %2 = arith.addi %1, %0 : i32
          // Stash the sum and a zero in scratch buffers, reload them, and pick
          // the sum when it is > 0, else the zero.
          affine.store %2, %alloca[0] : memref<1xi32>
          affine.store %c0_i32, %alloca_0[0] : memref<1xi32>
          %3 = arith.cmpi sgt, %2, %c0_i32 : i32
          %4 = affine.load %alloca[0] : memref<1xi32>
          %5 = affine.load %alloca_0[0] : memref<1xi32>
          // This arith.select is the kind of op the issue text says CIRCT
          // lowers to a std_mux cell using a `sel` port.
          %6 = arith.select %3, %4, %5 : i32
          affine.store %6, %arg3[%arg13 * 33 + %arg12 + %arg14 * 3] : memref<363xi32, 1>
        }
      }
    }
    // Nest 2: reads %arg4, %arg3 and %arg5; writes %arg3.
    affine.for %arg12 = 0 to 5 {
      affine.for %arg13 = 0 to 7 {
        affine.for %arg14 = 0 to 7 {
          // 5x5x3 reduction carried through iter_args, starting from 0.
          %0 = affine.for %arg15 = 0 to 5 iter_args(%arg16 = %c0_i32) -> (i32) {
            %7 = affine.for %arg17 = 0 to 5 iter_args(%arg18 = %arg16) -> (i32) {
              %8 = affine.for %arg19 = 0 to 3 iter_args(%arg20 = %arg18) -> (i32) {
                %9 = affine.load %arg4[%arg15 * 75 + %arg12 + %arg17 * 15 + %arg19 * 5] : memref<375xi32, 1>
                %10 = arith.extsi %9 : i32 to i64
                %11 = affine.load %arg3[(%arg17 + %arg14) * 3 + (%arg15 + %arg13) * 33 + %arg19] : memref<363xi32, 1>
                %12 = arith.extsi %11 : i32 to i64
                // Same product rescale sequence as in nest 1.
                %13 = arith.muli %10, %12 : i64
                %14 = arith.divsi %13, %c32768_i64 : i64
                %15 = arith.remsi %14, %c2_i64 : i64
                %16 = arith.divsi %13, %c65536_i64 : i64
                %17 = arith.addi %15, %16 : i64
                %18 = arith.trunci %17 : i64 to i32
                %19 = arith.addi %arg20, %18 : i32
                affine.yield %19 : i32
              }
              affine.yield %8 : i32
            }
            affine.yield %7 : i32
          }
          %1 = affine.load %arg5[%arg12] : memref<5xi32, 1>
          %2 = arith.addi %1, %0 : i32
          affine.store %2, %alloca_1[0] : memref<1xi32>
          affine.store %c0_i32, %alloca_2[0] : memref<1xi32>
          %3 = arith.cmpi sgt, %2, %c0_i32 : i32
          %4 = affine.load %alloca_1[0] : memref<1xi32>
          %5 = affine.load %alloca_2[0] : memref<1xi32>
          %6 = arith.select %3, %4, %5 : i32
          // NOTE(review): this stores back into %arg3, the buffer this nest also
          // reads, with the same index expression as nest 1. The next nest reads
          // %arg6, which nothing in this function writes - confirm the intended
          // destination.
          affine.store %6, %arg3[%arg13 * 33 + %arg12 + %arg14 * 3] : memref<363xi32, 1>
        }
      }
    }
    // Nest 3: reads %arg7, %arg6 and %arg8; writes %arg3.
    affine.for %arg12 = 0 to 7 {
      affine.for %arg13 = 0 to 3 {
        affine.for %arg14 = 0 to 3 {
          // 5x5x5 reduction carried through iter_args, starting from 0.
          %0 = affine.for %arg15 = 0 to 5 iter_args(%arg16 = %c0_i32) -> (i32) {
            %7 = affine.for %arg17 = 0 to 5 iter_args(%arg18 = %arg16) -> (i32) {
              %8 = affine.for %arg19 = 0 to 5 iter_args(%arg20 = %arg18) -> (i32) {
                %9 = affine.load %arg7[%arg15 * 175 + %arg12 + %arg17 * 35 + %arg19 * 7] : memref<875xi32, 1>
                %10 = arith.extsi %9 : i32 to i64
                %11 = affine.load %arg6[(%arg17 + %arg14) * 5 + (%arg15 + %arg13) * 35 + %arg19] : memref<245xi32, 1>
                %12 = arith.extsi %11 : i32 to i64
                // Same product rescale sequence as in nest 1.
                %13 = arith.muli %10, %12 : i64
                %14 = arith.divsi %13, %c32768_i64 : i64
                %15 = arith.remsi %14, %c2_i64 : i64
                %16 = arith.divsi %13, %c65536_i64 : i64
                %17 = arith.addi %15, %16 : i64
                %18 = arith.trunci %17 : i64 to i32
                %19 = arith.addi %arg20, %18 : i32
                affine.yield %19 : i32
              }
              affine.yield %8 : i32
            }
            affine.yield %7 : i32
          }
          %1 = affine.load %arg8[%arg12] : memref<7xi32, 1>
          %2 = arith.addi %1, %0 : i32
          affine.store %2, %alloca_3[0] : memref<1xi32>
          affine.store %c0_i32, %alloca_4[0] : memref<1xi32>
          %3 = arith.cmpi sgt, %2, %c0_i32 : i32
          %4 = affine.load %alloca_3[0] : memref<1xi32>
          %5 = affine.load %alloca_4[0] : memref<1xi32>
          %6 = arith.select %3, %4, %5 : i32
          // NOTE(review): again stores into %arg3; the next nest reads %arg9,
          // which nothing in this function writes - confirm the intended
          // destination.
          affine.store %6, %arg3[%arg13 * 33 + %arg12 + %arg14 * 3] : memref<363xi32, 1>
        }
      }
    }
    // Nest 4: reads %arg10, %arg9 and %arg11; writes %arg3[%arg12].
    affine.for %arg12 = 0 to 10 {
      // 3x3x7 reduction carried through iter_args, starting from 0.
      %0 = affine.for %arg13 = 0 to 3 iter_args(%arg14 = %c0_i32) -> (i32) {
        %7 = affine.for %arg15 = 0 to 3 iter_args(%arg16 = %arg14) -> (i32) {
          %8 = affine.for %arg17 = 0 to 7 iter_args(%arg18 = %arg16) -> (i32) {
            %9 = affine.load %arg10[%arg13 * 210 + %arg12 + %arg15 * 70 + %arg17 * 10] : memref<630xi32, 1>
            %10 = arith.extsi %9 : i32 to i64
            %11 = affine.load %arg9[%arg15 * 7 + %arg13 * 21 + %arg17] : memref<63xi32, 1>
            %12 = arith.extsi %11 : i32 to i64
            // Same product rescale sequence as in nest 1.
            %13 = arith.muli %10, %12 : i64
            %14 = arith.divsi %13, %c32768_i64 : i64
            %15 = arith.remsi %14, %c2_i64 : i64
            %16 = arith.divsi %13, %c65536_i64 : i64
            %17 = arith.addi %15, %16 : i64
            %18 = arith.trunci %17 : i64 to i32
            %19 = arith.addi %arg18, %18 : i32
            affine.yield %19 : i32
          }
          affine.yield %8 : i32
        }
        affine.yield %7 : i32
      }
      %1 = affine.load %arg11[%arg12] : memref<10xi32, 1>
      %2 = arith.addi %1, %0 : i32
      affine.store %2, %alloca_5[0] : memref<1xi32>
      affine.store %c0_i32, %alloca_6[0] : memref<1xi32>
      %3 = arith.cmpi sgt, %2, %c0_i32 : i32
      %4 = affine.load %alloca_5[0] : memref<1xi32>
      %5 = affine.load %alloca_6[0] : memref<1xi32>
      %6 = arith.select %3, %4, %5 : i32
      // The final results land in the first 10 elements of %arg3.
      affine.store %6, %arg3[%arg12] : memref<363xi32, 1>
    }
    return
  }

Calyx reports an error complaining that std_mux.sel is not acceptable when the pipeline reaches the following command:

$calyx <input> --log info -p validate -p discover-external -p validate -x discover-external:default=DEPTH -x discover-external:strip-suffix=_ -o <output> 

Looking at the hardware primitive files, std_mux does not have a sel port, but it does have a cond port. I am not sure whether they are equivalent. I know the sel port comes from the ... = arith.select ... operations in the MLIR, and I noticed that CIRCT has an example converting arith.select to std_mux_0.sel. Does this mean it cannot be lowered in Calyx?

Looking forward to your reply. Many thanks!

Looks like the primitive definition in CIRCT uses sel while the Calyx primitive uses the cond port. Should be an easy fix in the CIRCT repo.

@zzy666666zzy should be fixed on the CIRCT side now!