cornell-zhang/hcl-dialect

[Pass] Incorrect operation sinking in loop fusion

Closed this issue · 0 comments

The following test program generates incorrect code.

def test_schedule_intra_stage():
    """Reproducer: fuse the two outer loops of stage "C", then lower and print the IR."""

    hcl.init()
    # For every (x, y), B[x, y] accumulates the 32 bits of A[x, y].
    def popcount(A, B): # each element in A is a 32-bit integer
        with hcl.for_(0, A.shape[0], tag="C") as x:
            with hcl.for_(0, A.shape[1]) as y:
                # Accumulator reset: this store sits outside the innermost
                # (bit) loop but still indexes B with x and y.
                B[x, y] = 0
                with hcl.for_(0, 32) as i:
                    # A[x, y][i] selects bit i of the element.
                    B[x, y] += A[x, y][i]

    A = hcl.placeholder((10, 20))
    B = hcl.placeholder(A.shape)

    def test_fuse():
        s = hcl.create_schedule([A, B], popcount)
        C = popcount.C
        # Fuse the first two loop axes of stage C (the x and y loops) into one.
        s[C].fuse(C.axis[0], C.axis[1])
        # Lowering the fused schedule is the step that produces the faulty IR.
        ir = hcl.lower(s)
        print(ir)
error: operand #2 does not dominate this use
// Verification failed, printing generic form
#map0 = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0)[s0] -> (d0 mod s0)>
#map2 = affine_map<(d0)[s0] -> (d0 floordiv s0)>
#map3 = affine_map<() -> (0)>
#map4 = affine_map<() -> (32)>
#map5 = affine_map<() -> (200)>
"builtin.module"() ({
  "builtin.func"() ({
  ^bb0(%arg0: memref<10x20xi32>, %arg1: memref<10x20xi32>):
    %0 = "arith.constant"() {value = 10 : index} : () -> index
    %1 = "arith.constant"() {value = 20 : index} : () -> index
    "affine.for"() ({
    ^bb0(%arg2: index):
      %2 = "arith.constant"() {value = 0 : i32} : () -> i32
      "affine.store"(%2, %arg1, %4, %3) {map = #map0, to = "compute_1"} : (i32, memref<10x20xi32>, index, index) -> ()
      "affine.for"() ({
      ^bb0(%arg3: index):
        %3 = "affine.apply"(%arg2, %1) {map = #map1} : (index, index) -> index
        %4 = "affine.apply"(%arg2, %1) {map = #map2} : (index, index) -> index
        %5 = "affine.load"(%arg1, %4, %3) {from = "compute_1", map = #map0} : (memref<10x20xi32>, index, index) -> i32
        %6 = "affine.load"(%arg0, %4, %3) {from = "compute_0", map = #map0} : (memref<10x20xi32>, index, index) -> i32
        %7 = "hcl.get_bit"(%6, %arg3) : (i32, index) -> i1
        %8 = "arith.extui"(%7) : (i1) -> i32
        %9 = "arith.addi"(%5, %8) : (i32, i32) -> i32
        "affine.store"(%9, %arg1, %4, %3) {map = #map0, to = "compute_1"} : (i32, memref<10x20xi32>, index, index) -> ()
        "affine.yield"() : () -> ()
      }) {loop_name = "loop_2", lower_bound = #map3, step = 1 : i32, upper_bound = #map4} : () -> ()
      "affine.yield"() : () -> ()
    }) {loop_name = "loop_0_loop_1_fused", lower_bound = #map3, stage_name = "C", step = 1 : i32, upper_bound = #map5} : () -> ()
    "std.return"() : () -> ()
  }) {bit, extra_itypes = "ss", extra_otypes = "", sym_name = "top", type = (memref<10x20xi32>, memref<10x20xi32>) -> ()} : () -> ()
}) : () -> ()

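The affine.apply operations should not have been sunk into the innermost loop: the affine.store that precedes that loop uses their results (%4 and %3), which is why verification fails with the dominance error.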