
Floating point arithmetic issue related to reduction loop

For a program with a reduction loop, the result computed in float32 is nondeterministic: it varies across different runs.

To Reproduce the Issue

Below is an example of hcl.sum() that sums up the two elements of a vector.

import ctypes
import numpy as np
import os

import mlir.all_passes_registration

from mlir import ir
from mlir import runtime as rt
from mlir import execution_engine
from mlir import passmanager

from mlir.dialects import builtin

class Compiler:
    def __init__(self):
        self.pipeline = (
    def __call__(self, module: ir.Module):

def code():
    return f"""
module  {{ "private" @gv0 : memref<2xf32> = dense<[1.0, 2.0]>
  func @top(%arg0: memref<2xf32>) -> memref<1xf32> {{
    //%0 = hcl.create_stage_handle "sum" : !hcl.StageHandle
    %1 = memref.alloc() : memref<1xf32>
    //%2 = hcl.create_loop_handle "_" : !hcl.LoopHandle
    affine.for %arg1 = 0 to 1 {{
      %3 = memref.alloc() : memref<1xf32>
      affine.for %arg2 = 0 to 2 {{
        %5 = affine.load %arg0[%arg2] : memref<2xf32>
        %c0_0 = constant 0 : index
        %6 = affine.load %3[%c0_0] : memref<1xf32>
        %7 = addf %5, %6 : f32 %7, %3[%c0_0] : memref<1xf32>
      }} {{loop_name = "x"}}
      %c0 = constant 0 : index
      %4 = affine.load %3[%c0] : memref<1xf32> %4, %1[%arg1] : memref<1xf32>
    }} {{loop_name = "_", stage_name = "sum"}}
    return %1 : memref<1xf32>

  func @main(%0 : memref<2xf32>) -> memref<1xf32> attributes {{ llvm.emit_c_interface }} {{
    %1 = call @top(%0) : (memref<2xf32>) -> (memref<1xf32>)
    %U = memref.cast %1 : memref<1xf32> to memref<*xf32>
    call @print_memref_f32(%U) : (memref<*xf32>) -> ()
    return %1 : memref<1xf32>
  func private @print_memref_f32(memref<*xf32>) attributes {{ llvm.emit_c_interface }}

def main():
  # Change these support library paths to match your installation.
  # We need these support libraries for @print_memref_f32.
    support_lib = [

    with ir.Context() as ctx, ir.Location.unknown():
        compiler = Compiler()
        module = ir.Module.parse(code())
        engine = execution_engine.ExecutionEngine(module, opt_level=0, shared_libs=support_lib)
        a = np.array([1, 2], np.float32)
        b = np.zeros((1,), np.float32)
        mem_a  = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(a)))
        mem_b = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(b)))
        engine.invoke('main', mem_b, mem_a)
        out = rt.ranked_memref_to_numpy(mem_b[0])

if __name__ == "__main__":

Across multiple runs, the result printed out is:

(mlir) $ python issues/ 
Unranked Memref base@ = 0x562199587810 rank = 1 offset = 0 sizes = [1] strides = [1] data = 
(mlir) $ python issues/ 
Unranked Memref base@ = 0x5619f1bb3f20 rank = 1 offset = 0 sizes = [1] strides = [1] data = 
(mlir) $ python issues/ 
Unranked Memref base@ = 0x55ff3a092ab0 rank = 1 offset = 0 sizes = [1] strides = [1] data = 

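This issue is caused by the single-element memref allocated just above the reduction loop (%3 in the example) not being initialized to zero: the first iteration loads from it before anything has been stored, so the accumulation starts from whatever value happens to be in the newly allocated memory. Fixed by d4ce005.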