seriyps/rebar3_bench

Try to optimize runner's tight inner loop

seriyps opened this issue · 3 comments

%% Inner tight loop
do_run_n(_, _, 0) ->
ok;
do_run_n(F, St, N) ->
F(St),
do_run_n(F, St, N - 1).

Should find if it's possible to somehow minimize the amount of work done by the loop.

On OTP22 seems changing the order of arguments doesn't reduce amount of opcodes in the function, but changes opcodes itself and amount of arguments to those opcodes:

do_run_n(_, _, 0) ->
    ok;
do_run_n(St, F, N) ->
    F(St),
    do_run_n(St, F, N - 1).

{function, do_run_n, 3, 4}.
  {label,3}.
    {line,[{location,"main.erl",14}]}.
    {func_info,{atom,main},{atom,do_run_n},3}.
  {label,4}.
    {'%',{type_info,{x,2},number}}.
    {test,is_eq_exact,{f,5},[{x,2},{integer,0}]}.
    {move,{atom,ok},{x,0}}.
    return.
  {label,5}.
    {allocate,3,3}.
    {move,{x,2},{y,0}}.
    {move,{x,1},{y,1}}.
    {move,{x,0},{y,2}}.
    {move,{y,1},{x,1}}.
    {line,[{location,"main.erl",17}]}.
    {call_fun,1}.
    {line,[{location,"main.erl",18}]}.
    {gc_bif,'-',{f,0},0,[{y,0},{integer,1}],{x,2}}.
    {move,{y,1},{x,1}}.
    {move,{y,2},{x,0}}.
    {call_last,3,{f,4},3}.

00007F6EA37CB398: i_func_info_IaaI 0 main do_run_n 3 
00007F6EA37CB3C0: i_is_eq_exact_immed_fxc f(00007F6EA37CB3E8) x(2) 0 
00007F6EA37CB3D8: move_return_c ok 
00007F6EA37CB3E8: allocate_tt 3 3 
00007F6EA37CB3F8: move_window3_xxxy x(2) x(1) x(0) y(0) 
00007F6EA37CB408: move_yx y(1) x(1) 
00007F6EA37CB418: i_call_fun_t 1 
00007F6EA37CB428: i_increment_yWd y(0) -1 x(2) 
00007F6EA37CB448: move_src_window2_yxx y(1) x(1) x(0) 
00007F6EA37CB458: i_call_last_fQ main:do_run_n/3 3 

( 10 opcodes, `00007F6EA37CB408: move_yx y(1) x(1) ` touches 2 registers)
========================================


do_run_n(_, _, 0) ->
    ok;
do_run_n(F, St, N) ->
    F(St),
    do_run_n(F, St, N - 1).

{function, do_run_n, 3, 4}.
  {label,3}.
    {line,[{location,"main.erl",14}]}.
    {func_info,{atom,main},{atom,do_run_n},3}.
  {label,4}.
    {'%',{type_info,{x,2},number}}.
    {test,is_eq_exact,{f,5},[{x,2},{integer,0}]}.
    {move,{atom,ok},{x,0}}.
    return.
  {label,5}.
    {allocate,3,3}.
    {move,{x,2},{y,0}}.
    {move,{x,1},{y,1}}.
    {move,{x,0},{y,2}}.
    {move,{x,1},{x,0}}.
    {move,{y,2},{x,1}}.
    {line,[{location,"main.erl",17}]}.
    {call_fun,1}.
    {line,[{location,"main.erl",18}]}.
    {gc_bif,'-',{f,0},0,[{y,0},{integer,1}],{x,2}}.
    {move,{y,1},{x,1}}.
    {move,{y,2},{x,0}}.
    {call_last,3,{f,4},3}.

00007F64D740B3A0: i_func_info_IaaI 0 main do_run_n 3 
00007F64D740B3C8: i_is_eq_exact_immed_fxc f(00007F64D740B3F0) x(2) 0 
00007F64D740B3E0: move_return_c ok 
00007F64D740B3F0: allocate_tt 3 3 
00007F64D740B400: move_window3_xxxy x(2) x(1) x(0) y(0) 
00007F64D740B410: move_shift_yxx y(2) x(1) x(0) 
00007F64D740B420: i_call_fun_t 1 
00007F64D740B430: i_increment_yWd y(0) -1 x(2) 
00007F64D740B450: move_src_window2_yxx y(1) x(1) x(0) 
00007F64D740B460: i_call_last_fQ main:do_run_n/3 3 

( 10 opcodes, `00007F64D740B410: move_shift_yxx y(2) x(1) x(0) ` touches 3 registers)
==========================================

do_run_n(0, _, _) ->
    ok;
do_run_n(N, F, St) ->
    F(St),
    do_run_n(N - 1, F, St).

00007F62A078B3A0: i_func_info_IaaI 0 main do_run_n 3 
00007F62A078B3C8: i_is_eq_exact_immed_frc f(00007F62A078B3F0) r(0) 0 
00007F62A078B3E0: move_return_c ok 
00007F62A078B3F0: allocate_tt 3 3 
00007F62A078B400: move_window3_xxxy x(2) x(1) x(0) y(0) 
00007F62A078B410: move2_par_yxxx y(1) x(1) x(2) x(0) 
00007F62A078B420: i_call_fun_t 1 
00007F62A078B430: i_increment_yWd y(2) -1 x(0) 
00007F62A078B450: move_src_window2_yxx y(0) x(2) x(1) 
00007F62A078B460: i_call_last_fQ main:do_run_n/3 3 

( 10 opcodes, `00007F62A078B410: move2_par_yxxx y(1) x(1) x(2) x(0) ` touches 4 registers)

So, the current order of arguments is the optimal one

It might be problematic that less possible clause N == 0 -> ok is executed first. Would be nice to make main body execute first and jump to -> ok if guard fails.

do_run_n(St, F, N) when N =/= 0 ->
    F(St),
    do_run_n(St, F, N - 1);
do_run_n(_, _, 0) ->
    ok.

{function, do_run_n, 3, 4}.
  {label,3}.
    {line,[{location,"main.erl",14}]}.
    {func_info,{atom,main},{atom,do_run_n},3}.
  {label,4}.
    {'%',{type_info,{x,2},number}}.
    {test,is_eq_exact,{f,5},[{x,2},{integer,0}]}.
    {move,{atom,ok},{x,0}}.
    return.
  {label,5}.
    {allocate,3,3}.
    {move,{x,2},{y,0}}.
    {move,{x,1},{y,1}}.
    {move,{x,0},{y,2}}.
    {move,{y,1},{x,1}}.
    {line,[{location,"main.erl",15}]}.
    {call_fun,1}.
    {line,[{location,"main.erl",16}]}.
    {gc_bif,'-',{f,0},0,[{y,0},{integer,1}],{x,2}}.
    {move,{y,1},{x,1}}.
    {move,{y,2},{x,0}}.
    {call_last,3,{f,4},3}.

00007FBF13D0B398: i_func_info_IaaI 0 main do_run_n 3 
00007FBF13D0B3C0: i_is_eq_exact_immed_fxc f(00007FBF13D0B3E8) x(2) 0 
00007FBF13D0B3D8: move_return_c ok 
00007FBF13D0B3E8: allocate_tt 3 3 
00007FBF13D0B3F8: move_window3_xxxy x(2) x(1) x(0) y(0) 
00007FBF13D0B408: move_yx y(1) x(1) 
00007FBF13D0B418: i_call_fun_t 1 
00007FBF13D0B428: i_increment_yWd y(0) -1 x(2) 
00007FBF13D0B448: move_src_window2_yxx y(1) x(1) x(0) 
00007FBF13D0B458: i_call_last_fQ main:do_run_n/3 3 

So, putting "main" body as 1st clause with N =/= 0 doesn't make any difference on OTP-22. Compiler converts it to the same binary code as original.

do_run_n(St, F, N) when N > 0 ->
    F(St),
    do_run_n(St, F, N - 1);
do_run_n(_, _, N) ->
    ok.

{function, do_run_n, 3, 4}.
  {label,3}.
    {line,[{location,"main.erl",14}]}.
    {func_info,{atom,main},{atom,do_run_n},3}.
  {label,4}.
    {test,is_lt,{f,5},[{integer,0},{x,2}]}.
    {allocate,3,3}.
    {move,{x,2},{y,0}}.
    {move,{x,1},{y,1}}.
    {move,{x,0},{y,2}}.
    {move,{y,1},{x,1}}.
    {line,[{location,"main.erl",15}]}.
    {call_fun,1}.
    {line,[{location,"main.erl",16}]}.
    {gc_bif,'-',{f,0},0,[{y,0},{integer,1}],{x,2}}.
    {move,{y,1},{x,1}}.
    {move,{y,2},{x,0}}.
    {call_last,3,{f,4},3}.
  {label,5}.
    {move,{atom,ok},{x,0}}.
    return.

00007F6573B0B3D0: i_func_info_IaaI 0 main do_run_n 3 
00007F6573B0B3F8: is_lt_fcx f(00007F6573B0B490) 0 x(2) 
00007F6573B0B410: allocate_tt 3 3 
00007F6573B0B420: move_window3_xxxy x(2) x(1) x(0) y(0) 
00007F6573B0B430: move_yx y(1) x(1) 
00007F6573B0B440: i_call_fun_t 1 
00007F6573B0B450: i_increment_yWd y(0) -1 x(2) 
00007F6573B0B470: move_src_window2_yxx y(1) x(1) x(0) 
00007F6573B0B480: i_call_last_fQ main:do_run_n/3 3 
00007F6573B0B490: move_return_c ok 

http://tryerl.seriyps.ru/#id=78b4

So, if we put N > 0 as 1st clause, it will be executed first. But I don't know if is_lt_fcx (<) is noticeably slower than i_is_eq_exact_immed_frc (=:=)

Last option to consider is to run F multiple times on each iteration loop (N should be div 5 from original N):

do_run_n(St, F, N) when N =/= 0 ->
    F(St),
    F(St),
    F(St),
    F(St),
    F(St),
    do_run_n(St, F, N - 1);
do_run_n(_, _, 0) ->
    ok.

00007FEA7A30D968: i_func_info_IaaI 0 main do_run_n 3 
00007FEA7A30D990: i_is_ne_exact_immed_fxc f(00007FEA7A30DB00) x(2) 0 
00007FEA7A30D9B0: allocate_tt 3 3 
00007FEA7A30D9C0: move2_xyxy x(2) y(0) x(1) y(1) 
00007FEA7A30D9D0: move_ry x(0) y(2) 
00007FEA7A30D9E0: i_call_fun_I 1 
00007FEA7A30D9F0: move_yx y(1) x(1) 
00007FEA7A30DA00: move_yr y(2) x(0) 
00007FEA7A30DA10: i_call_fun_I 1 
00007FEA7A30DA20: move_yx y(1) x(1) 
00007FEA7A30DA30: move_yr y(2) x(0) 
00007FEA7A30DA40: i_call_fun_I 1 
00007FEA7A30DA50: move_yx y(1) x(1) 
00007FEA7A30DA60: move_yr y(2) x(0) 
00007FEA7A30DA70: i_call_fun_I 1 
00007FEA7A30DA80: move_yx y(1) x(1) 
00007FEA7A30DA90: move_yr y(2) x(0) 
00007FEA7A30DAA0: i_call_fun_I 1 
00007FEA7A30DAB0: i_increment_yIId y(0) -1 0 x(2) 
00007FEA7A30DAD8: move_yx y(1) x(1) 
00007FEA7A30DAE8: move_call_last_yrfQ y(2) x(0) main:do_run_n/3 3 
00007FEA7A30DB00: move_return_cr ok x(0) 

It should be quite good for small fast functions, but might be problematic for slow ones