Try to optimize runner's tight inner loop
seriyps opened this issue · 3 comments
rebar3_bench/src/rebar3_bench_runner.erl
Lines 129 to 134 in 03bb6ec
Should find if it's possible to somehow minimize the amount of work done by the loop.
On OTP22 seems changing the order of arguments doesn't reduce amount of opcodes in the function, but changes opcodes itself and amount of arguments to those opcodes:
do_run_n(_, _, 0) ->
ok;
do_run_n(St, F, N) ->
F(St),
do_run_n(St, F, N - 1).
{function, do_run_n, 3, 4}.
{label,3}.
{line,[{location,"main.erl",14}]}.
{func_info,{atom,main},{atom,do_run_n},3}.
{label,4}.
{'%',{type_info,{x,2},number}}.
{test,is_eq_exact,{f,5},[{x,2},{integer,0}]}.
{move,{atom,ok},{x,0}}.
return.
{label,5}.
{allocate,3,3}.
{move,{x,2},{y,0}}.
{move,{x,1},{y,1}}.
{move,{x,0},{y,2}}.
{move,{y,1},{x,1}}.
{line,[{location,"main.erl",17}]}.
{call_fun,1}.
{line,[{location,"main.erl",18}]}.
{gc_bif,'-',{f,0},0,[{y,0},{integer,1}],{x,2}}.
{move,{y,1},{x,1}}.
{move,{y,2},{x,0}}.
{call_last,3,{f,4},3}.
00007F6EA37CB398: i_func_info_IaaI 0 main do_run_n 3
00007F6EA37CB3C0: i_is_eq_exact_immed_fxc f(00007F6EA37CB3E8) x(2) 0
00007F6EA37CB3D8: move_return_c ok
00007F6EA37CB3E8: allocate_tt 3 3
00007F6EA37CB3F8: move_window3_xxxy x(2) x(1) x(0) y(0)
00007F6EA37CB408: move_yx y(1) x(1)
00007F6EA37CB418: i_call_fun_t 1
00007F6EA37CB428: i_increment_yWd y(0) -1 x(2)
00007F6EA37CB448: move_src_window2_yxx y(1) x(1) x(0)
00007F6EA37CB458: i_call_last_fQ main:do_run_n/3 3
( 10 opcodes, `00007F6EA37CB408: move_yx y(1) x(1) ` touches 2 registers)
========================================
do_run_n(_, _, 0) ->
ok;
do_run_n(F, St, N) ->
F(St),
do_run_n(F, St, N - 1).
{function, do_run_n, 3, 4}.
{label,3}.
{line,[{location,"main.erl",14}]}.
{func_info,{atom,main},{atom,do_run_n},3}.
{label,4}.
{'%',{type_info,{x,2},number}}.
{test,is_eq_exact,{f,5},[{x,2},{integer,0}]}.
{move,{atom,ok},{x,0}}.
return.
{label,5}.
{allocate,3,3}.
{move,{x,2},{y,0}}.
{move,{x,1},{y,1}}.
{move,{x,0},{y,2}}.
{move,{x,1},{x,0}}.
{move,{y,2},{x,1}}.
{line,[{location,"main.erl",17}]}.
{call_fun,1}.
{line,[{location,"main.erl",18}]}.
{gc_bif,'-',{f,0},0,[{y,0},{integer,1}],{x,2}}.
{move,{y,1},{x,1}}.
{move,{y,2},{x,0}}.
{call_last,3,{f,4},3}.
00007F64D740B3A0: i_func_info_IaaI 0 main do_run_n 3
00007F64D740B3C8: i_is_eq_exact_immed_fxc f(00007F64D740B3F0) x(2) 0
00007F64D740B3E0: move_return_c ok
00007F64D740B3F0: allocate_tt 3 3
00007F64D740B400: move_window3_xxxy x(2) x(1) x(0) y(0)
00007F64D740B410: move_shift_yxx y(2) x(1) x(0)
00007F64D740B420: i_call_fun_t 1
00007F64D740B430: i_increment_yWd y(0) -1 x(2)
00007F64D740B450: move_src_window2_yxx y(1) x(1) x(0)
00007F64D740B460: i_call_last_fQ main:do_run_n/3 3
( 10 opcodes, `00007F64D740B410: move_shift_yxx y(2) x(1) x(0) ` touches 3 registers)
==========================================
do_run_n(0, _, _) ->
ok;
do_run_n(N, F, St) ->
F(St),
do_run_n(N - 1, F, St).
00007F62A078B3A0: i_func_info_IaaI 0 main do_run_n 3
00007F62A078B3C8: i_is_eq_exact_immed_frc f(00007F62A078B3F0) r(0) 0
00007F62A078B3E0: move_return_c ok
00007F62A078B3F0: allocate_tt 3 3
00007F62A078B400: move_window3_xxxy x(2) x(1) x(0) y(0)
00007F62A078B410: move2_par_yxxx y(1) x(1) x(2) x(0)
00007F62A078B420: i_call_fun_t 1
00007F62A078B430: i_increment_yWd y(2) -1 x(0)
00007F62A078B450: move_src_window2_yxx y(0) x(2) x(1)
00007F62A078B460: i_call_last_fQ main:do_run_n/3 3
( 10 opcodes, `00007F62A078B410: move2_par_yxxx y(1) x(1) x(2) x(0) ` touches 4 registers)
So, the current order of arguments is the optimal one
It might be problematic that less possible clause N == 0 -> ok
is executed first. Would be nice to make main body execute first and jump to -> ok
if guard fails.
do_run_n(St, F, N) when N =/= 0 ->
F(St),
do_run_n(St, F, N - 1);
do_run_n(_, _, 0) ->
ok.
{function, do_run_n, 3, 4}.
{label,3}.
{line,[{location,"main.erl",14}]}.
{func_info,{atom,main},{atom,do_run_n},3}.
{label,4}.
{'%',{type_info,{x,2},number}}.
{test,is_eq_exact,{f,5},[{x,2},{integer,0}]}.
{move,{atom,ok},{x,0}}.
return.
{label,5}.
{allocate,3,3}.
{move,{x,2},{y,0}}.
{move,{x,1},{y,1}}.
{move,{x,0},{y,2}}.
{move,{y,1},{x,1}}.
{line,[{location,"main.erl",15}]}.
{call_fun,1}.
{line,[{location,"main.erl",16}]}.
{gc_bif,'-',{f,0},0,[{y,0},{integer,1}],{x,2}}.
{move,{y,1},{x,1}}.
{move,{y,2},{x,0}}.
{call_last,3,{f,4},3}.
00007FBF13D0B398: i_func_info_IaaI 0 main do_run_n 3
00007FBF13D0B3C0: i_is_eq_exact_immed_fxc f(00007FBF13D0B3E8) x(2) 0
00007FBF13D0B3D8: move_return_c ok
00007FBF13D0B3E8: allocate_tt 3 3
00007FBF13D0B3F8: move_window3_xxxy x(2) x(1) x(0) y(0)
00007FBF13D0B408: move_yx y(1) x(1)
00007FBF13D0B418: i_call_fun_t 1
00007FBF13D0B428: i_increment_yWd y(0) -1 x(2)
00007FBF13D0B448: move_src_window2_yxx y(1) x(1) x(0)
00007FBF13D0B458: i_call_last_fQ main:do_run_n/3 3
So, putting "main" body as 1st clause with N =/= 0
doesn't make any difference on OTP-22. Compiler converts it to the same binary code as original.
do_run_n(St, F, N) when N > 0 ->
F(St),
do_run_n(St, F, N - 1);
do_run_n(_, _, N) ->
ok.
{function, do_run_n, 3, 4}.
{label,3}.
{line,[{location,"main.erl",14}]}.
{func_info,{atom,main},{atom,do_run_n},3}.
{label,4}.
{test,is_lt,{f,5},[{integer,0},{x,2}]}.
{allocate,3,3}.
{move,{x,2},{y,0}}.
{move,{x,1},{y,1}}.
{move,{x,0},{y,2}}.
{move,{y,1},{x,1}}.
{line,[{location,"main.erl",15}]}.
{call_fun,1}.
{line,[{location,"main.erl",16}]}.
{gc_bif,'-',{f,0},0,[{y,0},{integer,1}],{x,2}}.
{move,{y,1},{x,1}}.
{move,{y,2},{x,0}}.
{call_last,3,{f,4},3}.
{label,5}.
{move,{atom,ok},{x,0}}.
return.
00007F6573B0B3D0: i_func_info_IaaI 0 main do_run_n 3
00007F6573B0B3F8: is_lt_fcx f(00007F6573B0B490) 0 x(2)
00007F6573B0B410: allocate_tt 3 3
00007F6573B0B420: move_window3_xxxy x(2) x(1) x(0) y(0)
00007F6573B0B430: move_yx y(1) x(1)
00007F6573B0B440: i_call_fun_t 1
00007F6573B0B450: i_increment_yWd y(0) -1 x(2)
00007F6573B0B470: move_src_window2_yxx y(1) x(1) x(0)
00007F6573B0B480: i_call_last_fQ main:do_run_n/3 3
00007F6573B0B490: move_return_c ok
http://tryerl.seriyps.ru/#id=78b4
So, if we put N > 0
as 1st clause, it will be executed first. But I don't know if is_lt_fcx
(<
) is noticeably slower than i_is_eq_exact_immed_frc
(=:=
)
Last option to consider is to run F
multiple times on each iteration loop (N should be div 5
from original N):
do_run_n(St, F, N) when N =/= 0 ->
F(St),
F(St),
F(St),
F(St),
F(St),
do_run_n(St, F, N - 1);
do_run_n(_, _, 0) ->
ok.
00007FEA7A30D968: i_func_info_IaaI 0 main do_run_n 3
00007FEA7A30D990: i_is_ne_exact_immed_fxc f(00007FEA7A30DB00) x(2) 0
00007FEA7A30D9B0: allocate_tt 3 3
00007FEA7A30D9C0: move2_xyxy x(2) y(0) x(1) y(1)
00007FEA7A30D9D0: move_ry x(0) y(2)
00007FEA7A30D9E0: i_call_fun_I 1
00007FEA7A30D9F0: move_yx y(1) x(1)
00007FEA7A30DA00: move_yr y(2) x(0)
00007FEA7A30DA10: i_call_fun_I 1
00007FEA7A30DA20: move_yx y(1) x(1)
00007FEA7A30DA30: move_yr y(2) x(0)
00007FEA7A30DA40: i_call_fun_I 1
00007FEA7A30DA50: move_yx y(1) x(1)
00007FEA7A30DA60: move_yr y(2) x(0)
00007FEA7A30DA70: i_call_fun_I 1
00007FEA7A30DA80: move_yx y(1) x(1)
00007FEA7A30DA90: move_yr y(2) x(0)
00007FEA7A30DAA0: i_call_fun_I 1
00007FEA7A30DAB0: i_increment_yIId y(0) -1 0 x(2)
00007FEA7A30DAD8: move_yx y(1) x(1)
00007FEA7A30DAE8: move_call_last_yrfQ y(2) x(0) main:do_run_n/3 3
00007FEA7A30DB00: move_return_cr ok x(0)
It should be quite good for small fast functions, but might be problematic for slow ones