Overhead due to unneeded __error assignments
Opened this issue · 2 comments
Hope that's a valid test. I'm not sure why there's all the redundant (*__self).__error = __error
assignments, but it doesn't come for free. I took the generated C++ code of the following example (spicyc -D jit -T
and copying away the cc files and commands) and patched out the obvious ones that seemed superflous. This results in a 10-12% raw parsing performance improvement for this microbenchmark. (Debian 12, GCC 12.2).
module Test;
type K = unit {
x: skip bytes &size=1;
};
type L = unit {
k: K;
};
type M = unit {
l: L;
};
public type N = unit {
: M[] &eod;
};
Command | Mean [s] | Min [s] | Max [s] | Relative |
---|---|---|---|---|
cat ./test-data.txt | spicy-driver Test.hlto |
6.011 ± 0.109 | 5.875 | 6.177 | 1.11 ± 0.02 |
cat ./test-data.txt | spicy-driver Test3.hlto |
5.413 ± 0.051 | 5.375 | 5.497 | 1.00 |
test-data.txt was generated as follows:
python3 -c 'print("x" * 5000000, end="")' > test-data.txt
There might some performance improvements by removing the unneeded assignments (or somehow convincing the compiler to not emit code for them).
Patch:
--- Test.cc 2023-11-10 15:07:04.152724653 +0100
+++ Test3.cc 2023-11-10 15:49:59.353374911 +0100
- __error = (*__self).__error;
+ // __error = (*__self).__error;
::hilti::rt::debug::dedent(std::string("spicy"));
__result = std::make_tuple(__cur, __lah, __lahe, __error);
return __result;
@@ -275,7 +275,7 @@
(*__self).__error = __error;
__location__("nested.spicy:7:10-9:2");
(void());
- __error = (*__self).__error;
+ // __error = (*__self).__error;
::hilti::rt::StrongReference<::hilti::rt::Stream> filtered = ::hilti::rt::StrongReference<::hilti::rt::Stream>();
if ( ! (::hilti::rt::Bool(static_cast<bool>(filtered))) ) {
__result = (*__self).__parse_Test_L_stage2(__data, __begin, __cur, __trim, __lah, __lahe, __error);
@@ -294,7 +294,7 @@
(*__self).__error = __error;
__location__("nested.spicy:7:10-9:2");
(void());
- __error = (*__self).__error;
+ // __error = (*__self).__error;
return __result;
}
@@ -313,11 +313,11 @@
(*__self).__error = __error;
__location__("nested.spicy:12:3");
(void());
- __error = (*__self).__error;
- (*__self).__error = __error;
+ // __error = (*__self).__error;
+ // (*__self).__error = __error;
__location__("nested.spicy:11:10-13:2");
(void());
- __error = (*__self).__error;
+ // __error = (*__self).__error;
::hilti::rt::debug::dedent(std::string("spicy"));
__result = std::make_tuple(__cur, __lah, __lahe, __error);
return __result;
@@ -334,7 +334,7 @@
(*__self).__error = __error;
__location__("nested.spicy:11:10-13:2");
(void());
- __error = (*__self).__error;
+ // __error = (*__self).__error;
::hilti::rt::StrongReference<::hilti::rt::Stream> filtered = ::hilti::rt::StrongReference<::hilti::rt::Stream>();
if ( ! (::hilti::rt::Bool(static_cast<bool>(filtered))) ) {
__result = (*__self).__parse_Test_M_stage2(__data, __begin, __cur, __trim, __lah, __lahe, __error);
@@ -353,7 +353,7 @@
(*__self).__error = __error;
__location__("nested.spicy:11:10-13:2");
(void());
- __error = (*__self).__error;
+ // __error = (*__self).__error;
return __result;
}
@@ -374,7 +374,7 @@
(*__self).__error = __error;
__location__("nested.spicy:15:17-17:2");
(void());
- __error = (*__self).__error;
+ // __error = (*__self).__error;
::hilti::rt::debug::dedent(std::string("spicy"));
__result = std::make_tuple(__cur, __lah, __lahe, __error);
return __result;
@@ -426,7 +426,7 @@
(*__self).__error = __error;
__location__("nested.spicy:15:17-17:2");
(void());
- __error = (*__self).__error;
+ // __error = (*__self).__error;
::hilti::rt::StrongReference<::hilti::rt::Stream> filtered = ::hilti::rt::StrongReference<::hilti::rt::Stream>();
if ( ! (::hilti::rt::Bool(static_cast<bool>(filtered))) ) {
__result = (*__self).__parse_Test_N_stage2(__data, __begin, __cur, __trim, __lah, __lahe, __error);
@@ -445,7 +445,7 @@
(*__self).__error = __error;
__location__("nested.spicy:15:17-17:2");
(void());
- __error = (*__self).__error;
+ // __error = (*__self).__error;
return __result;
}
We have #1444 for removing dead stores but didn't have a benchmark showing it was useful. Would you be open to compare topic/bbannier/dead_stores against the baseline ec87b43?
As discussed in DM, might be more reasonable to test a rebased version. There's also nothing fancy around the benchmark. The reproducer is essentially testing how quickly Spicy can discard one byte at a time within a nested unit structure.