func unaligned_load can be optimized
wangqiim opened this issue · 2 comments
robin-hood-hashing/src/include/robin_hood.h
Lines 354 to 361 in fb14836
I find template parameter T is only used as uint16_t
, uint32_t
and uint64_t
, so I think unaligned_load<int64_t>((int64_t *)s + i);
can be optimezer through *((int64_t *)s + i)
.
test.cpp
#include <xmmintrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <math.h>
#include <stdint.h>
#include <cstring>
template <typename T>
inline T unaligned_load(void const* ptr) noexcept {
// using memcpy so we don't get into unaligned load problems.
// compiler should optimize this very well anyways.
T t;
std::memcpy(&t, ptr, sizeof(T));
return t;
}
size_t hash_bytes(void const* ptr, size_t len) noexcept {
static constexpr uint64_t m = UINT64_C(0xc6a4a7935bd1e995);
static constexpr uint64_t seed = UINT64_C(0xe17a1465);
static constexpr unsigned int r = 47;
auto const* const data64 = static_cast<uint64_t const*>(ptr);
uint64_t h = seed ^ (len * m);
size_t const n_blocks = len / 8;
for (size_t i = 0; i < n_blocks; ++i) {
auto k = unaligned_load<uint64_t>(data64 + i);
k *= m;
k ^= k >> r;
k *= m;
h ^= k;
h *= m;
}
auto const* const data8 = reinterpret_cast<uint8_t const*>(data64 + n_blocks);
switch (len & 7U) {
case 7:
h ^= static_cast<uint64_t>(data8[6]) << 48U;
case 6:
h ^= static_cast<uint64_t>(data8[5]) << 40U;
case 5:
h ^= static_cast<uint64_t>(data8[4]) << 32U;
case 4:
h ^= static_cast<uint64_t>(data8[3]) << 24U;
case 3:
h ^= static_cast<uint64_t>(data8[2]) << 16U;
case 2:
h ^= static_cast<uint64_t>(data8[1]) << 8U;
case 1:
h ^= static_cast<uint64_t>(data8[0]);
h *= m;
default:
break;
}
h ^= h >> r;
// not doing the final step here, because this will be done by keyToIdx anyways
// h *= m;
// h ^= h >> r;
return static_cast<size_t>(h);
}
char s[8000000];
int main() {
int n = 20;
for (int i = 0; i < 4000000; i++) {
s[i] = i;
}
struct timeval tv1, tv2, tv3;
gettimeofday(&tv1, NULL);
int sum = 0;
for (int j = 0; j < n; j++) {
for (int i = 0; i < 1000000; i++) {
sum += unaligned_load<int64_t>((int64_t *)s + i);
}
}
gettimeofday(&tv2, NULL);
tv2.tv_sec -= tv1.tv_sec;
tv2.tv_usec -= tv1.tv_usec;
printf("sum = %d, time cost: %d.%06d, \n", sum, tv2.tv_sec, tv2.tv_usec);
gettimeofday(&tv1, NULL);
sum = 0;
for (int j = 0; j < n; j++) {
for (int i = 0; i < 1000000; i++) {
sum += *((int64_t *)s + i);
}
}
gettimeofday(&tv3, NULL);
tv3.tv_sec -= tv1.tv_sec;
tv3.tv_usec -= tv1.tv_usec;
printf("sum = %d, time cost: %d.%06d, \n", sum, tv3.tv_sec, tv3.tv_usec);
}
output
sum = 1583703552, time cost: 0.067667,
sum = 1583703552, time cost: 0.043813,
I think your benchmark numbers are bogus. It optimizes to the same code: https://godbolt.org/z/bevvonoKf
And the more important issue is that on some architectures it is not allowed to do an unaligned load with something like *(int64_t *)s
. That will cause a SEGFAULT.
I think your benchmark numbers are bogus. It optimizes to the same code: https://godbolt.org/z/bevvonoKf
And the more important issue is that on some architectures it is not allowed to do an unaligned load with something like
*(int64_t *)s
. That will cause a SEGFAULT.
yes, you are right. I make a mistake. I ignored the impact of the CPU cache.:joy:
In for (int i = 0; i < 4000000; i++)
the numbers should be 8000000 instead of 4000000. (warm up cpu cache). Thanks for your code example in the tool (Compiler Explorer) , This tool is awesome.