// Overview / Examples / API / FAQ
SWAR: SIMD Within A Register library
- Performance (branchless)
- Portable (uses 'normal' registers)
- Single header (https://raw.githubusercontent.com/qlibs/swar/main/swar - for integration see FAQ)
- Minimal API
- Verifies itself upon include (can be disabled with
-DNTEST
- see FAQ)
- C++20 (clang++13+, g++12)
constexpr u8 data[]{1, 2, 3, 5, 5, 6, 7, 8};
constexpr swar<u8> lhs{data}; // copy_from
constexpr swar<u8> rhs{5}; // broadcast (native: u64)
static_assert(8u == lhs.size());
static_assert(sizeof(u64) == sizeof(lhs));
constexpr auto match = lhs == rhs;
static_assert(any_of(match));
static_assert(some_of(match));
static_assert(not all_of(match));
static_assert(not none_of(match));
static_assert(3u == find_first_set(match));
static_assert(4u == find_last_set(match));
static_assert(2u == popcount(match));
static_assert(match[3u] and match[4u]);
static_assert(sizeof(u32) == sizeof(swar<u8, 4u>));
static_assert(sizeof(u64) == sizeof(swar<u8, 8u>));
static_assert(sizeof(u32) == sizeof(swar<u16, 2u>));
static_assert(sizeof(u64) == sizeof(swar<u16, 4u>));
static_assert(sizeof(u64) == sizeof(swar<u32, 2u>));
static_assert(sizeof(u128) == sizeof(swar<u32, 4u>));
// and more (see API)...
Performance (https://godbolt.org/z/ManGb8aso)
auto eq(swar<u8> lhs, swar<u8> rhs) {
return lhs == rhs;
}
eq: // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
movabs rdx, -9187201950435737472
xor rdi, rsi
movabs rax, 72340172838076672
or rdi, rdx
sub rax, rdi
and rax, rdx
ret
auto contains(swar<u8> lhs, u8 value) {
const auto rhs = swar<u8>{value};
const auto match = lhs == rhs;
return any_of(match);
}
contains: // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
movabs rax, 72340172838076673
movzx esi, sil
movabs rdx, -9187201950435737472
imul rsi, rax
sub rax, 1
xor rdi, rsi
or rdi, rdx
sub rax, rdi
test rax, rdx
setne al
ret
auto find(swar<u8> lhs, u8 value) {
const auto rhs = swar<u8>{value};
const auto match = lhs == rhs;
return any_of(match) * find_first_set(match);
}
find: // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
movabs rax, 72340172838076673
movzx esi, sil
movabs rdx, 72340172838076672
imul rsi, rax
movabs rax, -9187201950435737472
xor rdi, rsi
or rdi, rax
sub rdx, rdi
and rdx, rax
xor eax, eax
rep bsf rax, rdx
test rdx, rdx
mov edx, 0
cmove rax, rdx
ret
swar vs simd (https://godbolt.org/z/YsG8evqr8)
template<class T> auto eq(T lhs, T rhs) { return lhs == rhs; }
eq(swar<u8>, swar<u8>): // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
movabs rdx, -9187201950435737472
xor rdi, rsi
movabs rax, 72340172838076672
or rdi, rdx
sub rax, rdi
and rax, rdx
ret
eq(simd<u8>, simd<u8>): // $CXX -O3 -mavx512f
vpcmpeqb xmm0, xmm0, xmm1
ret
template<class T> auto contains(T lhs, auto value) {
const auto rhs = T{value};
const auto match = lhs == rhs;
return any_of(match);
}
cointains(swar<u8>, swar<u8>): // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
movabs rax, 72340172838076673
movzx esi, sil
movabs rdx, -9187201950435737472
imul rsi, rax
sub rax, 1
xor rdi, rsi
or rdi, rdx
sub rax, rdi
test rax, rdx
setne al
ret
contains(simd<u8>, simd<u8>): // $CXX -O3 -mavx512f
vmovd xmm1, edi
vpbroadcastb xmm1, xmm1
vpcmpeqb xmm0, xmm1, xmm0
vptest xmm0, xmm0
setne al
ret
template<class T> auto find(T lhs, auto value) {
const auto rhs = T{value};
const auto match = lhs == rhs;
return any_of(match) * find_first_set(match);
}
find(swar<u8>, swar<u8>): // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
movabs rax, 72340172838076673
movzx esi, sil
movabs rdx, 72340172838076672
imul rsi, rax
movabs rax, -9187201950435737472
xor rdi, rsi
or rdi, rax
sub rdx, rdi
and rdx, rax
xor eax, eax
rep bsf rax, rdx
test rdx, rdx
mov edx, 0
cmove rax, rdx
ret
find(simd<u8>, simd<u8>): // $CXX -O3 -mavx512f
vmovd xmm1, edi
vpbroadcastb xmm1, xmm1
vpcmpeqb xmm0, xmm1, xmm0
vpmovmskb eax, xmm0
or eax, 65536
rep bsf ecx, eax
xor eax, eax
vptest xmm0, xmm0
cmovne eax, ecx
ret
template<class T, size_t Width = sizeof(u64) / sizeof(T), class TAbi = abi_t<T, Width>>
requires ((sizeof(T) * Width) <= sizeof(TAbi))
struct swar {
using value_type = T;
using abi_type = TAbi;
constexpr swar() noexcept = default;
constexpr swar(const swar&) noexcept = default;
constexpr swar(swar&&) noexcept = default;
constexpr explicit swar(const auto value) noexcept;
constexpr explicit swar(const auto* mem) noexcept;
constexpr explicit swar(const auto& gen) noexcept;
[[nodiscard]] constexpr explicit operator abi_type() const noexcept;
[[nodiscard]] constexpr auto operator[](size_t) const noexcept -> T;
[[nodiscard]] static constexpr auto size() noexcept -> size_t;
[[nodiscard]] friend constexpr auto operator==(const swar&, const swar&) noexcept;
};
template<class T, size_t Width = sizeof(u64) / sizeof(T), class TAbi = abi_t<T, Width>>
requires ((sizeof(T) * Width) <= sizeof(TAbi))
struct swar_mask {
using value_type = bool; /// predefined
using abi_type = TAbi;
constexpr swar_mask() noexcept = default;
constexpr swar_mask(const swar_mask&) noexcept = default;
constexpr swar_mask(swar_mask&&) noexcept = default;
constexpr explicit swar_mask(const abi_type value) noexcept;
[[nodiscard]] constexpr auto operator[](const size_t index) const noexcept -> bool;
[[nodiscard]] static constexpr auto size() noexcept -> size_t { return Width; }
};
template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto all_of(const swar_mask<T, Width, TAbi>& s) noexcept -> bool;
template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto any_of(const swar_mask<T, Width, TAbi>& s) noexcept -> bool;
template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto some_of(const swar_mask<T, Width, TAbi>& s) noexcept -> bool;
template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto none_of(const swar_mask<T, Width, TAbi>& s) noexcept -> bool;
template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto find_first_set(const swar_mask<T, Width, TAbi>& s) noexcept;
template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto find_last_set(const swar_mask<T, Width, TAbi>& s) noexcept;
template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto popcount(const swar_mask<T, Width, TAbi>& s) noexcept;
template<class T> inline constexpr bool is_swar_v = /* unspecified */;
template<class T> inline constexpr bool is_swar_mask_v = /* unspecified */;
-
How to disable running tests at compile-time?
When
-DNTEST
is defined static_asserts tests wont be executed upon include. Note: Use with caution as disabling tests means that there are no gurantees upon include that given compiler/env combination works as expected. -
How to integrate with CMake.FetchContent?
include(FetchContent) FetchContent_Declare( qlibs.swar GIT_REPOSITORY https://github.com/qlibs/swar GIT_TAG v1.0.0 ) FetchContent_MakeAvailable(qlibs.swar)
target_link_libraries(${PROJECT_NAME} PUBLIC qlibs.swar);
-
Acknowledgments
https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html, https://wg21.link/P1928