Performance on Apple M1
mitghi opened this issue · 2 comments
Hello, I have been testing the code from this blog on Apple M1 using sse2neon library. However I have encountered a problem with low performance. I am not exactly sure why the result is opposite of the original post on x86_64 in terms of performance as I am a beginner and learning SIMD.
The following program loads an text file with 1.6 Gb size and counts the commas.
#include <stdio.h>
#include <assert.h>
#include <strings.h>
#include <stdlib.h>
#include <sys/time.h>
#include "simd.h"
// count_chars_8 is copied from Josh Weinstein's Blog
size_t
count_chars_8(const char* data, size_t size, const char ch)
{
size_t total = 0;
while (size) {
if (*data == ch)
total += 1;
data += 1;
size -= 1;
}
return total;
};
// count_chars_128 is copied from Josh Weinstein's Blog
size_t
count_chars_128(const char* data, size_t size, const char ch)
{
size_t total = 0;
assert(size % 16 == 0);
__m128i tocmp = _mm_set1_epi8(ch);
while (size) {
int mask = 0;
__m128i chunk = _mm_load_si128 ((__m128i const*)data);
__m128i results = _mm_cmpeq_epi8(chunk, tocmp);
mask = _mm_movemask_epi8(results);
total += _mm_popcnt_u32(mask);
data += 16;
size -= 16;
}
return total;
};
void
simd(char *string, size_t ln)
{
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
size_t result = count_chars_128(string, ln, ',');
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
uint64_t delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
printf("[x] SIMD took %llu u/s\ncount: %d\n", delta_us, result);
};
void
normal(char *string)
{
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
size_t result = count_chars_8(string, strlen(string), ',');
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
uint64_t delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
printf("[x] NORMAL took %llu u/s\ncount: %d\n", delta_us, result);
};
int
main()
{
FILE *f = fopen("/tmp/big.txt", "rb"); // 1.6 Gb file containing repeated "tes,tes,tes, ..." string
fseek(f, 0, SEEK_END);
long fsize = ftell(f);
fseek(f, 0, SEEK_SET);
char *string = malloc(fsize + 1);
fread(string, fsize, 1, f);
fclose(f);
string[fsize] = 0;
puts("[+] File loaded");
normal(string);
simd(string, fsize); // <- usually 5-6 times slower than non-simd version
free(string);
return 0;
};
Here is the result I get:
MBPMIL:simd mitghi$ ./main
[+] File loaded
[x] NORMAL took 2165755 u/s
count: 500000000
[x] SIMD took 7884030 u/s
count: 500000000
The SIMD version is most of the times 5-6 times slower than the non-SIMD version which is the opposite of benchmark on x86_64.
Could you please provide some insight and assistance?
Thanks
Since SSE2NEON
is just a translator from SSE intrinsics to NEON counterpart, it might not be worthy to take SSE tweaked routines for IO-bound scenario such as count_chars_128
. Instead, you should check some SWAR/NEON implementations. For example, check this: https://github.com/magurosan/strlen_neon