simdunpack doesn't decompress the values properly
akhld opened this issue · 12 comments
Hi
Here's a simple piece of code to reproduce the issue:
size_t N = 9999;
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize);
uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
for (k = 1; k < N; ++k){
datain[k] = k;
}
const uint32_t b = maxbits(datain);
simdpackwithoutmask(datain, buffer, b);
simdunpack(buffer, backbuffer, b);
for (k = 1; k < N; ++k){
printf ("%d\n", backbuffer[k]);
}
Not quiet sure if this is an issue with the buffer that I'm initializing, basically i need to compress an array of random Integers.
maxbits(datain) only checks 128 integers, but your datain array holds 9999 integers.
use maxbits_length instead, or simdmaxbitsd1_length (w/ initvalue set to 0).
Like this? const uint32_t b = maxbits_length(datain, N); It doesn't work for me.
simdpackwithoutmask() and simdunpack() also just pack/unpack 128 integers. Here is code which works for you:
size_t k, N = 9999;
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize);
uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
uint32_t b;
for (k = 0; k < N; ++k){ /* start with k=0, not k=1! */
datain[k] = k;
}
b = maxbits_length(datain, N);
simdpack_length(datain, N, (__m128i *)buffer, b);
simdunpack_length((const __m128i *)buffer, N, backbuffer, b);
for (k = 0; k < N; ++k){ /* start with k=0, not k=1! */
printf ("%d\n", backbuffer[k]);
}
Awesome. Works like charm. Thanks a ton @cruppstahl
great :-)
Added to README:
Just a followup question, I read the simdpack_length function defenition and in the comments its specified as "slower" compared to the simdpackwithoutmask method. I tried to change the compress function in the example as follows:
size_t compress2(uint32_t * datain, size_t length, uint8_t * buffer) {
uint32_t offset;
uint8_t * initout;
size_t k;
if(length/SIMDBlockSize*SIMDBlockSize != length) {
printf("Data length should be a multiple of %i \n",SIMDBlockSize);
}
offset = 0;
initout = buffer;
for(k = 0; k < length / SIMDBlockSize; ++k) {
uint32_t b = maxbits(datain);
*buffer++ = b;
simdpackwithoutmask(datain, buffer, b);
offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1];
buffer += b * sizeof(__m128i);
}
return buffer - initout;
}
And called it as follows:
size_t nn = 128 * 2;
uint32_t * datainn = malloc(nn * sizeof(uint32_t));
uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize);
uint32_t * backbuffern = malloc(nn * sizeof(uint32_t));
size_t k;
for(k=0;k<nn;++k){
datainn[k] = rand() % (k + 1);
}
size_t compsize = compress2(datainn,nn,buffern);
And tried to uncompress as follows, but it does not retrieve the values properly.
uint8_t * decbuffern = buffern;
for (k = 0; k * SIMDBlockSize < nn; ++k) {
uint32_t b = maxbits(datainn);
simdunpack(buffern, backbuffern, b);
decbuffern += b * sizeof(__m128i);
}
for (k = 0; k < nn; ++k){
printf ("%d\n", backbuffern[k]);
}
Could you also add an example for the same? I bench-marked the previous version with a java based implementation and I'm seeing ~20x higher performance.
I did not compile your code, but noticed a few things:
in compress2() you store the "maxbit" value in *buffer++ = b
, but when uncompressing you don't reuse the stored value. Also, compress2() does not increment the input pointer correctly. It should be like this:
for(k = 0; k < length / SIMDBlockSize; ++k) {
uint32_t b = maxbits(datain);
*buffer++ = b;
simdpackwithoutmask(datain, buffer, b);
datain += SIMDBlockSize;
buffer += b * sizeof(__m128i);
}
and uncompress:
uint8_t * decbuffern = buffern;
for (k = 0; k * SIMDBlockSize < nn; ++k) {
uint32_t b = maxbits(*buffern);
buffern++;
simdunpack(buffern, backbuffern, b);
buffern += b * sizeof(__m128i);
backbuffern += SIMDBlockSize;
}
(I haven't tested this code.)
The *_length functions were indeed slow for long arrays, but I fixed that with code that looks like what @cruppstahl proposed...
e26e44f#diff-8e134dc3d7779826cc25abb6684670a7R14149
On the plus side, my code handles the case where the input data contains a number of integers that is not divisible by 128.
Of course, these functions may not be exactly what you are looking for but, hopefully, it should be "easy" to modify them to suit your needs.
thanks @lemire nice work.
@cruppstahl unfortunately, the changes in the uncompress is ending up with "Segmentation fault (core dumped)", i suspect its an index out of bounds?
The code here works:
size_t compress2(uint32_t * datain, size_t length, uint8_t * buffer) {
uint8_t * initout;
size_t k;
if(length/SIMDBlockSize*SIMDBlockSize != length) {
printf("Data length should be a multiple of %i \n",SIMDBlockSize);
}
initout = buffer;
for(k = 0; k < length / SIMDBlockSize; ++k) {
uint32_t b = maxbits(datain);
*buffer++ = b;
simdpackwithoutmask(datain, (__m128i *)buffer, b);
datain += SIMDBlockSize;
buffer += b * sizeof(__m128i);
}
return buffer - initout;
}
int main() {
size_t nn = 128 * 2;
uint32_t * datainn = malloc(nn * sizeof(uint32_t));
uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize);
uint32_t * backbuffern = malloc(nn * sizeof(uint32_t));
size_t k, compsize;
for(k=0;k<nn;++k){
datainn[k] = rand() % (k + 1);
}
compsize = compress2(datainn,nn,buffern);
printf("encoded size: %u (original size: %u)\n", (unsigned)compsize,
(unsigned)(nn * sizeof(uint32_t)));
for (k = 0; k * SIMDBlockSize < nn; ++k) {
uint32_t b = *buffern;
buffern++;
simdunpack((const __m128i *)buffern, backbuffern + k * SIMDBlockSize, b);
buffern += b * sizeof(__m128i);
}
for (k = 0; k < nn; ++k){
printf ("%d\n", backbuffern[k]);
}
return 0;
}
Yep, it works. The issue was with the way we were moving the pointer in the unpack i guess.