lemire/simdcomp

simdunpack doesn't decompress the values properly

akhld opened this issue · 12 comments

akhld commented

Hi

Here's a simple piece of code to reproduce the issue:

size_t N = 9999;
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize);
uint32_t * backbuffer = malloc(N * sizeof(uint32_t));

for (k = 1; k < N; ++k){       
    datain[k] = k;
}

const uint32_t b = maxbits(datain);

simdpackwithoutmask(datain, buffer, b);

simdunpack(buffer, backbuffer, b);

for (k = 1; k < N; ++k){       
    printf ("%d\n", backbuffer[k]);
}

Not quiet sure if this is an issue with the buffer that I'm initializing, basically i need to compress an array of random Integers.

maxbits(datain) only checks 128 integers, but your datain array holds 9999 integers.
use maxbits_length instead, or simdmaxbitsd1_length (w/ initvalue set to 0).

akhld commented

Like this? const uint32_t b = maxbits_length(datain, N); It doesn't work for me.

simdpackwithoutmask() and simdunpack() also just pack/unpack 128 integers. Here is code which works for you:

  size_t k, N = 9999;
  uint32_t * datain = malloc(N * sizeof(uint32_t));
  uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize);
  uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
  uint32_t b;

  for (k = 0; k < N; ++k){        /* start with k=0, not k=1! */
      datain[k] = k;
  }

  b = maxbits_length(datain, N);
  simdpack_length(datain, N, (__m128i *)buffer, b);
  simdunpack_length((const __m128i *)buffer, N, backbuffer, b);

  for (k = 0; k < N; ++k){         /* start with k=0, not k=1! */
      printf ("%d\n", backbuffer[k]);
  }
akhld commented

Awesome. Works like charm. Thanks a ton @cruppstahl

great :-)

akhld commented

Just a followup question, I read the simdpack_length function defenition and in the comments its specified as "slower" compared to the simdpackwithoutmask method. I tried to change the compress function in the example as follows:

size_t compress2(uint32_t * datain, size_t length, uint8_t * buffer) {
    uint32_t offset;
    uint8_t * initout;
    size_t k;
    if(length/SIMDBlockSize*SIMDBlockSize != length) {
        printf("Data length should be a multiple of %i \n",SIMDBlockSize);
    }
    offset = 0;
    initout = buffer;
    for(k = 0; k < length / SIMDBlockSize; ++k) {
        uint32_t b = maxbits(datain);
        *buffer++ = b;
        simdpackwithoutmask(datain, buffer, b);
        offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1];
        buffer += b * sizeof(__m128i);
    }
    return buffer - initout;
}

And called it as follows:

size_t nn = 128 * 2;
uint32_t * datainn = malloc(nn * sizeof(uint32_t));
uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize);
uint32_t * backbuffern = malloc(nn * sizeof(uint32_t));
size_t k;

for(k=0;k<nn;++k){               
    datainn[k] = rand() % (k + 1);
}

size_t compsize = compress2(datainn,nn,buffern);

And tried to uncompress as follows, but it does not retrieve the values properly.

uint8_t * decbuffern = buffern;
for (k = 0; k * SIMDBlockSize < nn; ++k) {
  uint32_t b = maxbits(datainn);      
  simdunpack(buffern, backbuffern, b);           
  decbuffern += b * sizeof(__m128i);      
}

for (k = 0; k < nn; ++k){       
    printf ("%d\n", backbuffern[k]);
}

Could you also add an example for the same? I bench-marked the previous version with a java based implementation and I'm seeing ~20x higher performance.

I did not compile your code, but noticed a few things:

in compress2() you store the "maxbit" value in *buffer++ = b, but when uncompressing you don't reuse the stored value. Also, compress2() does not increment the input pointer correctly. It should be like this:

for(k = 0; k < length / SIMDBlockSize; ++k) {
    uint32_t b = maxbits(datain);
    *buffer++ = b;
    simdpackwithoutmask(datain, buffer, b);
    datain += SIMDBlockSize;
    buffer += b * sizeof(__m128i);
}

and uncompress:

uint8_t * decbuffern = buffern;
for (k = 0; k * SIMDBlockSize < nn; ++k) {
  uint32_t b = maxbits(*buffern);      
  buffern++;
  simdunpack(buffern, backbuffern, b);           
  buffern += b * sizeof(__m128i);      
  backbuffern += SIMDBlockSize;
}

(I haven't tested this code.)

@akhld

The *_length functions were indeed slow for long arrays, but I fixed that with code that looks like what @cruppstahl proposed...

e26e44f#diff-8e134dc3d7779826cc25abb6684670a7R14149

On the plus side, my code handles the case where the input data contains a number of integers that is not divisible by 128.

Of course, these functions may not be exactly what you are looking for but, hopefully, it should be "easy" to modify them to suit your needs.

akhld commented

thanks @lemire nice work.

@cruppstahl unfortunately, the changes in the uncompress is ending up with "Segmentation fault (core dumped)", i suspect its an index out of bounds?

The code here works:

    size_t compress2(uint32_t * datain, size_t length, uint8_t * buffer) {
        uint8_t * initout;
        size_t k;
        if(length/SIMDBlockSize*SIMDBlockSize != length) {
            printf("Data length should be a multiple of %i \n",SIMDBlockSize);
        }
        initout = buffer;
        for(k = 0; k < length / SIMDBlockSize; ++k) {
            uint32_t b = maxbits(datain);
            *buffer++ = b;
            simdpackwithoutmask(datain, (__m128i *)buffer, b);
            datain += SIMDBlockSize;
            buffer += b * sizeof(__m128i);
        }
        return buffer - initout;
    }

    int main() {
      size_t nn = 128 * 2;
      uint32_t * datainn = malloc(nn * sizeof(uint32_t));
      uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize);
      uint32_t * backbuffern = malloc(nn * sizeof(uint32_t));
      size_t k, compsize;

      for(k=0;k<nn;++k){               
        datainn[k] = rand() % (k + 1);
      }

      compsize = compress2(datainn,nn,buffern);
      printf("encoded size: %u (original size: %u)\n", (unsigned)compsize,
                    (unsigned)(nn * sizeof(uint32_t)));

      for (k = 0; k * SIMDBlockSize < nn; ++k) {
        uint32_t b = *buffern;
        buffern++;
        simdunpack((const __m128i *)buffern, backbuffern + k * SIMDBlockSize, b);
        buffern += b * sizeof(__m128i);      
      }

      for (k = 0; k < nn; ++k){       
          printf ("%d\n", backbuffern[k]);
      }

      return 0;
    }
akhld commented

Yep, it works. The issue was with the way we were moving the pointer in the unpack i guess.