uploadcare/pillow-simd

Segfault on ImagingResampleHorizontalConvolution8u4x

mratsim opened this issue · 7 comments

Segfault on ImagingResampleHorizontalConvolution8u4x (used by lutris).

Version: python-pillow-simd-git 6.0.x.post0.r0.g629f8d18-1 (https://github.com/uploadcare/pillow-simd/tree/629f8d188d0796140fe4a985a951b4d271defe30)

When launching lutris 0.5.3 with Pillow-simd instead of Pillow, Lutris core dumps. Confirmed in 3 other separate reports:

Reproduction from lutris/lutris#2232:

gdb --args /usr/bin/python /usr/bin/lutris
[...]
Thread 1 "python" received signal SIGSEGV, Segmentation fault.
0x00007ffff3b1c16e in ImagingResampleHorizontalConvolution8u4x () from /usr/lib/python3.7/site-packages/PIL/_imaging.cpython-37m-x86_64-linux-gnu.so
 
(gdb) bt
#0  0x00007ffff3b1c16e in ImagingResampleHorizontalConvolution8u4x () at /usr/lib/python3.7/site-packages/PIL/_imaging.cpython-37m-x86_64-linux-gnu.so
#1  0x00007ffff3b1d01f in ImagingResampleHorizontal_8bpc () at /usr/lib/python3.7/site-packages/PIL/_imaging.cpython-37m-x86_64-linux-gnu.so
#2  0x00007ffff3b1d3f6 in ImagingResampleInner () at /usr/lib/python3.7/site-packages/PIL/_imaging.cpython-37m-x86_64-linux-gnu.so
#3  0x00007ffff3b1d643 in ImagingResample () at /usr/lib/python3.7/site-packages/PIL/_imaging.cpython-37m-x86_64-linux-gnu.so
#4  0x00007ffff3b15194 in  () at /usr/lib/python3.7/site-packages/PIL/_imaging.cpython-37m-x86_64-linux-gnu.so
[...]

I suspect it's due to an AVX2 alignment issue, given the source code:

void
ImagingResampleHorizontalConvolution8u4x(
UINT32 *lineOut0, UINT32 *lineOut1, UINT32 *lineOut2, UINT32 *lineOut3,
UINT32 *lineIn0, UINT32 *lineIn1, UINT32 *lineIn2, UINT32 *lineIn3,
int xsize, int *xbounds, INT16 *kk, int kmax, int coefs_precision)
{
int xmin, xmax, xx, x;
INT16 *k;
for (xx = 0; xx < xsize; xx++) {
xmin = xbounds[xx * 2 + 0];
xmax = xbounds[xx * 2 + 1];
k = &kk[xx * kmax];
x = 0;
#if defined(__AVX2__)
{
__m256i sss0, sss1;
__m256i zero = _mm256_setzero_si256();
__m256i initial = _mm256_set1_epi32(1 << (coefs_precision-1));
sss0 = initial;
sss1 = initial;
for (; x < xmax - 3; x += 4) {
__m256i pix, mmk0, mmk1, source;
mmk0 = _mm256_set1_epi32(*(INT32 *) &k[x]);
mmk1 = _mm256_set1_epi32(*(INT32 *) &k[x + 2]);
source = _mm256_inserti128_si256(_mm256_castsi128_si256(
_mm_loadu_si128((__m128i *) &lineIn0[x + xmin])),
_mm_loadu_si128((__m128i *) &lineIn1[x + xmin]), 1);
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk0));
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk1));
source = _mm256_inserti128_si256(_mm256_castsi128_si256(
_mm_loadu_si128((__m128i *) &lineIn2[x + xmin])),
_mm_loadu_si128((__m128i *) &lineIn3[x + xmin]), 1);
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk0));
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk1));
}
for (; x < xmax - 1; x += 2) {
__m256i pix, mmk;
mmk = _mm256_set1_epi32(*(INT32 *) &k[x]);
pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
_mm_loadl_epi64((__m128i *) &lineIn0[x + xmin])),
_mm_loadl_epi64((__m128i *) &lineIn1[x + xmin]), 1);
pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
_mm_loadl_epi64((__m128i *) &lineIn2[x + xmin])),
_mm_loadl_epi64((__m128i *) &lineIn3[x + xmin]), 1);
pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
}
for (; x < xmax; x ++) {
__m256i pix, mmk;
// [16] xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0
mmk = _mm256_set1_epi32(k[x]);
// [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0
pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
_mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin])),
_mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]), 1);
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
_mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin])),
_mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]), 1);
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
}
sss0 = _mm256_srai_epi32(sss0, coefs_precision);
sss1 = _mm256_srai_epi32(sss1, coefs_precision);
sss0 = _mm256_packs_epi32(sss0, zero);
sss1 = _mm256_packs_epi32(sss1, zero);
sss0 = _mm256_packus_epi16(sss0, zero);
sss1 = _mm256_packus_epi16(sss1, zero);
lineOut0[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 0));
lineOut1[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1));
lineOut2[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 0));
lineOut3[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1));
}
#else
{
__m128i sss0, sss1, sss2, sss3;
__m128i initial = _mm_set1_epi32(1 << (coefs_precision-1));
sss0 = initial;
sss1 = initial;
sss2 = initial;
sss3 = initial;
for (; x < xmax - 3; x += 4) {
__m128i pix, mmk_lo, mmk_hi, source;
__m128i mask_lo = _mm_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0);
__m128i mask_hi = _mm_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8);
mmk_lo = _mm_set1_epi32(*(INT32 *) &k[x]);
mmk_hi = _mm_set1_epi32(*(INT32 *) &k[x + 2]);
// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
source = _mm_loadu_si128((__m128i *) &lineIn0[x + xmin]);
// [16] a1 a0 b1 b0 g1 g0 r1 r0
pix = _mm_shuffle_epi8(source, mask_lo);
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk_lo));
// [16] a3 a2 b3 b2 g3 g2 r3 r2
pix = _mm_shuffle_epi8(source, mask_hi);
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk_hi));
source = _mm_loadu_si128((__m128i *) &lineIn1[x + xmin]);
pix = _mm_shuffle_epi8(source, mask_lo);
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk_lo));
pix = _mm_shuffle_epi8(source, mask_hi);
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk_hi));
source = _mm_loadu_si128((__m128i *) &lineIn2[x + xmin]);
pix = _mm_shuffle_epi8(source, mask_lo);
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk_lo));
pix = _mm_shuffle_epi8(source, mask_hi);
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk_hi));
source = _mm_loadu_si128((__m128i *) &lineIn3[x + xmin]);
pix = _mm_shuffle_epi8(source, mask_lo);
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk_lo));
pix = _mm_shuffle_epi8(source, mask_hi);
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk_hi));
}
for (; x < xmax - 1; x += 2) {
__m128i pix, mmk;
__m128i mask = _mm_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0);
// [16] k1 k0 k1 k0 k1 k0 k1 k0
mmk = _mm_set1_epi32(*(INT32 *) &k[x]);
// [8] x x x x x x x x a1 b1 g1 r1 a0 b0 g0 r0
pix = _mm_loadl_epi64((__m128i *) &lineIn0[x + xmin]);
// [16] a1 a0 b1 b0 g1 g0 r1 r0
pix = _mm_shuffle_epi8(pix, mask);
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
pix = _mm_loadl_epi64((__m128i *) &lineIn1[x + xmin]);
pix = _mm_shuffle_epi8(pix, mask);
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
pix = _mm_loadl_epi64((__m128i *) &lineIn2[x + xmin]);
pix = _mm_shuffle_epi8(pix, mask);
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
pix = _mm_loadl_epi64((__m128i *) &lineIn3[x + xmin]);
pix = _mm_shuffle_epi8(pix, mask);
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));
}
for (; x < xmax; x ++) {
__m128i pix, mmk;
// [16] xx k0 xx k0 xx k0 xx k0
mmk = _mm_set1_epi32(k[x]);
// [16] xx a0 xx b0 xx g0 xx r0
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin]);
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]);
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin]);
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]);
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));
}
sss0 = _mm_srai_epi32(sss0, coefs_precision);
sss1 = _mm_srai_epi32(sss1, coefs_precision);
sss2 = _mm_srai_epi32(sss2, coefs_precision);
sss3 = _mm_srai_epi32(sss3, coefs_precision);
sss0 = _mm_packs_epi32(sss0, sss0);
sss1 = _mm_packs_epi32(sss1, sss1);
sss2 = _mm_packs_epi32(sss2, sss2);
sss3 = _mm_packs_epi32(sss3, sss3);
lineOut0[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss0, sss0));
lineOut1[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss1, sss1));
lineOut2[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss2, sss2));
lineOut3[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss3, sss3));
}
#endif
}
}

This happens to me, too. bt:

#0  ImagingResampleHorizontalConvolution8u4x (lineOut0=0x7fffd4434010, lineOut1=0x7fffd4434900, lineOut2=0x7fffd44351f0, lineOut3=lineOut3@entry=0x7fffd4435ae0, 
    lineIn0=0x7fffd4573010, lineIn1=0x7fffd4573ec0, lineIn2=0x7fffd4574d70, lineIn3=0x7fffd4575c20, xsize=572, xbounds=0x11d41d0, kk=0x1144d40, kmax=9, 
    coefs_precision=15) at src/libImaging/ResampleSIMDHorizontalConv.c:185
#1  0x00007ffff69eda63 in ImagingResampleHorizontal_8bpc (imOut=imOut@entry=0x12a7790, imIn=imIn@entry=0x12bc8a0, offset=offset@entry=0, ksize=ksize@entry=9, 
    bounds=0x11d41d0, prekk=0x1144d40) at src/libImaging/Resample.c:335
#2  0x00007ffff69ede45 in ImagingResampleInner (imIn=0x12bc8a0, xsize=572, ysize=347, filterp=0x7ffff6a2cfa0 <BICUBIC>, box=<optimized out>, 
    ResampleHorizontal=0x7ffff69ed840 <ImagingResampleHorizontal_8bpc>, ResampleVertical=0x7ffff69edaf0 <ImagingResampleVertical_8bpc>)
    at src/libImaging/Resample.c:608
#3  0x00007ffff69ee06a in ImagingResample (imIn=imIn@entry=0x12bc8a0, xsize=<optimized out>, ysize=<optimized out>, filter=<optimized out>, 
    box=box@entry=0x7fffffffd110) at src/libImaging/Resample.c:554
#4  0x00007ffff69e62f3 in _resize (self=<optimized out>, args=<optimized out>) at src/_imaging.c:1757

Pinging @homm

Issue popped up for me with Pillow-SIMD when python 2.7.15 got updated to python 2.7.17. Downgrading python to 2.7.15 fixed the issue for me.

homm commented

I'm implementing a fix but need more info.

Could you tell me exactly how you are installing Pillow-SIMD? As far as I know, this fault should only happen on builds without optimization, which are unusable because they are about 10 times slower. If these really are non-optimized builds, that is the real issue.

I used pip install on Ubuntu 19.10.

homm commented

Pillow-SIMD 6.0.0.post1 with a fix is released. Please, try it.
https://pypi.org/project/Pillow-SIMD/#history