|
void |
|
ImagingResampleHorizontalConvolution8u4x( |
|
UINT32 *lineOut0, UINT32 *lineOut1, UINT32 *lineOut2, UINT32 *lineOut3, |
|
UINT32 *lineIn0, UINT32 *lineIn1, UINT32 *lineIn2, UINT32 *lineIn3, |
|
int xsize, int *xbounds, INT16 *kk, int kmax, int coefs_precision) |
|
{ |
|
int xmin, xmax, xx, x; |
|
INT16 *k; |
|
|
|
for (xx = 0; xx < xsize; xx++) { |
|
xmin = xbounds[xx * 2 + 0]; |
|
xmax = xbounds[xx * 2 + 1]; |
|
k = &kk[xx * kmax]; |
|
x = 0; |
|
|
|
#if defined(__AVX2__) |
|
{ |
|
__m256i sss0, sss1; |
|
__m256i zero = _mm256_setzero_si256(); |
|
__m256i initial = _mm256_set1_epi32(1 << (coefs_precision-1)); |
|
sss0 = initial; |
|
sss1 = initial; |
|
|
|
for (; x < xmax - 3; x += 4) { |
|
__m256i pix, mmk0, mmk1, source; |
|
|
|
mmk0 = _mm256_set1_epi32(*(INT32 *) &k[x]); |
|
mmk1 = _mm256_set1_epi32(*(INT32 *) &k[x + 2]); |
|
|
|
source = _mm256_inserti128_si256(_mm256_castsi128_si256( |
|
_mm_loadu_si128((__m128i *) &lineIn0[x + xmin])), |
|
_mm_loadu_si128((__m128i *) &lineIn1[x + xmin]), 1); |
|
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8( |
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0, |
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0)); |
|
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk0)); |
|
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8( |
|
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8, |
|
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8)); |
|
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk1)); |
|
|
|
source = _mm256_inserti128_si256(_mm256_castsi128_si256( |
|
_mm_loadu_si128((__m128i *) &lineIn2[x + xmin])), |
|
_mm_loadu_si128((__m128i *) &lineIn3[x + xmin]), 1); |
|
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8( |
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0, |
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0)); |
|
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk0)); |
|
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8( |
|
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8, |
|
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8)); |
|
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk1)); |
|
} |
|
|
|
for (; x < xmax - 1; x += 2) { |
|
__m256i pix, mmk; |
|
|
|
mmk = _mm256_set1_epi32(*(INT32 *) &k[x]); |
|
|
|
pix = _mm256_inserti128_si256(_mm256_castsi128_si256( |
|
_mm_loadl_epi64((__m128i *) &lineIn0[x + xmin])), |
|
_mm_loadl_epi64((__m128i *) &lineIn1[x + xmin]), 1); |
|
pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8( |
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0, |
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0)); |
|
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk)); |
|
|
|
pix = _mm256_inserti128_si256(_mm256_castsi128_si256( |
|
_mm_loadl_epi64((__m128i *) &lineIn2[x + xmin])), |
|
_mm_loadl_epi64((__m128i *) &lineIn3[x + xmin]), 1); |
|
pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8( |
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0, |
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0)); |
|
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk)); |
|
} |
|
|
|
for (; x < xmax; x ++) { |
|
__m256i pix, mmk; |
|
|
|
// [16] xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 |
|
mmk = _mm256_set1_epi32(k[x]); |
|
|
|
// [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0 |
|
pix = _mm256_inserti128_si256(_mm256_castsi128_si256( |
|
_mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin])), |
|
_mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]), 1); |
|
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk)); |
|
|
|
pix = _mm256_inserti128_si256(_mm256_castsi128_si256( |
|
_mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin])), |
|
_mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]), 1); |
|
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk)); |
|
} |
|
|
|
sss0 = _mm256_srai_epi32(sss0, coefs_precision); |
|
sss1 = _mm256_srai_epi32(sss1, coefs_precision); |
|
sss0 = _mm256_packs_epi32(sss0, zero); |
|
sss1 = _mm256_packs_epi32(sss1, zero); |
|
sss0 = _mm256_packus_epi16(sss0, zero); |
|
sss1 = _mm256_packus_epi16(sss1, zero); |
|
lineOut0[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 0)); |
|
lineOut1[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1)); |
|
lineOut2[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 0)); |
|
lineOut3[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1)); |
|
} |
|
#else |
|
{ |
|
__m128i sss0, sss1, sss2, sss3; |
|
__m128i initial = _mm_set1_epi32(1 << (coefs_precision-1)); |
|
sss0 = initial; |
|
sss1 = initial; |
|
sss2 = initial; |
|
sss3 = initial; |
|
|
|
for (; x < xmax - 3; x += 4) { |
|
__m128i pix, mmk_lo, mmk_hi, source; |
|
__m128i mask_lo = _mm_set_epi8( |
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0); |
|
__m128i mask_hi = _mm_set_epi8( |
|
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8); |
|
|
|
mmk_lo = _mm_set1_epi32(*(INT32 *) &k[x]); |
|
mmk_hi = _mm_set1_epi32(*(INT32 *) &k[x + 2]); |
|
|
|
// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
|
source = _mm_loadu_si128((__m128i *) &lineIn0[x + xmin]); |
|
// [16] a1 a0 b1 b0 g1 g0 r1 r0 |
|
pix = _mm_shuffle_epi8(source, mask_lo); |
|
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk_lo)); |
|
// [16] a3 a2 b3 b2 g3 g2 r3 r2 |
|
pix = _mm_shuffle_epi8(source, mask_hi); |
|
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk_hi)); |
|
|
|
source = _mm_loadu_si128((__m128i *) &lineIn1[x + xmin]); |
|
pix = _mm_shuffle_epi8(source, mask_lo); |
|
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk_lo)); |
|
pix = _mm_shuffle_epi8(source, mask_hi); |
|
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk_hi)); |
|
|
|
source = _mm_loadu_si128((__m128i *) &lineIn2[x + xmin]); |
|
pix = _mm_shuffle_epi8(source, mask_lo); |
|
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk_lo)); |
|
pix = _mm_shuffle_epi8(source, mask_hi); |
|
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk_hi)); |
|
|
|
source = _mm_loadu_si128((__m128i *) &lineIn3[x + xmin]); |
|
pix = _mm_shuffle_epi8(source, mask_lo); |
|
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk_lo)); |
|
pix = _mm_shuffle_epi8(source, mask_hi); |
|
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk_hi)); |
|
} |
|
|
|
for (; x < xmax - 1; x += 2) { |
|
__m128i pix, mmk; |
|
__m128i mask = _mm_set_epi8( |
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0); |
|
|
|
// [16] k1 k0 k1 k0 k1 k0 k1 k0 |
|
mmk = _mm_set1_epi32(*(INT32 *) &k[x]); |
|
|
|
// [8] x x x x x x x x a1 b1 g1 r1 a0 b0 g0 r0 |
|
pix = _mm_loadl_epi64((__m128i *) &lineIn0[x + xmin]); |
|
// [16] a1 a0 b1 b0 g1 g0 r1 r0 |
|
pix = _mm_shuffle_epi8(pix, mask); |
|
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk)); |
|
|
|
pix = _mm_loadl_epi64((__m128i *) &lineIn1[x + xmin]); |
|
pix = _mm_shuffle_epi8(pix, mask); |
|
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk)); |
|
|
|
pix = _mm_loadl_epi64((__m128i *) &lineIn2[x + xmin]); |
|
pix = _mm_shuffle_epi8(pix, mask); |
|
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk)); |
|
|
|
pix = _mm_loadl_epi64((__m128i *) &lineIn3[x + xmin]); |
|
pix = _mm_shuffle_epi8(pix, mask); |
|
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk)); |
|
} |
|
|
|
for (; x < xmax; x ++) { |
|
__m128i pix, mmk; |
|
// [16] xx k0 xx k0 xx k0 xx k0 |
|
mmk = _mm_set1_epi32(k[x]); |
|
// [16] xx a0 xx b0 xx g0 xx r0 |
|
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin]); |
|
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk)); |
|
|
|
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]); |
|
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk)); |
|
|
|
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin]); |
|
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk)); |
|
|
|
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]); |
|
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk)); |
|
} |
|
|
|
sss0 = _mm_srai_epi32(sss0, coefs_precision); |
|
sss1 = _mm_srai_epi32(sss1, coefs_precision); |
|
sss2 = _mm_srai_epi32(sss2, coefs_precision); |
|
sss3 = _mm_srai_epi32(sss3, coefs_precision); |
|
sss0 = _mm_packs_epi32(sss0, sss0); |
|
sss1 = _mm_packs_epi32(sss1, sss1); |
|
sss2 = _mm_packs_epi32(sss2, sss2); |
|
sss3 = _mm_packs_epi32(sss3, sss3); |
|
lineOut0[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss0, sss0)); |
|
lineOut1[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss1, sss1)); |
|
lineOut2[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss2, sss2)); |
|
lineOut3[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss3, sss3)); |
|
} |
|
#endif |
|
|
|
} |
|
} |