Cannot build with SSE<4.1

I have access to a cluster with AMD processors. They support SSE2, SSE3 and SSE4A but not SSE 4.1:

$ bsub -I gcc -dM -E - -march=native </dev/null | grep -i sse
#define __SSE2_MATH__ 1
#define __SSE4A__ 1
#define __SSE_MATH__ 1
#define __SSE2__ 1
#define __SSE__ 1
#define __SSE3__ 1

$ bsub -I grep flags /proc/cpuinfo | uniq | tr ' ' '\n' | grep sse
<<Waiting for dispatch ...>>
<<Starting on cpt15>>
sse
sse2
sse4a
misalignsse

Building v0.99.7f with gcc 6.1.1 fails due to the availability of _mm_stream_load_si128 intrinsics on line 4667 here:

BURST/burst.c

Lines 4665 to 4670 in e1228f0

    
           DualCoil *RefSlide = RefClump[ri]; 
        
           for (uint32_t w = 0; w < ClumpLen[ri]; w += 2) { 
        
           	__m128i org = _mm_stream_load_si128((void*)(RefSlide++)); 
        
           	__m128i ex1 = _mm_and_si128(org,_mm_set1_epi8(0xF)); 
        
           	__m128i ex2 = _mm_and_si128(_mm_srli_epi16(org,4),_mm_set1_epi8(0xF)); 
        
           	_mm_store_si128((void*)(rclump+w),ex1);

Indeed:

$ bsub -I gcc burst.c -march=native -fwhole-program -fopenmp -Ofast
In file included from /software/lib/gcc/x86_64-redhat-linux/6.1.1/include/immintrin.h:37:0,
                 from burst.c:30:
burst.c: In function ‘do_alignments._omp_fn.57’:
/software/lib/gcc/x86_64-redhat-linux/6.1.1/include/smmintrin.h:582:1: error: inlining failed in call to always_inline ‘_mm_stream_load_si128’: target specific option mismatch
 _mm_stream_load_si128 (__m128i *__X)
 ^~~~~~~~~~~~~~~~~~~~~
burst.c:4667:14: note: called from here
      __m128i org = _mm_stream_load_si128((void*)(RefSlide++));
              ^~~
In file included from /software/lib/gcc/x86_64-redhat-linux/6.1.1/include/immintrin.h:37:0,
                 from burst.c:30:
/software/lib/gcc/x86_64-redhat-linux/6.1.1/include/smmintrin.h:582:1: error: inlining failed in call to always_inline ‘_mm_stream_load_si128’: target specific option mismatch
 _mm_stream_load_si128 (__m128i *__X)
 ^~~~~~~~~~~~~~~~~~~~~
burst.c:4667:14: note: called from here
      __m128i org = _mm_stream_load_si128((void*)(RefSlide++));
              ^~~

I am wondering if it's still possible to build with SSE<4.1 only with relatively minor changes? If not i guess it's worth removing the relevant sections at the top of burst.c:

BURST/burst.c

Lines 30 to 57 in e1228f0

    
           #include <immintrin.h> 
        
           #ifdef __AVX__ 
        
           	#define ASMVER "AVX-128" 
        
           #elif __SSE4_1__ 
        
           	#define ASMVER "SSE4.1" 
        
           #else 
        
           	#define _mm_popcnt_u64 __builtin_popcountll 
        
           	#define _mm_blendv_epi8(f,t,m) \ 
        
           		_mm_xor_si128(_mm_and_si128(_mm_xor_si128(f,t),m),f)  
        
           	#define _mm_blendv_ps(f,t,m) \ 
        
           		_mm_xor_ps(_mm_and_ps(_mm_xor_ps(f,t),m),f) 
        
           	#define _mm_cvtepu8_epi32(x) \ 
        
           		_mm_unpacklo_epi16(_mm_unpacklo_epi8(x,_mm_setzero_si128()),_mm_setzero_si128()) 
        
           	#define _mm_extract_epi8(a,x) \ 
        
           		(0xff & (_mm_extract_epi16(a,((x)>>1))>>(8*((x) & 0x01)))) 
        
           	#ifdef __SSSE3__ 
        
           		#define ASMVER "SSSE3" 
        
           	#else 
        
           		#define ASMVER "SSE2" 
        
           		#define _mm_cmple_epu16(x,y) \ 
        
           			_mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128()) 
        
           		#define _mm_blendv_si128 (x, y, mask) \ 
        
           			_mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(mask, y)) 
        
           		#define _mm_min_epu16(x,y) \ 
        
           			_mm_blendv_si128(y, x, _mm_cmple_epu16(x, y)) 
        
           		#define _mm_lddqu_si128 _mm_loadu_si128 
        
           	#endif 
        
           #endif

I managed to build using -msse2 -mno-sse3 by providing an implementation for _mm_shuffle_epi8 discussed here.

I tried building an index with a very short test sequence

$ cat test.fa 
>REF
TTTGCTGTTCCTGCATGTAGTTTAAACGAGATTGCCAGCACCGGGTATCA
TTCACCATTTTTCTTTTCGTTAACTTGCCGTCAGCCTTTTCTTTGACCTC
TTCTTTCTGTTCATGTGTATTTGCTGTCTCTTAGCCCAGACTTCCCGTGT
CCTTTCCACCGGGCCTTTGAGAGGTCACAGGGTCTTGATGCTGTGGTCTT
CATCTGCAGGTGTCTGACTTCCAGCAACTGCTGGCCTGTGCCAGGGTGCA
AGCTGAGCACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAG
TGGGATGGGCCATTGTTCATCTTCTGGCCCCTGTTGTCTGCATGTAACTT
AATACCACAACCAGGCATAGGGGAAAGATTGGAGGAAAGATGAGTGAGAG
CATCAACTTCTCTCACAACCTAGGCCAGTAAGTAGTGCTTGTGCTCATCT
CCTTGGCTGTGATACGTGGCCGGCCCTCGCTCCAGCAGCTGGACCCCTAC
CTGCCGTCTGCTGCCATCGGAGCCCAAAGCCGGGCTGTGACTGCTCAGAC
CAGCCGGCTGGAGGGAGGGGCTCAGCAGGTCTGGCTTTGGCCCTGGGAGA
GCAGGTGGAAGATCAGGCAGGCCATCGCTGCCACAGAACCCAGTGGATTG
AGCTGAGCACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAG
TGGGATGGGCCATTGTTCATCTTCTGGCCCCTGTTGTCTGCATGTAACTT

and building the index works just fine:

$ bsub -Ip ./burst -r test.fa -a test.acx -o test.edx -d QUICK -i 0.97
Job <317060> is submitted to default queue <normal>.
<<Waiting for dispatch ...>>
<<Starting on cpt11>>
This is BURST [v0.99.7f]
 --> Using accelerator file test.acx
 --> Creating QUICK database (assuming max query length 500)
 --> Setting identity threshold to 0.970000
Using up to SSE2 with 1 threads.

Parsed 1 references.
Producing edb scaffold...
Avg. R Divergence: 0.000000 (0 dupes, 1 uniq)
There are 1 references and hence 0 clumps (+1)
Average R pack length = 750.000000
Writing database...
 --> neoEDB: Original pack table: 750 [now 375]
 --> neoEDB: Number read: 750, written: 375
Database written.
Generating accelerator 'test.acx'
Note: N-penalized accelerator not usable for unpenalized alignment
Finished scanning pass [0.534942].          
Number bad (initial) = 0
Filling accelerator [0 / 1]
Done scanning [0.001281]; writing...
Total accelerants stored: 650 (0 bad)
 --> [Re-Accel] Writing SMALL format acx...
Wrote accelerator (42.578336).

Alignment segfaults during the "Preparing SSE2 profiles" step

$ cat read.fa
>testread
GCAGGTGGAAGATCAGGCAGGCCATCGCTGCCACAGAACCCAGTGGATTGAGCTGAGCACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAGTGGGATGGGCCATTGTTCATCTTCTGGCCCCTGTTGTCTGCATGTAACTT

$ ./burst -q read.fa -a test.acx -r test.edx -t 2 -o test.out
This is BURST [v0.99.7f]
 --> Using accelerator file test.acx
 --> Setting threads to 2
Using up to SSE2 with 2 threads.
 --> [Accel] Accelerator found. Parsing...
 --> [Accel] Total accelerants: 650 [bytes = 1950]
 --> [Accel] Reading 0 ambiguous entries

EDB database provided. Parsing...
 --> EDB: Fingerprints are DISABLED
 --> EDB: Parsing compressed headers
 --> EDB: 1 refs [1 orig], 1 clumps, 750 maxR
Parsed 1 queries (0.000802). Calculating minMax... 
Found min 150, max 150 (0.000003).
Converting queries... Converted (0.000002)
Copying queries... Copied (0.000002)
Sorting queries... Sorted (0.079578)
Copying indices... Copied (0.000003)
Determining uniqueness... Done (0.000002). Number unique: 1
Collecting unique sequences... Done (0.000002)
Creating data structures... Done (0.000002) [maxED: 4]
Determining query ambiguity... Determined (0.000005)
Creating bins... Created (0.000002); Unambig: 1, ambig: 0, super-ambig: 0 [0,1,1]
Calculating divergence... Calculated (0.000002) [1.000000 avg div; 0 max]
WARNING: program was built without SSSE3. This may be slow.
Preparing SSE2 profiles...
Segmentation fault (core dumped)
[ikolpako@frt burst]$

The segfault occurs one the line 1611 within the function create_sse2_profiles:

BURST/burst.c

Lines 1606 to 1612 in e1228f0

    
           for (uint32_t i = 0; i < numRclumps; ++i) { // for each ref 
        
           	for (uint32_t j = 0; j < ClumpLen[i]; ++j) // for each letter in the ref 
        
           		for (int k = 0; k < SCD; ++k) // for each possibility of query letter 
        
           			for (int z = 0; z < VECSZ; ++z) // against each parallel ref letter 
        
           				ProfClump[i][j*SCD+k].u8[z] =  
        
           					SCORENVedN[RefClump[i][j].u8[z]*SCD + k]; 
        
           }

After a bit of debugging I guessed that the expression RefClump[i][j].u8[z]*SCD + k is definitely too big to be a valid index for SCORENVedN. I tried replacing it with RefClump[i][j*SCD+k].u8[z] (by analogy with the LHS) and that results in the program terminating gracefully but producing no alignments in the output.

The SSE41 version also segfaults during alignment on an Intel machine supporting SSE41.

On the other hand, I tried to run the prebuilt binary on my old laptop which has AVX and it works like a charm. Looks like it might be not worth trying to make BURST run without AVX at all.

For what it's worth my branch that builds with SSE2 only is here https://github.com/ilya-kolpakov/BURST/tree/sse2

Do these binaries work? They're compiled with SSSE3 only.

burst_SSSE3.zip

The next version of the source should compile fine as I've added:
#define _mm_stream_load_si128 _mm_load_si128
after the existing code that says

#elif __SSE4_1__
	#define ASMVER "SSE4.1"
#else

Seems old versions of GCC didn't complain -- probably just did the swap automatically. Thanks for catching this!

	DualCoil *RefSlide = RefClump[ri];
	for (uint32_t w = 0; w < ClumpLen[ri]; w += 2) {
	__m128i org = _mm_stream_load_si128((void*)(RefSlide++));
	__m128i ex1 = _mm_and_si128(org,_mm_set1_epi8(0xF));
	__m128i ex2 = _mm_and_si128(_mm_srli_epi16(org,4),_mm_set1_epi8(0xF));
	_mm_store_si128((void*)(rclump+w),ex1);

	#include <immintrin.h>
	#ifdef __AVX__
	#define ASMVER "AVX-128"
	#elif __SSE4_1__
	#define ASMVER "SSE4.1"
	#else
	#define _mm_popcnt_u64 __builtin_popcountll
	#define _mm_blendv_epi8(f,t,m) \
	_mm_xor_si128(_mm_and_si128(_mm_xor_si128(f,t),m),f)
	#define _mm_blendv_ps(f,t,m) \
	_mm_xor_ps(_mm_and_ps(_mm_xor_ps(f,t),m),f)
	#define _mm_cvtepu8_epi32(x) \
	_mm_unpacklo_epi16(_mm_unpacklo_epi8(x,_mm_setzero_si128()),_mm_setzero_si128())
	#define _mm_extract_epi8(a,x) \
	(0xff & (_mm_extract_epi16(a,((x)>>1))>>(8*((x) & 0x01))))
	#ifdef __SSSE3__
	#define ASMVER "SSSE3"
	#else
	#define ASMVER "SSE2"
	#define _mm_cmple_epu16(x,y) \
	_mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128())
	#define _mm_blendv_si128 (x, y, mask) \
	_mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(mask, y))
	#define _mm_min_epu16(x,y) \
	_mm_blendv_si128(y, x, _mm_cmple_epu16(x, y))
	#define _mm_lddqu_si128 _mm_loadu_si128
	#endif
	#endif

	for (uint32_t i = 0; i < numRclumps; ++i) { // for each ref
	for (uint32_t j = 0; j < ClumpLen[i]; ++j) // for each letter in the ref
	for (int k = 0; k < SCD; ++k) // for each possibility of query letter
	for (int z = 0; z < VECSZ; ++z) // against each parallel ref letter
	ProfClump[i][j*SCD+k].u8[z] =
	SCORENVedN[RefClump[i][j].u8[z]*SCD + k];
	}