Possible optimization: SIMD unrolled loops
jakobnissen opened this issue · 1 comment
SIMD is crucial for high-performance computing and often gives speedups of 2x or more. Unfortunately, adding SIMD capabilities to Automa is quite tricky. This is because Automa's generated code emulates a finite state machine, which does a state transition on every individual byte. It is therefore inherently serial.
There is one exception. With a GOTO-generated FSM, when transitioning through an edge that is a self-loop with no actions, the machine literally does nothing except advance the pointer location until the read byte is no longer part of that edge's `ByteSet`. Automa already special-cases this scenario and creates code with `generate_unrolled_loop`. This should be possible to vectorize.
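Conceptually, the special-cased loop just skips bytes while they stay on the self-looping edge. A minimal scalar sketch of the idea (not Automa's actual generated code; `in_byteset` stands in for the edge's `ByteSet` membership test):

```julia
# Minimal sketch, not Automa's generated code: advance the position while
# the current byte keeps the machine on the same action-free self-loop.
function skip_self_loop(data::Vector{UInt8}, p::Int, in_byteset)
    @inbounds while p <= length(data) && in_byteset(data[p])
        p += 1  # no actions to run, so only the position advances
    end
    return p
end
```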
We can load 8 bytes at a time from the vector, let it be `v::NTuple{8, UInt8}`, and check whether all 8 bytes lead to the same state transition (i.e., are contained in the `ByteSet`).
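For illustration, such a tuple could be produced like this (a hypothetical helper, not part of Automa):

```julia
# Hypothetical helper: read 8 consecutive bytes starting at index p as an
# NTuple, so the checks below can compile to branch-free SIMD code.
@inline load8(data::Vector{UInt8}, p::Int) = ntuple(i -> @inbounds(data[p + i - 1]), 8)
```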
Case 1: The `ByteSet` has a single member. Let's say the member is `0x08`:
```julia
w(v) = reduce(&, ntuple(i -> v[i] == 0x08, 8))
```

produces:

```asm
.section __TEXT,__text,regular,pure_instructions
vpmovzxbw (%rdi), %xmm0 ## xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
movabsq $5446708368, %rax ## imm = 0x144A62C90
vpcmpeqw (%rax), %xmm0, %xmm0
vpacksswb %xmm0, %xmm0, %xmm0
vpmovmskb %xmm0, %eax
cmpb $-1, %al
sete %al
retq
nopw %cs:(%rax,%rax)
nopl (%rax,%rax)
```
Although this case is pretty rare, it's well vectorized.
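Using `w` from Case 1 and the hypothetical `load8` above, the check can drive an 8-bytes-at-a-time skip loop (a sketch; the existing scalar loop handles the tail):

```julia
# Sketch: advance 8 bytes per iteration while the whole chunk passes the
# vectorized check `w`; the caller's scalar loop finishes the remainder.
function skip8(data::Vector{UInt8}, p::Int)
    @inbounds while p + 7 <= length(data) && w(load8(data, p))
        p += 8
    end
    return p
end
```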
Case 2: The `ByteSet`'s extremes are separated by no more than 63 elements.
We compute a bitset in a 64-bit integer, let that be e.g. `0x00000000146aeac9`. It'll be known at compile time, as will the offset to subtract (here `11`, the set's smallest member). We can then do

```julia
reduce(&, ntuple(i -> 0x00000000146aeac9 >>> ((v[i] - 11) & 63) & UInt(1) === UInt(1), 8))
```

and obtain:

```asm
.section __TEXT,__text,regular,pure_instructions
vpmovzxbw (%rdi), %xmm0 ## xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
movabsq $5446714096, %rax ## imm = 0x144A642F0
vpaddw (%rax), %xmm0, %xmm0
movabsq $5446714112, %rax ## imm = 0x144A64300
vpand (%rax), %xmm0, %xmm0
vpackuswb %xmm0, %xmm0, %xmm1
vpmovzxbq %xmm1, %ymm1 ## ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
movabsq $5446714128, %rax ## imm = 0x144A64310
vpshufb (%rax), %xmm0, %xmm0
vpmovzxbq %xmm0, %ymm0 ## ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
movabsq $5446714088, %rax ## imm = 0x144A642E8
vpbroadcastq (%rax), %ymm2
vpsrlvq %ymm0, %ymm2, %ymm0
vpsrlvq %ymm1, %ymm2, %ymm1
vpand %ymm0, %ymm1, %ymm0
vextracti128 $1, %ymm0, %xmm1
vpand %xmm1, %xmm0, %xmm0
vpshufd $78, %xmm0, %xmm1 ## xmm1 = xmm0[2,3,0,1]
vpand %xmm1, %xmm0, %xmm0
vpextrb $0, %xmm0, %eax
andb $1, %al
vzeroupper
retq
nopw (%rax,%rax)
```
We can also use this approach if the *inverted* `ByteSet`'s extremes are 64 apart or closer. Then we just reduce with `|` and use the bit-inverted integer.
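For concreteness, here is one way the compile-time constant could be derived from the byte set (an illustrative sketch; `bitset64` is not an Automa function):

```julia
# Sketch: build the 64-bit bitset for a byte set whose extremes span at
# most 64 values. Returns the lowest member (the shift offset, e.g. the
# 11 above) and the mask with one bit per member.
function bitset64(bytes)
    lo = minimum(bytes)
    mask = UInt64(0)
    for b in bytes
        mask |= UInt64(1) << (b - lo)
    end
    return lo, mask
end
```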
Case 3: The general case
This is a bit harder. However, we can define a few helper functions:

```julia
@inline baz(x::UInt64, b::Integer) = ((x >>> (b & 63)) & UInt(1)) === UInt(1)

@inline function bar(b::UInt8, bs::NTuple{4, UInt64})
    ifelse(b > 0x7f,
        ifelse(b > 0xbf,
            baz(bs[1], b - UInt8(192)),
            baz(bs[2], b - UInt8(128))
        ),
        ifelse(b > 0x3f,
            baz(bs[3], b - UInt8(64)),
            baz(bs[4], b - UInt8(0))
        )
    )
end
```
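Here the 256-bit set is stored as four 64-bit words, with `bs[1]` covering bytes 192–255 down to `bs[4]` covering bytes 0–63. A hypothetical helper showing that layout (not from the Automa codebase):

```julia
# Hypothetical: pack a byte set into the 4-word layout `bar` expects,
# with bs[1] holding bytes 192..255 and bs[4] holding bytes 0..63.
function to_4words(bytes)
    ws = fill(UInt64(0), 4)
    for b in bytes
        ws[4 - (b >>> 6)] |= UInt64(1) << (b & 63)
    end
    return (ws[1], ws[2], ws[3], ws[4])
end
```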
We can now do
```julia
@inline function w(v)
    r = true
    @inbounds for i in 1:8
        r &= bar(v[i], (0x7b651ac77b723d2a, 0x09a4d04a8b9e88bb, 0x0d3be793e8ec2ab4, 0x52f036d61fe3b7f9))
    end
    r
end
```

and obtain:

```asm
.section __TEXT,__text,regular,pure_instructions
vpmovzxbw (%rdi), %xmm1 ## xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
movabsq $5446737520, %rax ## imm = 0x144A69E70
vpand (%rax), %xmm1, %xmm2
vpsllw $8, %xmm1, %xmm0
vpsraw $8, %xmm0, %xmm0
vpcmpeqd %xmm3, %xmm3, %xmm3
vpcmpgtw %xmm3, %xmm0, %xmm0
vpshufd $78, %xmm0, %xmm3 ## xmm3 = xmm0[2,3,0,1]
vpmovsxwq %xmm3, %ymm3
vpmovsxwq %xmm0, %ymm0
movabsq $5446737536, %rax ## imm = 0x144A69E80
vmovdqa (%rax), %xmm4
vpcmpgtw %xmm2, %xmm4, %xmm4
vpmovsxwq %xmm4, %ymm5
vpshufd $78, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,0,1]
vpmovsxwq %xmm4, %ymm4
movabsq $5446737552, %rax ## imm = 0x144A69E90
vpand (%rax), %xmm1, %xmm1
movabsq $5446737568, %rax ## imm = 0x144A69EA0
vpshufb (%rax), %xmm1, %xmm6
vpmovzxbq %xmm6, %ymm6 ## ymm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero,xmm6[2],zero,zero,zero,zero,zero,zero,zero,xmm6[3],zero,zero,zero,zero,zero,zero,zero
vpackuswb %xmm0, %xmm1, %xmm1
movabsq $5446737472, %rax ## imm = 0x144A69E40
vbroadcastsd (%rax), %ymm7
movabsq $5446737480, %rax ## imm = 0x144A69E48
vbroadcastsd (%rax), %ymm8
vblendvpd %ymm4, %ymm7, %ymm8, %ymm4
vblendvpd %ymm5, %ymm7, %ymm8, %ymm5
movabsq $5446737584, %rax ## imm = 0x144A69EB0
vmovdqa (%rax), %xmm7
vpcmpgtw %xmm2, %xmm7, %xmm2
vpmovsxwq %xmm2, %ymm7
vpshufd $78, %xmm2, %xmm2 ## xmm2 = xmm2[2,3,0,1]
vpmovsxwq %xmm2, %ymm2
movabsq $5446737488, %rax ## imm = 0x144A69E50
vbroadcastsd (%rax), %ymm8
movabsq $5446737496, %rax ## imm = 0x144A69E58
vbroadcastsd (%rax), %ymm9
vblendvpd %ymm2, %ymm8, %ymm9, %ymm2
vblendvpd %ymm3, %ymm2, %ymm4, %ymm2
vblendvpd %ymm7, %ymm8, %ymm9, %ymm3
vblendvpd %ymm0, %ymm3, %ymm5, %ymm0
vpmovzxbq %xmm1, %ymm1 ## ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
movabsq $5446737504, %rax ## imm = 0x144A69E60
vpbroadcastq (%rax), %ymm3
vpsllvq %ymm1, %ymm3, %ymm1
vandpd %ymm1, %ymm0, %ymm0
vpsllvq %ymm6, %ymm3, %ymm1
vandpd %ymm1, %ymm2, %ymm1
vxorpd %xmm2, %xmm2, %xmm2
vpcmpeqq %ymm2, %ymm0, %ymm0
vpcmpeqd %ymm3, %ymm3, %ymm3
vpxor %ymm3, %ymm0, %ymm0
vpcmpeqq %ymm2, %ymm1, %ymm1
vpxor %ymm3, %ymm1, %ymm1
vpackssdw %ymm1, %ymm0, %ymm0
vpermq $216, %ymm0, %ymm0 ## ymm0 = ymm0[0,2,1,3]
vmovmskps %ymm0, %eax
cmpb $-1, %al
sete %al
vzeroupper
retq
nopw %cs:(%rax,%rax)
nopl (%rax)
```
Note that while this is a lot of code, the current general case emits nested branches for each of the 8 bytes in every unrolled loop iteration, so the vectorized version will presumably still be many times faster.
Okay, so I accidentally stumbled across an ingenious use of the `vpshufb` instruction. It has been present on all x86 chips since about 2006 (it's part of the SSSE3 instruction set) and allows you to construct vectorized lookup tables. See also this link, where the author discusses its use for checking membership in byte sets.
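The core idea, in scalar form: split each byte into its two nibbles and use them to index two 16-entry tables whose entries AND together to a nonzero value exactly for set members. A sketch (assuming, for simplicity, that all members have a high nibble below 8; the general generators need extra tables, as in the generated code below):

```julia
# Scalar sketch of the vpshufb byte-set trick. lotab[lo] has bit h set iff
# the byte (h << 4) | lo is in the set; hitab[hi] selects that bit.
# Simplifying assumption: all set members have a high nibble < 8.
function nibble_tables(bytes)
    lotab = fill(0x00, 16)
    for b in bytes
        lotab[(b & 0x0f) + 1] |= 0x01 << (b >>> 4)
    end
    hitab = UInt8[h < 8 ? 0x01 << h : 0x00 for h in 0:15]
    return lotab, hitab
end

in_set(b::UInt8, lotab, hitab) = (lotab[(b & 0x0f) + 1] & hitab[(b >>> 4) + 1]) != 0x00
```

With `vpshufb`, both table lookups happen for 16 (or 32 with AVX2) bytes in a single instruction.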
With this, we can do much better: process 16 bytes per iteration using SSSE3 alone. Modern CPUs with AVX2 can do 32, so we use that by default and fall back to SSSE3 if AVX2 is not available. I have made 13 different code generators for different cases.
Here is the SIMD loop of the worst case, which covers all possible edges; we default to it when no specialized generator applies:

```asm
movq %rsi, %rax
vmovdqu -1(%rdx,%rsi), %ymm5
vpshufb %ymm5, %ymm0, %ymm6
vpxor %ymm1, %ymm5, %ymm7
vpshufb %ymm7, %ymm2, %ymm7
vpor %ymm6, %ymm7, %ymm6
vpsrlw $4, %ymm5, %ymm5
vpand %ymm3, %ymm5, %ymm5
vpshufb %ymm5, %ymm4, %ymm5
vpand %ymm6, %ymm5, %ymm5
vptest %ymm5, %ymm5
jne L157
leaq 32(%rax), %rsi
cmpq %rax, %rdi
jge L96
```
And here is the most typical SIMD loop, used for most SIMD-able edges:

```asm
movq %rsi, %rax
vmovdqu -1(%rdx,%rsi), %ymm1
vpshufb %ymm1, %ymm0, %ymm2
vpxor %ymm1, %ymm2, %ymm1
vptest %ymm1, %ymm1
jne L66
leaq 32(%rax), %rsi
cmpq %rax, %rdi
jge L32
```
This is beautiful SIMD code. Even the worst of the SIMD generators produces code which chews through a 100 MB vector at 11 GB/s. For comparison, the fastest operation I can think of is `reduce(⊻, x, init=0x00)`, which runs at about 16 GB/s on my computer.
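For reference, that baseline can be measured along these lines (a sketch using BenchmarkTools; throughput numbers will vary by machine):

```julia
using BenchmarkTools

# Baseline from above: XOR-reduce a 100 MB byte vector, about the cheapest
# whole-vector operation available, to put the 11 GB/s figure in context.
x = rand(UInt8, 100_000_000)
@btime reduce(⊻, $x, init=0x00)
```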
My work in progress is here: https://github.com/jakobnissen/simd_automa