this expression results in infinite recursion
peabody-korg opened this issue · 2 comments
peabody-korg commented
xcode 9.2, SSE4_1
this function:
// Reproducer for the infinite recursion reported in this issue.
// ORs the bitwise complement of a uint32<4> vector with the AND of two
// float32<4> comparisons against zero. In the disassembly in this report,
// the OR is emitted as a call to i_bit_or<uint32<4>, mask_int32<4>>.
simdpp::uint32<4> foo3(simdpp::float32<4> a, simdpp::float32<4> b, simdpp::uint32<4> mask)
{
// Keep this as a single expression: the operand types of `|` are what
// select the i_bit_or instantiation shown in the listing.
return ~mask | ((a == 0) & (b == 0));
}
produces infinite recursion: foo3 calls i_bit_or<uint32<4>, mask_int32<4>>, whose entire body is a jump back to itself:
foo3(simdpp::arch_sse4p1::float32<4u, void>, simdpp::arch_sse4p1::float32<4u, void>, simdpp::arch_sse4p1::uint32<4u, void>):
0000000000000550 pushq %rbp
0000000000000551 movq %rsp, %rbp
0000000000000554 subq $0x40, %rsp
0000000000000558 pcmpeqd %xmm3, %xmm3
000000000000055c pxor %xmm2, %xmm3
0000000000000560 pxor %xmm2, %xmm2
0000000000000564 cmpeqps %xmm2, %xmm0
0000000000000568 movaps %xmm0, -0x20(%rbp)
000000000000056c cmpeqps %xmm2, %xmm1
0000000000000570 movaps %xmm1, -0x10(%rbp)
0000000000000574 movdqa %xmm3, -0x40(%rbp)
0000000000000579 movaps -0x10(%rbp), %xmm0
000000000000057d andps -0x20(%rbp), %xmm0
0000000000000581 movaps %xmm0, -0x30(%rbp)
0000000000000585 leaq -0x40(%rbp), %rdi
0000000000000589 leaq -0x30(%rbp), %rsi
000000000000058d callq simdpp::arch_sse4p1::uint32<4u, void> simdpp::arch_sse4p1::detail::insn::i_bit_or<simdpp::arch_sse4p1::uint32<4u, void>, simdpp::arch_sse4p1::mask_int32<4u, void> >(simdpp::arch_sse4p1::uint32<4u, void> const&, simdpp::arch_sse4p1::mask_int32<4u, void> const&)
0000000000000592 addq $0x40, %rsp
0000000000000596 popq %rbp
0000000000000597 retq
0000000000000598 nopl
simdpp::arch_sse4p1::uint32<4u, void> simdpp::arch_sse4p1::detail::insn::i_bit_or<simdpp::arch_sse4p1::uint32<4u, void>, simdpp::arch_sse4p1::mask_int32<4u, void> >(simdpp::arch_sse4p1::uint32<4u, void> const&, simdpp::arch_sse4p1::mask_int32<4u, void> const&):
00000000000005a0 pushq %rbp
00000000000005a1 movq %rsp, %rbp
00000000000005a4 popq %rbp
00000000000005a5 jmp simdpp::arch_sse4p1::uint32<4u, void> simdpp::arch_sse4p1::detail::insn::i_bit_or<simdpp::arch_sse4p1::uint32<4u, void>, simdpp::arch_sse4p1::mask_int32<4u, void> >(simdpp::arch_sse4p1::uint32<4u, void> const&, simdpp::arch_sse4p1::mask_int32<4u, void> const&) ## simdpp::arch_sse4p1::uint32<4u, void> simdpp::arch_sse4p1::detail::insn::i_bit_or<simdpp::arch_sse4p1::uint32<4u, void>, simdpp::arch_sse4p1::mask_int32<4u, void> >(simdpp::arch_sse4p1::uint32<4u, void> const&, simdpp::arch_sse4p1::mask_int32<4u, void> const&)
This can be worked around either by changing the type of the mask parameter to simdpp::mask_float32<4>, or by bit-casting the comparison result away from a mask type:
simdpp::bit_cast<simdpp::uint32<4>>((a == 0) & (b == 0))
peabody-korg commented
There's another case of this issue. This admittedly silly code:
// Second reproducer for the infinite recursion reported in this issue.
// Here `a` is a scalar float rather than a vector, so `a <= -1000` is a
// plain scalar comparison; its result is then AND-ed with the vector
// comparison `b > 0`. In the disassembly in this report, that scalar operand
// is handled by a call to i_make_const<mask_float32<4>, float, 1u>.
simdpp::mask_float32x4 foo2b(float a, simdpp::float32x4 b)
{
// Keep the scalar/vector mix exactly as written: it is what triggers the
// i_make_const instantiation shown in the listing.
return (a <= -1000) & (b > 0);
}
produces:
Inspiration::foo2b(float, simdpp::arch_sse4p1::float32<4u, void>):
0000000000001af0 pushq %rbp
0000000000001af1 movq %rsp, %rbp
0000000000001af4 subq $0x30, %rsp
0000000000001af8 xorl %eax, %eax
0000000000001afa ucomiss 0x2df(%rip), %xmm0
0000000000001b01 setbe %al
0000000000001b04 xorps %xmm0, %xmm0
0000000000001b07 cmpltps %xmm1, %xmm0
0000000000001b0b movaps %xmm0, -0x20(%rbp)
0000000000001b0f movl %eax, -0x8(%rbp)
0000000000001b12 leaq -0x30(%rbp), %rdi
0000000000001b16 leaq -0x8(%rbp), %rsi
0000000000001b1a xorl %edx, %edx
0000000000001b1c callq void simdpp::arch_sse4p1::detail::insn::i_make_const<simdpp::arch_sse4p1::mask_float32<4u, void>, float, 1u>(simdpp::arch_sse4p1::mask_float32<4u, void>&, simdpp::arch_sse4p1::expr_vec_make_const<float, 1u> const&, unsigned int)
0000000000001b21 movaps -0x20(%rbp), %xmm0
0000000000001b25 andps -0x30(%rbp), %xmm0
0000000000001b29 addq $0x30, %rsp
0000000000001b2d popq %rbp
0000000000001b2e retq
0000000000001b2f nop
void simdpp::arch_sse4p1::detail::insn::i_make_const<simdpp::arch_sse4p1::mask_float32<4u, void>, float, 1u>(simdpp::arch_sse4p1::mask_float32<4u, void>&, simdpp::arch_sse4p1::expr_vec_make_const<float, 1u> const&, unsigned int):
0000000000001b30 pushq %rbp
0000000000001b31 movq %rsp, %rbp
0000000000001b34 popq %rbp
0000000000001b35 jmp void simdpp::arch_sse4p1::detail::insn::i_make_const<simdpp::arch_sse4p1::mask_float32<4u, void>, float, 1u>(simdpp::arch_sse4p1::mask_float32<4u, void>&, simdpp::arch_sse4p1::expr_vec_make_const<float, 1u> const&, unsigned int) ## void simdpp::arch_sse4p1::detail::insn::i_make_const<simdpp::arch_sse4p1::mask_float32<4u, void>, float, u>(simdpp::arch_sse4p1::mask_float32<4u, void>&, simdpp::arch_sse4p1::expr_vec_make_const<float, u> const&, unsigned int)
With the parameter `a` typed more correctly as a float32x4, this produces the expected code.
Useless code this may be, but it really should only produce an invalid mask, not crash (unless crashing is within the scope of undefined behavior).