p12tic/libsimdpp

this expression results in infinite recursion

peabody-korg opened this issue · 2 comments

Environment: Xcode 9.2, targeting SSE4.1.

The following function:

// Minimal repro: ORs an inverted uint32<4> vector with the mask obtained by
// AND-ing two float32<4> equality comparisons. In the reported build the `|`
// is emitted as a call to i_bit_or<uint32<4>, mask_int32<4>>, whose body is
// just a jump back to itself (infinite recursion).
simdpp::uint32<4> foo3(simdpp::float32<4> a, simdpp::float32<4> b, simdpp::uint32<4> mask)
{
	return ~mask | ((a == 0) & (b == 0));
}

compiles to code that recurses infinitely:

foo3(simdpp::arch_sse4p1::float32<4u, void>, simdpp::arch_sse4p1::float32<4u, void>, simdpp::arch_sse4p1::uint32<4u, void>):
0000000000000550	pushq	%rbp
0000000000000551	movq	%rsp, %rbp
0000000000000554	subq	$0x40, %rsp
0000000000000558	pcmpeqd	%xmm3, %xmm3
000000000000055c	pxor	%xmm2, %xmm3
0000000000000560	pxor	%xmm2, %xmm2
0000000000000564	cmpeqps	%xmm2, %xmm0
0000000000000568	movaps	%xmm0, -0x20(%rbp)
000000000000056c	cmpeqps	%xmm2, %xmm1
0000000000000570	movaps	%xmm1, -0x10(%rbp)
0000000000000574	movdqa	%xmm3, -0x40(%rbp)
0000000000000579	movaps	-0x10(%rbp), %xmm0
000000000000057d	andps	-0x20(%rbp), %xmm0
0000000000000581	movaps	%xmm0, -0x30(%rbp)
0000000000000585	leaq	-0x40(%rbp), %rdi
0000000000000589	leaq	-0x30(%rbp), %rsi
000000000000058d	callq	simdpp::arch_sse4p1::uint32<4u, void> simdpp::arch_sse4p1::detail::insn::i_bit_or<simdpp::arch_sse4p1::uint32<4u, void>, simdpp::arch_sse4p1::mask_int32<4u, void> >(simdpp::arch_sse4p1::uint32<4u, void> const&, simdpp::arch_sse4p1::mask_int32<4u, void> const&)
0000000000000592	addq	$0x40, %rsp
0000000000000596	popq	%rbp
0000000000000597	retq
0000000000000598	nopl

simdpp::arch_sse4p1::uint32<4u, void> simdpp::arch_sse4p1::detail::insn::i_bit_or<simdpp::arch_sse4p1::uint32<4u, void>, simdpp::arch_sse4p1::mask_int32<4u, void> >(simdpp::arch_sse4p1::uint32<4u, void> const&, simdpp::arch_sse4p1::mask_int32<4u, void> const&):
00000000000005a0	pushq	%rbp
00000000000005a1	movq	%rsp, %rbp
00000000000005a4	popq	%rbp
00000000000005a5	jmp	simdpp::arch_sse4p1::uint32<4u, void> simdpp::arch_sse4p1::detail::insn::i_bit_or<simdpp::arch_sse4p1::uint32<4u, void>, simdpp::arch_sse4p1::mask_int32<4u, void> >(simdpp::arch_sse4p1::uint32<4u, void> const&, simdpp::arch_sse4p1::mask_int32<4u, void> const&) ## simdpp::arch_sse4p1::uint32<4u, void> simdpp::arch_sse4p1::detail::insn::i_bit_or<simdpp::arch_sse4p1::uint32<4u, void>, simdpp::arch_sse4p1::mask_int32<4u, void> >(simdpp::arch_sse4p1::uint32<4u, void> const&, simdpp::arch_sse4p1::mask_int32<4u, void> const&)

This can be worked around either by changing the type of `mask` to simdpp::mask_float32<4>, or by bit-casting the comparison result so that it is no longer a mask type:
simdpp::bit_cast<simdpp::uint32<4>>((a == 0) & (b == 0))

Fixed in 16524fb. Regression has been introduced in 4073bd5. Thanks!

There's another case of this issue. This admittedly silly code:

// Second repro: `a` is a scalar float, so (a <= -1000) is a scalar comparison
// whose result is then combined via `&` with the mask from (b > 0). In the
// reported build this emits a call to
// i_make_const<mask_float32<4>, float, 1u>, whose body is just a jump back to
// itself (infinite recursion).
simdpp::mask_float32x4 foo2b(float a, simdpp::float32x4 b)
{
	return (a <= -1000) & (b > 0);
}

produces:

Inspiration::foo2b(float, simdpp::arch_sse4p1::float32<4u, void>):
0000000000001af0	pushq	%rbp
0000000000001af1	movq	%rsp, %rbp
0000000000001af4	subq	$0x30, %rsp
0000000000001af8	xorl	%eax, %eax
0000000000001afa	ucomiss	0x2df(%rip), %xmm0
0000000000001b01	setbe	%al
0000000000001b04	xorps	%xmm0, %xmm0
0000000000001b07	cmpltps	%xmm1, %xmm0
0000000000001b0b	movaps	%xmm0, -0x20(%rbp)
0000000000001b0f	movl	%eax, -0x8(%rbp)
0000000000001b12	leaq	-0x30(%rbp), %rdi
0000000000001b16	leaq	-0x8(%rbp), %rsi
0000000000001b1a	xorl	%edx, %edx
0000000000001b1c	callq	void simdpp::arch_sse4p1::detail::insn::i_make_const<simdpp::arch_sse4p1::mask_float32<4u, void>, float, 1u>(simdpp::arch_sse4p1::mask_float32<4u, void>&, simdpp::arch_sse4p1::expr_vec_make_const<float, 1u> const&, unsigned int)
0000000000001b21	movaps	-0x20(%rbp), %xmm0
0000000000001b25	andps	-0x30(%rbp), %xmm0
0000000000001b29	addq	$0x30, %rsp
0000000000001b2d	popq	%rbp
0000000000001b2e	retq
0000000000001b2f	nop
void simdpp::arch_sse4p1::detail::insn::i_make_const<simdpp::arch_sse4p1::mask_float32<4u, void>, float, 1u>(simdpp::arch_sse4p1::mask_float32<4u, void>&, simdpp::arch_sse4p1::expr_vec_make_const<float, 1u> const&, unsigned int):
0000000000001b30	pushq	%rbp
0000000000001b31	movq	%rsp, %rbp
0000000000001b34	popq	%rbp
0000000000001b35	jmp	void simdpp::arch_sse4p1::detail::insn::i_make_const<simdpp::arch_sse4p1::mask_float32<4u, void>, float, 1u>(simdpp::arch_sse4p1::mask_float32<4u, void>&, simdpp::arch_sse4p1::expr_vec_make_const<float, 1u> const&, unsigned int) ## void simdpp::arch_sse4p1::detail::insn::i_make_const<simdpp::arch_sse4p1::mask_float32<4u, void>, float, 1u>(simdpp::arch_sse4p1::mask_float32<4u, void>&, simdpp::arch_sse4p1::expr_vec_make_const<float, 1u> const&, unsigned int)

With the parameter `a` typed more correctly as a float32x4 instead of a scalar float, this produces the expected code.

Useless as this code may be, it really should only produce an invalid mask, not crash (unless crashing is considered within the scope of undefined behavior here).