kavon/ghc-llvm

Jump Table Index Introduction

Opened this issue · 3 comments

kavon commented

spectral/hartel/typecheck segfaults due to a jump to an address that is not code. This program will fault even if compiled without GHC optimizations, and the failure happens regularly (only input 600 tested so far).

LBB247_10:                              ##   in Loop: Header=BB247_5 Depth=1
	pushq	%rax
	movq	%rbx, %rax
LBB247_5:                               ## %cbJG
                                        ## =>This Inner Loop Header: Depth=1
	movq	8(%rbp), %r14
	movq	16(%rbp), %rcx
	movl	%eax, %edx
	andl	$7, %edx
	addq	$-2, %rdx
	cmpq	$3, %rdx
	ja	LBB247_7
## BB#6:                                ## %cbJG
                                        ##   in Loop: Header=BB247_5 Depth=1
	movslq	(%rsi,%rdx,4), %rdx
	addq	%rsi, %rdx
	jmpq	*%rdx               #  <<<< sometimes a bad value

What is odd here is that we have an indirect jump that was not marked as a TAILCALL by LLVM... and the computation of the address is quite unusual. Currently, I'm trying to figure out how we even get this code in the _rDQ_info$def function.

kavon commented

Here's the output of my debug session:

(lldb) run 600
There is a running process, kill it and restart?: [Y/n] Y
Process 803 exited with status = 9 (0x00000009) 
Process 818 launched: '/Users/kavon/msr/ghc/nofib/spectral/hartel/typecheck/Main' (x86_64)
Process 818 stopped
* thread #1: tid = 0x5088de, 0x000000010000a1ca Main`rDQ_info$def + 186, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
    frame #0: 0x000000010000a1ca Main`rDQ_info$def + 186
Main`rDQ_info$def:
->  0x10000a1ca <+186>: jmpq   *%rdx
    0x10000a1cc <+188>: addq   $0x88, %r12
    0x10000a1d3 <+195>: cmpq   0x358(%r13), %r12
    0x10000a1da <+202>: jbe    0x10000a34d               ; <+573>
(lldb) register read rsi rdx
     rsi = 0x000000010000b184  Main`rDQ_info$def + 4212
     rdx = 0x000000010000a290  Main`rDQ_info$def + 384
(lldb) c
Process 818 resuming
Process 818 stopped
* thread #1: tid = 0x5088de, 0x000000010000a1ca Main`rDQ_info$def + 186, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
    frame #0: 0x000000010000a1ca Main`rDQ_info$def + 186
Main`rDQ_info$def:
->  0x10000a1ca <+186>: jmpq   *%rdx
    0x10000a1cc <+188>: addq   $0x88, %r12
    0x10000a1d3 <+195>: cmpq   0x358(%r13), %r12
    0x10000a1da <+202>: jbe    0x10000a34d               ; <+573>
(lldb) register read rsi rdx
     rsi = 0x000000010000b184  Main`rDQ_info$def + 4212
     rdx = 0x000000010000a230  Main`rDQ_info$def + 288
(lldb) c
Process 818 resuming
Process 818 stopped
* thread #1: tid = 0x5088de, 0x000000010000a1ca Main`rDQ_info$def + 186, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
    frame #0: 0x000000010000a1ca Main`rDQ_info$def + 186
Main`rDQ_info$def:
->  0x10000a1ca <+186>: jmpq   *%rdx
    0x10000a1cc <+188>: addq   $0x88, %r12
    0x10000a1d3 <+195>: cmpq   0x358(%r13), %r12
    0x10000a1da <+202>: jbe    0x10000a34d               ; <+573>
(lldb) register read rsi rdx
     rsi = 0x0000004200005fff
     rdx = 0x0000004200005fff
(lldb) stepi
Process 818 stopped
* thread #1: tid = 0x5088de, 0x0000004200005fff, queue = 'com.apple.main-thread', stop reason = instruction step into
    frame #0: 0x0000004200005fff
->  0x4200005fff: addb   %al, (%rax)
    0x4200006001: addb   %al, (%rax)
    0x4200006003: addb   %al, (%rax)
    0x4200006005: addb   %al, (%rax)
(lldb) 
Process 818 stopped
* thread #1: tid = 0x5088de, 0x0000004200005fff, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=2, address=0x4200005fff)
    frame #0: 0x0000004200005fff
->  0x4200005fff: addb   %al, (%rax)
    0x4200006001: addb   %al, (%rax)
    0x4200006003: addb   %al, (%rax)
    0x4200006005: addb   %al, (%rax)
(lldb) 
kavon commented

It seems the junk value comes in as rsi to the return point (more context below):

	.p2align	4, 0x90
	.quad	_SbVJ_srt$def-L7133701809754910780_1
	.quad	2
	.quad	140733193388062
L7133701809754910780_1:	
LBB247_10:                              ##   in Loop: Header=BB247_5 Depth=1
	pushq	%rax
	movq	%rbx, %rax
LBB247_5:                               ## %cbJG
                                        ## =>This Inner Loop Header: Depth=1
	movq	8(%rbp), %r14
	movq	16(%rbp), %rcx
	movl	%eax, %edx
	andl	$7, %edx
	addq	$-2, %rdx
	cmpq	$3, %rdx
	ja	LBB247_7
## BB#6:                                ## %cbJG
                                        ##   in Loop: Header=BB247_5 Depth=1
	movslq	(%rsi,%rdx,4), %rdx
	addq	%rsi, %rdx
	jmpq	*%rdx
(lldb) run 600
There is a running process, kill it and restart?: [Y/n] y
Process 836 exited with status = 9 (0x00000009) 
Process 840 launched: '/Users/kavon/msr/ghc/nofib/spectral/hartel/typecheck/Main' (x86_64)
Process 840 stopped
* thread #1: tid = 0x50a4af, 0x000000010000a1ac Main`rDQ_info$def + 156, queue = 'com.apple.main-thread', stop reason = breakpoint 4.1
    frame #0: 0x000000010000a1ac Main`rDQ_info$def + 156
Main`rDQ_info$def:
->  0x10000a1ac <+156>: movq   0x8(%rbp), %r14
    0x10000a1b0 <+160>: movq   0x10(%rbp), %rcx
    0x10000a1b4 <+164>: movl   %eax, %edx
    0x10000a1b6 <+166>: andl   $0x7, %edx
(lldb) register read rsi rdx
     rsi = 0x000000010000b184  Main`rDQ_info$def + 4212
     rdx = 0x0000004200000140
(lldb) c
Process 840 resuming
Process 840 stopped
* thread #1: tid = 0x50a4af, 0x000000010000a1ca Main`rDQ_info$def + 186, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
    frame #0: 0x000000010000a1ca Main`rDQ_info$def + 186
Main`rDQ_info$def:
->  0x10000a1ca <+186>: jmpq   *%rdx
    0x10000a1cc <+188>: addq   $0x88, %r12
    0x10000a1d3 <+195>: cmpq   0x358(%r13), %r12
    0x10000a1da <+202>: jbe    0x10000a34d               ; <+573>
(lldb) register read rsi rdx
     rsi = 0x000000010000b184  Main`rDQ_info$def + 4212
     rdx = 0x000000010000a290  Main`rDQ_info$def + 384
(lldb) c
Process 840 resuming
Process 840 stopped
* thread #1: tid = 0x50a4af, 0x000000010000a1ac Main`rDQ_info$def + 156, queue = 'com.apple.main-thread', stop reason = breakpoint 4.1
    frame #0: 0x000000010000a1ac Main`rDQ_info$def + 156
Main`rDQ_info$def:
->  0x10000a1ac <+156>: movq   0x8(%rbp), %r14
    0x10000a1b0 <+160>: movq   0x10(%rbp), %rcx
    0x10000a1b4 <+164>: movl   %eax, %edx
    0x10000a1b6 <+166>: andl   $0x7, %edx
(lldb) register read rsi rdx
     rsi = 0x0000004200005fff
     rdx = 0x0000004200005000
(lldb) c
Process 840 resuming
Process 840 stopped
* thread #1: tid = 0x50a4af, 0x000000010000a1ca Main`rDQ_info$def + 186, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
    frame #0: 0x000000010000a1ca Main`rDQ_info$def + 186
Main`rDQ_info$def:
->  0x10000a1ca <+186>: jmpq   *%rdx
    0x10000a1cc <+188>: addq   $0x88, %r12
    0x10000a1d3 <+195>: cmpq   0x358(%r13), %r12
    0x10000a1da <+202>: jbe    0x10000a34d               ; <+573>
(lldb) register read rsi rdx
     rsi = 0x0000004200005fff
     rdx = 0x0000004200005fff
(lldb) 
kavon commented

The problem is the introduction of a jump table for large switches involved in a loop that contains a cpscall. We see this pattern happen in the following common scenario:

  1. a large switch
  2. the GC strategy says to perform a heap test in each arm of the case. Upon failure, we loop back to the switch.

For sufficiently large switches (5 or 6 cases seem to be enough), LLC introduces a jump table, and its index (the JTI; the value passed in as rsi in the previous discussion) becomes a new value that is live across the GC call and is not saved to the GHC stack. In fact, it's left in a register that gets clobbered by the GC.

Here is a snippet of what the jump table dispatch looks like. You'll notice that in the above discussion, the JTI is live in the loop.

	leaq	LJTI0_0(%rip), %rbx
	cmpq	$5, %rdi
	jbe	LBB0_2
	jmp	LBB0_17
LBB0_2:                                 ## %bigSwitch
	movslq	(%rbx,%rdi,4), %rax
	addq	%rbx, %rax
	jmpq	*%rax

	.p2align	2, 0x90
	.data_region jt32
L0_0_set_3 = LBB0_3-LJTI0_0
L0_0_set_7 = LBB0_7-LJTI0_0
L0_0_set_9 = LBB0_9-LJTI0_0
L0_0_set_11 = LBB0_11-LJTI0_0
L0_0_set_13 = LBB0_13-LJTI0_0
L0_0_set_15 = LBB0_15-LJTI0_0
LJTI0_0:
	.long	L0_0_set_3
	.long	L0_0_set_7
	.long	L0_0_set_9
	.long	L0_0_set_11
	.long	L0_0_set_13
	.long	L0_0_set_15
	.end_data_region

Here's the current example I have going. Unfortunately I couldn't force the JTI to be live in the loop, but it's enough to start working on a fix. Note also that the jump table is introduced during initial isel (IR -> SelectionDAG) and not in a separate optimization pass.

The current workaround to turn off this optimization is to pass -min-jump-table-entries=1000, which raises the threshold so that the jump table is not introduced.

; External function taking one i64 and returning an i64; its body is not part
; of this module. @foo calls it from every gcN block and feeds the result back
; into the switch.
declare i64 @doGC(i64)
; External i64 global; every caseN block of @foo loads it to decide whether to
; take the doGC path or to return.
@gc_flag = external global i64

; Reduced reproducer: a switch with six explicit cases (0-5) plus a default
; arm, placed inside a loop whose back edges each pass through a call to @doGC.
;
; Parameter: %tgt - the initial switch selector.
; Returns:   the number (0-6) of the arm that finally takes its retN exit.
;
; Every arm has the same shape:
;   caseN: load @gc_flag; if it equals N go to gcN, otherwise go to retN
;   gcN:   call @doGC(%phi_tgt) and loop back to %bigSwitch with the result
;   retN:  return N
; NOTE(review): per the accompanying report, the @doGC call stands in for the
; per-arm heap check / GC call, and llc lowers this switch to a jump table -
; confirm against the llc output before changing the number of cases.
define i64 @foo (i64 %tgt) {
entry:
  br label %bigSwitch


; Loop header. The selector is %tgt on first entry, and on every later
; iteration it is whatever @doGC returned in the gcN block that branched here.
bigSwitch:
  %phi_tgt = phi i64 [%tgt, %entry], [%tgt0, %gc0] , [%tgt1, %gc1] , [%tgt2, %gc2] , [%tgt3, %gc3] , [%tgt4, %gc4] , [%tgt5, %gc5] , [%tgt6, %gc6] 
  ; Values 0-5 dispatch to case0-case5; any other value takes the default, case6.
  switch i64 %phi_tgt, label %case6 [i64 0, label %case0
                                  i64 1, label %case1
                                  i64 2, label %case2
                                  i64 3, label %case3
                                  i64 4, label %case4
                                  i64 5, label %case5]


; Arm 0: take the doGC path when @gc_flag == 0, otherwise return 0.
case0:
  %v0 = load i64, i64* @gc_flag
  %c0 = icmp eq i64 0, %v0
  br i1 %c0, label %gc0, label %ret0

; The current selector is passed to @doGC; its result is the next selector.
gc0:
  %tgt0 = call i64 @doGC(i64 %phi_tgt)
  br label %bigSwitch

ret0:
  ret i64 0

; Arm 1: take the doGC path when @gc_flag == 1, otherwise return 1.
case1:
  %v1 = load i64, i64* @gc_flag
  %c1 = icmp eq i64 1, %v1
  br i1 %c1, label %gc1, label %ret1

gc1:
  %tgt1 = call i64 @doGC(i64 %phi_tgt)
  br label %bigSwitch

ret1:
  ret i64 1

; Arm 2: take the doGC path when @gc_flag == 2, otherwise return 2.
case2:
  %v2 = load i64, i64* @gc_flag
  %c2 = icmp eq i64 2, %v2
  br i1 %c2, label %gc2, label %ret2

gc2:
  %tgt2 = call i64 @doGC(i64 %phi_tgt)
  br label %bigSwitch

ret2:
  ret i64 2

; Arm 3: take the doGC path when @gc_flag == 3, otherwise return 3.
case3:
  %v3 = load i64, i64* @gc_flag
  %c3 = icmp eq i64 3, %v3
  br i1 %c3, label %gc3, label %ret3

gc3:
  %tgt3 = call i64 @doGC(i64 %phi_tgt)
  br label %bigSwitch

ret3:
  ret i64 3

; Arm 4: take the doGC path when @gc_flag == 4, otherwise return 4.
case4:
  %v4 = load i64, i64* @gc_flag
  %c4 = icmp eq i64 4, %v4
  br i1 %c4, label %gc4, label %ret4

gc4:
  %tgt4 = call i64 @doGC(i64 %phi_tgt)
  br label %bigSwitch

ret4:
  ret i64 4

; Arm 5: take the doGC path when @gc_flag == 5, otherwise return 5.
case5:
  %v5 = load i64, i64* @gc_flag
  %c5 = icmp eq i64 5, %v5
  br i1 %c5, label %gc5, label %ret5

gc5:
  %tgt5 = call i64 @doGC(i64 %phi_tgt)
  br label %bigSwitch

ret5:
  ret i64 5

; Default arm (selector outside 0-5): take the doGC path when @gc_flag == 6,
; otherwise return 6.
case6:
  %v6 = load i64, i64* @gc_flag
  %c6 = icmp eq i64 6, %v6
  br i1 %c6, label %gc6, label %ret6

gc6:
  %tgt6 = call i64 @doGC(i64 %phi_tgt)
  br label %bigSwitch

ret6:
  ret i64 6

}