CTSRD-CHERI/llvm

unaligned load in struct copy sequence

brooksdavis opened this issue · 2 comments

This bit of assembly is produced:

        daddiu  $1, $16, %got_ofst(PS)
        cfromptr        $c1, $c0, $1
        csetbounds      $c1, $c1, 48            # instantiate a capability to PS
        clc     $c2, $zero, 0($c1)                 # load a capability (works)
        csc     $c2, $zero, 0($c21)
        cincoffset      $c2, $c1, 12               # increment by 12 (???)
        clc     $c2, $zero, 0($c2)                # attempt an unaligned load!
        csc     $c2, $zero, 0($c22)
        clw     $1, $zero, 28($c1)

A somewhat reduced case from usr.bin/sed/process.c (which generates many of these) is:

/* Local definition of NULL; this reduced test case includes no headers. */
#define NULL (void *)0

/*
 * Buffer descriptor used for the file-scope PS and HS objects and copied by
 * value in xprocess().
 *
 * In the purecap cheri128 build from this report the object is 48 bytes (the
 * csetbounds and the whole-struct memcpy both use 48), and append_newline
 * appears to sit at offset 28 (cf. the `clw $1, $zero, 28($c1)` in the
 * assembly listing), i.e. the fields before it occupy exactly 28 bytes.
 */
typedef struct {
        char *space;            /* Current space pointer. */
        unsigned long len;             /* Current length. */
        int deleted;            /* If deleted. */
        int append_newline;     /* If originally terminated by \n. */
        char *back;             /* Backing memory. */
} xSPACE;

/* Singly linked list node; only the link field is kept in this reduction. */
struct s_command {
        struct s_command *next;
};

/* Defined elsewhere; xprocess() resets it to 0 when its outer loop starts. */
extern unsigned long linenum;
/* File-scope buffers that xprocess() exchanges with each other. */
static xSPACE HS, PS;

/* Declared but never referenced in this reduced test case. */
static inline int        applies(struct s_command *);
/* Defined elsewhere; a nonzero return keeps the outer loop in xprocess() running. */
int      mf_fgets(xSPACE *);

/*
 * Reduced reproducer for the unaligned capability load.
 *
 * While mf_fgets(&PS) returns nonzero, walk cp forward to the last node of
 * the list (the one whose next is NULL) and then exchange PS and HS through
 * the local tspace, keeping PS.append_newline at its pre-swap value.
 * cp is not reset between outer iterations, and nothing happens in the inner
 * loop if cp is NULL.
 *
 * Per the IR quoted in this report, the three whole-struct assignments in the
 * swap become memcpy intrinsics of 28, 48 and 28 bytes; the bad load sequence
 * is generated from that basic block.
 */
void
xprocess(struct s_command *cp)
{ 
        xSPACE tspace;

        for (linenum = 0; mf_fgets(&PS);) {
                while (cp != NULL) {
                        /* Skip ahead until cp is the final node. */
                        if (cp->next != NULL) {
                                cp = cp->next;
                                continue;
                        }
                        /* Swap PS and HS via tspace ... */
                        tspace = PS;
                        PS = HS;
                        /* ... but PS keeps its original append_newline. */
                        PS.append_newline = tspace.append_newline;
                        HS = tspace;
                        break;
                } 

        } /* for all lines */
}

Further reducing the complexity of the loops still produces wrong code, but it is less obviously wrong, so I stopped here, where the error is unambiguous.
A working command line is:

clang -g -integrated-as --target=cheri-unknown-freebsd  -msoft-float -cheri=128 -G0 -EB -mabi=purecap -O -c testcase.c -o - -S

The issue persists at -O, -O1, and -O2.

This test case appears to have 3 memcpy intrinsics in the basic block that generates the bad instruction sequence:

  call void @llvm.memcpy.p200i8.p200i8.i64(i8 addrspace(200)* nonnull align 16 %tspace.sroa.0.0..sroa_idx10, i8 addrspace(200)* align 16 bitcast (%struct.xSPACE addrspace(200)* @PS to i8 addrspace(200)*), i64 28, i1 false)
  tail call void @llvm.memcpy.p200i8.p200i8.i64(i8 addrspace(200)* align 16 bitcast (%struct.xSPACE addrspace(200)* @PS to i8 addrspace(200)*), i8 addrspace(200)* align 16 bitcast (%struct.xSPACE addrspace(200)* @HS to i8 addrspace(200)*), i64 48, i1 false), !tbaa.struct !10
  call void @llvm.memcpy.p200i8.p200i8.i64(i8 addrspace(200)* align 16 bitcast (%struct.xSPACE addrspace(200)* @HS to i8 addrspace(200)*), i8 addrspace(200)* nonnull align 16 %tspace.sroa.0.0..sroa_idx10, i64 28, i1 false)

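This looks as if it comes from one of the 28-byte copies: 28 = 16 + 12, which matches a capability-sized load at offset 0 followed by a second, overlapping one at offset 12. SelectionDAG sometimes decides to use overlapping loads and stores for the tail of a memcpy, but I thought I'd added a special case for iFATPTR* types telling it that it wasn't allowed to.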

Simple test case:

target datalayout = "E-m:e-pf200:128:128-i8:8:32-i16:16:32-i64:64-n32:64-S128-A200"
target triple = "cheri-unknown-freebsd"

%struct.name_t = type { i8 addrspace(200)* }

@x = common local_unnamed_addr addrspace(200) global %struct.name_t zeroinitializer, align 16

; Function Attrs: nounwind
; Minimal reproducer: a single 28-byte memcpy from @x to %str, both 16-byte
; aligned address-space-200 pointers. The !tbaa.struct !3 tag describes bytes
; 0..16 of the copied region as an "any pointer" field.
; NOTE(review): %struct.name_t holds a single address-space-200 pointer
; (128 bits per the datalayout), so the 28-byte length appears to exceed the
; size of @x; it seems to be chosen only to trigger the lowering - confirm.
define void @test(%struct.name_t addrspace(200)* %str) local_unnamed_addr #0 {
entry:
  %0 = bitcast %struct.name_t addrspace(200)* %str to i8 addrspace(200)*
  call void @llvm.memcpy.p200i8.p200i8.i64(i8 addrspace(200)* align 16 %0, i8 addrspace(200)* align 16 bitcast (%struct.name_t addrspace(200)* @x to i8 addrspace(200)*), i64 28, i1 false), !tbaa.struct !3
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.memcpy.p200i8.p200i8.i64(i8 addrspace(200)* nocapture writeonly, i8 addrspace(200)* nocapture readonly, i64, i1) #1

attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cheri128" "target-features"="+cheri128,+chericap,-noabicalls" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }

!llvm.module.flags = !{!0, !1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!3 = !{i64 0, i64 16, !4}
!4 = !{!5, !5, i64 0}
!5 = !{!"any pointer", !6, i64 0}
!6 = !{!"omnipotent char", !7, i64 0}
!7 = !{!"Simple C/C++ TBAA"}