`f128` symbols on powerpc64 give inaccurate results

Question

`f128` symbols on powerpc64 give inaccurate results

tgross35 opened this issue a year ago · 4 comments

For some reason, the __addkf3 symbol linked by rustc says that 0x00000000000000000000000000000000 + 0x00000000000000000000000000000001 = 0x00000000000000000000000000000000, when the answer should be 0x00000000000000000000000000000001. Other symbols likely do the wrong thing here too.

Example:

$ powerpc64-linux-gnu-gcc add_test.c -o add_test.c.ppc64
$ rustc add_test.rs -o add_test.rs.ppc64 --target powerpc64-unknown-linux-gnu -Clinker=powerpc64-linux-gnu-gcc

$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.c.ppc64
$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.rs.ppc64

$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.c.ppc64
0000000000000000000000000000000000 0.000000
0000000000000000000000000000000001 0.000000
0000000000000000000000000000000001 0.000000

$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.rs.ppc64
[add_test.rs:12:5] a = 0x00000000000000000000000000000000
[add_test.rs:12:5] b = 0x00000000000000000000000000000001
[add_test.rs:14:5] c = 0x00000000000000000000000000000000

Our Rust code emits __addkf3 which calls the hardware f128 addition routine xsaddup. The part I cannot explain is that the C version also emits __addkf3 which also calls xsaddup, but somehow produces a correct result. I checked with GDB to make sure they are both hitting xsaddup, so can't really explain the discrepancy.

Original notes at rust-lang/compiler-builtins#606 (comment), discussion on Zulip at https://rust-lang.zulipchat.com/#narrow/stream/122651-general/topic/f128.20system.20libraries.20noncompliant.20platforms/near/438486364.

Full code copied from rust-lang/compiler-builtins#606 (comment):

Rust code

#![feature(f128)]

#[no_mangle]
#[inline(never)]
fn add_entry(a: f128, b: f128) -> f128 {
    a + b
}

fn main() {
    let a = f128::from_bits(0x0);
    let b = f128::from_bits(0x1);
    dbg!(a, b);
    let c = add_entry(a, b);
    dbg!(c);
}

C code

#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

#define _Float128 __float128

typedef struct {
#if __BYTE_ORDER == __LITTLE_ENDIAN
    uint64_t lower, upper;
#elif __BYTE_ORDER == __BIG_ENDIAN
    uint64_t upper, lower;
#else
#error missing endian check
#endif
} __attribute__((aligned(_Alignof(_Float128)))) u128;

_Float128 __addkf3(_Float128, _Float128);

void f128_print(_Float128 val) {
    u128 ival = *((u128 *)(&val));
    printf("%#018" PRIx64 "%016" PRIx64 " %lf\n", ival.upper, ival.lower, (double)val);
}

_Float128 new_f128(uint64_t upper, uint64_t lower) {
    u128 val = { .lower = lower, .upper = upper };
    return *((_Float128 *)(&val));
}

_Float128 add_entry(_Float128 a, _Float128 b) {
#ifdef USE_ADDKF3
  return __addkf3(a, b);
#else
  return a + b;
#endif
}

int main() {
    _Float128 a = new_f128(0x0000000000000000, 0x0000000000000000);
    _Float128 b = new_f128(0x0000000000000000, 0x0000000000000001);
    f128_print(a);
    f128_print(b);
    _Float128 c = add_entry(a, b);
    f128_print(c);

    return 0;
}

Assembly generated from Rust (incorrect result)

0000000000010d00 <.add_entry>:
   10d00:   7c 08 02 a6     mflr    r0
   10d04:   f8 21 ff 91     stdu    r1,-112(r1)
   10d08:   f8 01 00 80     std     r0,128(r1)
   10d0c:   4b ff c6 95     bl      d3a0 <00000143.plt_call.__addkf3>
   10d10:   e8 41 00 28     ld      r2,40(r1)
   10d14:   38 21 00 70     addi    r1,r1,112
   10d18:   e8 01 00 10     ld      r0,16(r1)
   10d1c:   7c 08 03 a6     mtlr    r0
   10d20:   4e 80 00 20     blr

000000000000d3a0 <00000143.plt_call.__addkf3>:
    d3a0:       f8 41 00 28     std     r2,40(r1)
    d3a4:       3d 62 ff ff     addis   r11,r2,-1
    d3a8:       e9 8b 7f 58     ld      r12,32600(r11)
    d3ac:       7d 89 03 a6     mtctr   r12
    d3b0:       e8 4b 7f 60     ld      r2,32608(r11)
    d3b4:       4e 80 04 20     bctr

0000000000056790 <.__addkf3_resolve>:
   56790:       81 2d 8f 9c     lwz     r9,-28772(r13)
   56794:       75 29 00 40     andis.  r9,r9,64
   56798:       41 82 00 18     beq     567b0 <.__addkf3_resolve+0x20>
   5679c:       e8 62 80 10     ld      r3,-32752(r2)
   567a0:       4e 80 00 20     blr
   567a4:       60 00 00 00     nop
   567a8:       60 00 00 00     nop
   567ac:       60 42 00 00     ori     r2,r2,0
   567b0:       e8 62 80 18     ld      r3,-32744(r2)
   567b4:       4e 80 00 20     blr
        ...
   567c4:       60 00 00 00     nop
   567c8:       60 00 00 00     nop
   567cc:       60 42 00 00     ori     r2,r2,0

000000000005e6e0 <.__addkf3_hw>:
   5e6e0:       fc 42 18 08     xsaddqp v2,v2,v3
   5e6e4:       4e 80 00 20     blr
        ...
   5e6f4:       60 00 00 00     nop
   5e6f8:       60 00 00 00     nop
   5e6fc:       60 00 00 00     nop

0000000000057050 <.__addkf3_sw>:
   57050:       fb 41 ff d0     std     r26,-48(r1)
   57054:       fb 61 ff d8     std     r27,-40(r1)
   57058:       fb 81 ff e0     std     r28,-32(r1)
   5705c:       fb a1 ff e8     std     r29,-24(r1)
   57060:       fb c1 ff f0     std     r30,-16(r1)
   57064:       fb e1 ff f8     std     r31,-8(r1)
   57068:       f8 21 ff 41     stdu    r1,-192(r1)
   5706c:       39 21 00 70     addi    r9,r1,112
   57070:       39 41 00 70     addi    r10,r1,112
   // ... full sw implementation

Assembly generated from C (correct result)

0000000010000af8 <.add_entry>:
    10000af8:   7c 08 02 a6     mflr    r0
    10000afc:   f8 01 00 10     std     r0,16(r1)
    10000b00:   fb e1 ff f8     std     r31,-8(r1)
    10000b04:   f8 21 ff 81     stdu    r1,-128(r1)
    10000b08:   7c 3f 0b 78     mr      r31,r1
    10000b0c:   39 20 00 30     li      r9,48
    10000b10:   39 5f 00 80     addi    r10,r31,128
    10000b14:   7c 4a 4f 99     stxvd2x vs34,r10,r9
    10000b18:   39 20 00 40     li      r9,64
    10000b1c:   39 5f 00 80     addi    r10,r31,128
    10000b20:   7c 6a 4f 99     stxvd2x vs35,r10,r9
    10000b24:   39 20 00 40     li      r9,64
    10000b28:   39 5f 00 80     addi    r10,r31,128
    10000b2c:   7c 6a 4e 99     lxvd2x  vs35,r10,r9
    10000b30:   39 20 00 30     li      r9,48
    10000b34:   39 5f 00 80     addi    r10,r31,128
    10000b38:   7c 4a 4e 99     lxvd2x  vs34,r10,r9
    10000b3c:   4b ff fb a5     bl      100006e0 <00000019.plt_call.__addkf3>
    10000b40:   e8 41 00 28     ld      r2,40(r1)
    10000b44:   f0 02 14 96     xxmr    vs0,vs34
    10000b48:   f0 40 04 91     xxmr    vs34,vs0
    10000b4c:   38 3f 00 80     addi    r1,r31,128
    10000b50:   e8 01 00 10     ld      r0,16(r1)
    10000b54:   7c 08 03 a6     mtlr    r0
    10000b58:   eb e1 ff f8     ld      r31,-8(r1)
    10000b5c:   4e 80 00 20     blr
    10000b60:   00 00 00 00     .long 0x0
    10000b64:   00 00 00 01     .long 0x1
    10000b68:   80 01 00 01     lwz     r0,1(r1)

0000000010000c40 <.__addkf3_resolve>:
    10000c40:   81 2d 8f 9c     lwz     r9,-28772(r13)
    10000c44:   75 29 00 40     andis.  r9,r9,64
    10000c48:   41 82 00 18     beq     10000c60 <.__addkf3_resolve+0x20>
    10000c4c:   e8 62 80 10     ld      r3,-32752(r2)
    10000c50:   4e 80 00 20     blr
    10000c54:   60 00 00 00     nop
    10000c58:   60 00 00 00     nop
    10000c5c:   60 42 00 00     ori     r2,r2,0
    10000c60:   e8 62 80 18     ld      r3,-32744(r2)
    10000c64:   4e 80 00 20     blr
        ...
    10000c74:   60 00 00 00     nop
    10000c78:   60 00 00 00     nop
    10000c7c:   60 42 00 00     ori     r2,r2,0

0000000010008b90 <.__addkf3_hw>:
    10008b90:   fc 42 18 08     xsaddqp v2,v2,v3
    10008b94:   4e 80 00 20     blr
        ...
    10008ba4:   60 00 00 00     nop
    10008ba8:   60 00 00 00     nop
    10008bac:   60 00 00 00     nop

0000000010001500 <.__addkf3_sw>:
    10001500:   fb 41 ff d0     std     r26,-48(r1)
    10001504:   fb 61 ff d8     std     r27,-40(r1)
    10001508:   fb 81 ff e0     std     r28,-32(r1)
    1000150c:   fb a1 ff e8     std     r29,-24(r1)
    10001510:   fb c1 ff f0     std     r30,-16(r1)
    10001514:   fb e1 ff f8     std     r31,-8(r1)
    10001518:   f8 21 ff 41     stdu    r1,-192(r1)
    1000151c:   39 21 00 70     addi    r9,r1,112
    10001520:   39 41 00 70     addi    r10,r1,112
    // ...

Answer 1 · 2024-05-14T07:22:59.000Z

cc @ecnelises

@rustbot label +F-f16_and_f128 +A-ABI +T-libs +C-bug -needs-triage

Answer 2 · 2024-05-16T18:53:07.000Z

This is being caused by a calling convention mismatch; the Rust version passes/returns the f128s in GPRs, but the builtin expects them to be in vector registers. This is because the ABI of f128 on PowerPC depends on whether the vsx target feature has been enabled (the Rust target has it disabled by default): GCC gets around this by not supporting _Float128 when the vsx target feature is disabled.

Answer 3 · 2024-05-16T19:28:53.000Z

That is interesting. Any idea what is best to do here?

Answer 4 · 2024-05-16T20:51:33.000Z

This is the equivalent of #116344 but for PowerPC; LLVM will generate builtins calls using whatever target features are enabled for the current function. For another example (compiler explorer)

#![feature(f16, f128)]

#[inline(never)]
pub fn soft_float(x: f128, dest: &mut f16) {
    *dest = x as f16;
}

#[inline(never)]
#[target_feature(enable = "sse2")]
pub unsafe fn hard_float(x: f128, dest: &mut f16) {
    *dest = x as f16;
}

When the above code is compiled for the i586-unknown-linux-gnu target, both functions will call the __trunctfhf2 builtin but the soft_float function will expect the result to be returned in the ax register and the hard_float function will expect the result to be returned in the xmm0 register.

The possible solutions I can think of are:

Disallow enabling/disabling float-ABI-affecting target features on relevant targets. If the float ABI in use differed from the system builtins ABI, we would need to ensure that the Rust-compiled version from compiler-builtins was used instead. We'd also have to prohibit linking with other Rust or C code that was compiled with the target feature in a different state, as that code would want to use the builtins with a different ABI. This prohibition would be extremely difficult to achieve if the system libc uses the relevant builtins (and was built with different relevant target features), although this is unlikely to be an issue for f16 and f128.
Improve LLVM to allow specifying the target features to use for function calls separately from the target features to use for codegen. This would also help fix #116558. However, this would still leave the question of what to do when the relevant target feature is disabled but the system builtins have it enabled; enabling the target feature for the builtins calls would prevent the code running on a processor without that target feature, so in that case we'd still have to ship our own builtin from compiler-builtins and prohibit linking like in option 1.
Workaround the lack of option 2 by codegen-ing a shim function whenever the target features required for a call differ from the target features of the calling function. This would probably be not great for the optimiser performance, and would have the same issues with builtins requiring more target features as option 2.
Improve LLVM vary the names of builtins based on which relevant target features are in use (or to allow the frontend to do so). This would allow code compiled with different target features to coexist alongside each other.
Workaround the lack of option 4 by codegen-ing explicit calls to builtins with different names when using target features that aren't the same as in the system builtins. This would be the same as option 4 but have worse performance as the optimiser wouldn't understand what the function calls did. It could also be a bit brittle as the optimiser would be theoretically free to introduce calls to the regular builtins using the wrong ABI (although if all the builtins for a specific type were done using explicit function calls this would probably be unlikely to happen in practice).

Option 4 is probably the best solution here. Option 2/3 will probably be required for things like #116558, but that's a separate (but closely related) issue.