Large integer polyfills
Opened this issue · 11 comments
We should have polyfills to downlevel u64 and u128 to many u32 instructions, enabling the use of these types without concern for hardware capabilities on e.g. mobile ARM platforms. Also consider polyfills for u8 and u16, which could enable things like using enum Ordering from std, see #147.
libm only handles float math, no integer downlevel.
Discussed in #301
Originally posted by nazar-pc June 27, 2025
I faced a small challenge: my CPU code I'm trying to run on a GPU naturally maps onto u64 and u128, but even u64 apparently requires OpCapability Int64, let alone u128.
Now, writing polyfills to work around that feels like a job the compiler should do for me, though I appreciate that it at least tells me that there isn't a native way to run it and that I should think about what to do.
I'm wondering if it'd be possible to have a compilation flag that automatically inserts polyfills for such types. It would be an explicit opt-in feature, but would otherwise require the least amount of effort from those who do need it. I believe LLVM is capable of this.
I'm really not looking forward to writing polyfills for u128 using a bunch of u32s.
Implementation Notes
For anyone investigating this, I can at least paste these relevant code locations:
- where all basic math functions are implemented
rust-gpu/crates/rustc_codegen_spirv/src/builder/builder_methods.rs
Lines 1538 to 1588 in 2f915a0
- this proc macro just emits the correct instruction plus some const eval; there is nothing w.r.t. emulation to be found
rust-gpu/crates/rustc_codegen_spirv/src/builder/builder_methods.rs
Lines 79 to 82 in 2f915a0
- I already made some downlevels for count_ones, bit_reverse, count_trailing_zeros and count_leading_zeros here, since SPIR-V requires these instructions to be u32-exclusive.
rust-gpu/crates/rustc_codegen_spirv/src/builder/intrinsics.rs
Lines 361 to 592 in 2f915a0
I wonder if polyfills should be linker steps instead. Generate all code with all the u8/u16/u64 as is, then have the linker downlevel it to u32 for you. With multimodule, you could control polyfill for each entry point, allowing you to have a u64-supported and a u64-polyfill entry point right next to each other. And your runtime can decide which one to pick depending on device features.
Hm... that is an interesting proposal. I already have two builds: one for GPUs that support Int64 and one for those that do not. If I can compile once and have a post-processing step add polyfills, it might be cheaper overall. I wouldn't want multiple single-module files though.
Also, since I'm using u128s as bit containers, having polyfills on the Rust side might be beneficial in case I don't actually use all 128 bits, but something like 80, in which case the overall number of, let's say, bit shifts can potentially be reduced by the compiler eliminating dead code, which I imagine will not be possible at a later stage, at least not as easily.
Or sometimes a bit shift might be known at compile time to be something like 35, in which case some of the words can be ignored completely since they'll be shifted away entirely.
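For instance, a hypothetical sketch of that case, assuming the [u32; 2] little-endian representation discussed later in this thread:

```rust
// Hypothetical sketch: u64 >> 35 over [u32; 2] halves (little-endian).
// 35 = 32 + 3: the low word is shifted away completely, so only a single
// 32-bit shift of the high word remains.
fn shr_by_35(val: [u32; 2]) -> [u32; 2] {
    [val[1] >> 3, 0]
}
```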
So overall, my preference would be to implement it at an earlier stage for better performance.
So, there are some interesting things to consider here.
The main problem we'll face is that there is no overflow-checking intrinsic in SPIR-V.
Emulating overflow check
This will force us to emulate overflow checks. This is not too difficult, but it has a cost.
For unsigned numbers, this would look something like this:

```rust
// Unsigned addition overflow check
let res = a + b;
let ovf = res < (a | b);
```

So, an added compare and or to detect overflows. Together, it would work something like this:
```rust
fn emulated_u64_add(lhs: [u32; 2], rhs: [u32; 2]) -> [u32; 2] {
    let low_res = lhs[0].wrapping_add(rhs[0]);
    // The overflow check from above: one compare, one or.
    let ovf = low_res < (lhs[0] | rhs[0]);
    // Add the high halves, plus the carry from the low half.
    let high_res = lhs[1].wrapping_add(rhs[1]).wrapping_add(ovf as u32);
    [low_res, high_res]
}
```

So, we go from one 64 bit op to 1 + 2 + 2 = 5 32 bit ops. Not too bad. With overflow-checking intrinsics (like LLVM has), we'd only need 3 ops, but this is still pretty darn good.
128 bit ints.
Now, I think it is important to mention that this is more or less how 128 bit ints work in Rust ATM. We emulate them on all major platforms.
On x86_64:
```rust
#[unsafe(no_mangle)]
pub fn sum(a: i128, b: i128) -> i128 {
    a + b
}
```

```asm
sum:
    mov rax, rdi
    add rax, rdx ; Add low half, set overflow flag
    adc rsi, rcx ; Add high half, and also the overflow flag.
    mov rdx, rsi
    ret
```
On some platforms (e.g. 32 bit RISC-V), we also emulate 64 bit ints. So, this is nothing new.
Emulating specific ops
Now, I will go over each kind of op, and what we can do to emulate them. For now, I will only focus on ints larger than supported: smaller ints require some qptr-adjacent consideration.
Addition: by halves, with overflow checks.
For types larger than our max supported int width, we will implement the algorithm demonstrated in emulated_u64_add.
Due to how Rust is designed, this op is bitwise-identical for the signed version: emulated_i64_add can be implemented in the same way emulated_u64_add is.
Really, we only need to worry about the sign for div/mod operations, some bitshifts, and for comparisons. All else can be the same for signed and unsigned ints.
Subtraction: by halves, with overflow checks
Subtraction is usually implemented in a way that is similar to addition (subtract halves, check for overflow, move the "borrow" flag). I'll see if there is a more efficient way to do this.
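A minimal sketch mirroring emulated_u64_add, assuming the same [u32; 2] little-endian representation (the shape of the algorithm, not the final codegen):

```rust
fn emulated_u64_sub(lhs: [u32; 2], rhs: [u32; 2]) -> [u32; 2] {
    let low_res = lhs[0].wrapping_sub(rhs[0]);
    // A borrow occurred iff the low subtrahend was larger.
    let borrow = (lhs[0] < rhs[0]) as u32;
    let high_res = lhs[1].wrapping_sub(rhs[1]).wrapping_sub(borrow);
    [low_res, high_res]
}
```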
Bitwise-ops(excluding shifts)
Bitwise and, or, xor, not operate on individual bytes, and can be implemented by operating on each half separetely. So, one 64 bit op can be lowered to 2 32 bit ones.
fn or_u64(lhs:[u32;2], rhs:[u32;2])->[u32;2]{
[lhs[0] | rhs[0], lhs[1] | rhs[1]]
}Bitshifts
Bitshifts are another can of worms entirely. The individual "halves" interact in ways that are a bit hard to reason about.
I have a rough idea for an algorithm that does this using a couple of 32 bit shifts, but I believe I can get more performance out of this with a bit of work.
Here, I think we would have to (in order to get decent performance) at least try some more clever tricks for constant shifts.
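To make the interaction between the halves concrete, here is a rough sketch of a left shift, again assuming the [u32; 2] representation and a shift amount in 0..64 (a hypothetical helper, not a tuned implementation):

```rust
fn emulated_u64_shl(val: [u32; 2], shift: u32) -> [u32; 2] {
    if shift == 0 {
        val
    } else if shift < 32 {
        // Bits that cross over from the low half into the high half.
        let carried = val[0] >> (32 - shift);
        [val[0] << shift, (val[1] << shift) | carried]
    } else {
        // The low half is shifted away completely.
        [0, val[0] << (shift - 32)]
    }
}
```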
Multiplication
Emulating multiplication is also going to be a tad bit tricky. Really, we have two options, neither of them particularly good.
Long multiplication
We don't have a good way to detect the "overflow" (higher half) of a multiplication operation.
2^32 * 2^32 is 2^64, so, to see the full result of a 32 bit multiplication, we need 64 bit multiplication. So, we can't use tricks exactly like the ones we used for addition here.
Instead, we will have to treat the 64 bit number as a 4-digit, base-2^16 number. Then, we can use long multiplication.
Feels a bit odd to use primary-school math at this point of my life, but it will work.
Maybe there is a better way to do this: the algorithm will be quite complex, needing 16 32 bit multiplies to emulate one 64 bit one.
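A minimal sketch of the idea, assuming a wrapping 64-bit multiply over the [u32; 2] representation. Since only the low four digits of the product are kept, the triangular loop below gets away with 10 digit multiplies instead of the full 16:

```rust
fn emulated_u64_mul(lhs: [u32; 2], rhs: [u32; 2]) -> [u32; 2] {
    // Split each operand into four base-2^16 digits (little-endian).
    let digits = |v: [u32; 2]| [v[0] & 0xFFFF, v[0] >> 16, v[1] & 0xFFFF, v[1] >> 16];
    let (a, b) = (digits(lhs), digits(rhs));
    // Long multiplication; digits above the fourth wrap away.
    let mut res = [0u32; 4];
    for i in 0..4 {
        let mut carry = 0;
        for j in 0..4 - i {
            // Each step fits in a u32: (2^16 - 1)^2 + 2 * (2^16 - 1) < 2^32.
            let t = a[i] * b[j] + res[i + j] + carry;
            res[i + j] = t & 0xFFFF;
            carry = t >> 16;
        }
    }
    [res[0] | (res[1] << 16), res[2] | (res[3] << 16)]
}
```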
"Multiplying" one bit at a time.
We can lower binary multiplication to a series of bitshifts, ands, and additions.
```rust
let mut result = 0;
while rhs != 0 && lhs != 0 {
    // Is this bit set?
    if rhs & 1 == 1 {
        result += lhs;
    }
    // Use a shift to multiply lhs by 2
    lhs <<= 1;
    // Use a shift to extract the next digit of rhs
    rhs >>= 1;
}
```

This has some big disadvantages - it is significantly more expensive in the worst-case scenario. However, it can be faster for smaller multiplications.
Division + Modulo
For division, implementing it will be a headache. In LLVM Rust, i128 division is emulated by just calling __divti3 and other such intrinsics.
I think the first implementation should avoid it altogether, and it should be handled in a separate PR.
Implementation
Some people mentioned concerns about implementing u128 ops using u32 ops. I think we can avoid doing so directly.
I see the easiest possible implementation as this.
We have functions to emulate an integer op, like this:

```rust
// On the builder
fn emulate_add(&self, lhs: Self::Value, rhs: Self::Value, type_: Self::Type) -> Self::Value {
    // Implement the emulated addition using smaller types
    todo!()
}
```

This function can implement a u128 op using u64 ops, and u64 ops using u32 ops.
On platforms where u128 and u64 are not supported, we first lower u128 ops to u64 ops, and then lower those to u32 ops.
This should be easy to implement and reason about.
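As a rough illustration of the cascade (hypothetical free functions rather than builder methods): a u128 add over [u64; 2] halves uses only u64 ops, and each of those u64 ops can in turn be lowered by the same rule when Int64 is missing:

```rust
// u128 add expressed purely in u64 ops ...
fn emulated_u128_add(lhs: [u64; 2], rhs: [u64; 2]) -> [u64; 2] {
    let low = lhs[0].wrapping_add(rhs[0]);
    let carry = (low < (lhs[0] | rhs[0])) as u64;
    [low, lhs[1].wrapping_add(rhs[1]).wrapping_add(carry)]
}
// ... and each u64 op here lowers to u32 ops via emulated_u64_add and friends.
```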
Now, we move on to the more difficult topic: smaller types.
Layout issues
Surprisingly, the problem with u8 / u16 is not with implementing the ops themselves: that is laughably easy. We just need a bitmask:

```rust
// Passed as larger integers
fn emu_u8_add(lhs: u32, rhs: u32) -> u32 {
    (lhs + rhs) & 0xFF
}
```

The same thing can be done for more or less all ops.
We can later optimize this a bit more (by removing some bitmasks for op sequences), but this is more or less a solved problem.
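For example, a sketch under the assumption that the inputs are already masked to 8 bits: in a chain of adds, the intermediate masks can be dropped, since the low 8 bits of the final sum don't depend on them:

```rust
// Three u8 adds, one mask: 255 + 255 + 255 fits comfortably in a u32,
// so only the final result needs to be truncated.
fn emu_u8_add3(a: u32, b: u32, c: u32) -> u32 {
    (a + b + c) & 0xFF
}
```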
The real issue here is layouts.
Consider this type:

```rust
#[repr(C)]
struct MyType {
    a: u8,
    b: u8,
}
```

Now, per the Rust book, this type should have a size of 2. However, if we replace the u8s with u16s, this will no longer be the case.
```rust
// This has a different layout!
#[repr(C)]
struct MyType {
    a: u16,
    b: u16,
}
```

This is a big problem for exchanging data with the GPU (wrong type layout), and for ordinary Rust code.
This operation is safe:

```rust
2_u16.to_le_bytes() // Transmutes one u16 to 2 u8s.
```

A lot of Rust code assumes size_of::<u8>() is 1. From talking to other Rust folk (this came up with my GCC work), this is pretty much set in stone.
Rust also promises that a byte has 8 bits, so we can't really try any clever tricks here. Without support for u8 on the GPU side, our u8 would be at least 2 bytes (maybe even 4) big.
That will break a lot of code.
The big question here is: is this something that qptr can handle and allow us to recover from? Can it "pretend" that this type has the layout we want?
Emulating smaller types through larger types is more esoteric and I'd separate these concerns.
For enums like Ordering, I'm not actually sure it is a public API/documented guarantee that it is #[repr(i8)]. I think it might be okay that, when compiled without the Int8 capability, it is widened to the next smallest applicable integer (like i32). It is a really annoying situation to be in, though, as there are probably many such cases in the standard library.
It is repr(i8), which means you can do this:

```rust
fn ord_to_i8(ord: std::cmp::Ordering) -> i8 {
    ord as i8
}
```

It is also documented as having specific values:
```rust
#[repr(i8)]
pub enum Ordering {
    Less = -1,
    Equal = 0,
    Greater = 1,
}
```

The Rust compiler itself relies on this: BinOp::Cmp is defined to return Ordering, with the scalar values -1_i8, 0_i8, 1_i8.
https://doc.rust-lang.org/beta/nightly-rustc/rustc_middle/mir/enum.BinOp.html#variant.Cmp
So, we can't easily change that.
Still, I agree that larger integers are easier to do than smaller ones. We ought to start with that.
This also could be useful to support physical pointers, as demonstrated at this Vulkan documentation page
About handling u8 and u16 integers within structs: if they're only a local variable, we can just widen them into u32s and emulate the math ops like @FractalFir described. This would already allow you to do a comparison and compute something with the Ordering returned. Only if the struct is written to or read from global memory do we actually need to be careful.
Also note that doing computation with 8/16/64 bits is a completely different feature from reading/writing 8/16/64 bits. You could have GPUs with support for 64 bit reads/writes but no native 64 bit integer ALU hardware. So you could get quite far with just instruction-level emulation.
At least 8/16 bit loads should be quite easy to emulate: just load 32 bits, shift and bitmask. The only potential issue may be buffer sizes not aligned to multiples of 4, though I'd say that would be quite weird on platforms that only support 32 bit loads. Stores are a lot more difficult, since you can't just load the 32 bit value, overwrite part of it and store it again: you could have parallel threads writing values right next to yours, making the writes race, and the Vulkan sync model has the option that you may only write into a buffer but never read from it.
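A minimal sketch of such an emulated load, assuming little-endian byte order within each 32 bit word (`words` and the indexing scheme are illustrative, not an actual rust-gpu API):

```rust
fn emulated_load_u8(words: &[u32], byte_index: usize) -> u32 {
    // Load the 32 bit word containing the byte, then shift and mask it out.
    let word = words[byte_index / 4];
    let shift = (byte_index % 4) * 8;
    (word >> shift) & 0xFF
}
```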
- computation
  - 8bit: extension VK_KHR_shader_float16_int8 with feature shaderInt8
  - 16bit: core 1.0 feature shaderInt16
  - 64bit: core 1.0 feature shaderInt64
- storage (with various features for the different storage types, most importantly storage buffers)
  - 8bit: extension VK_KHR_8bit_storage
  - 16bit: extension VK_KHR_16bit_storage
  - 64bit: ???
- atomics
  - 64bit: VK_KHR_shader_atomic_int64
> About handling u8 and u16 integers within structs: if they're only a local variable, we can just widen them into u32s and emulate the math ops like @FractalFir described. This would already allow you to do a comparison and compute something with the Ordering returned. Only if the struct is written to or read from global memory do we actually need to be careful.
This would work... but then the emulation would have to be a later pass, I guess. And the question really becomes: when should this emulation happen? I guess after mem-to-reg would be the best place. That complicates things, but is doable.
I think the OR is unnecessary in this code:

```rust
// Unsigned addition overflow check
let res = a + b;
let ovf = res < (a | b);
```

If res overflows, then it will be (a + b) mod 2^n = (a + b) - 2^n; since both a and b are smaller than 2^n, this means res is smaller than both a and b. And if there is no overflow, res = a + b is at least as large as each of them. So in order to check for overflow, it's enough to just check whether res is smaller than either a or b, leading to:

```rust
let res = a + b;
let ovf = res < a; // or res < b
```

If I'm right, the "price" goes down from 5 to 4 32-bit ops. :)
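If so, the earlier sketch would shrink accordingly (same [u32; 2] assumptions as emulated_u64_add above):

```rust
fn emulated_u64_add(lhs: [u32; 2], rhs: [u32; 2]) -> [u32; 2] {
    let low_res = lhs[0].wrapping_add(rhs[0]);
    // Overflow occurred iff the wrapped sum is smaller than one operand.
    let carry = (low_res < lhs[0]) as u32;
    [low_res, lhs[1].wrapping_add(rhs[1]).wrapping_add(carry)]
}
```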