Alternative to uint128
davidben opened this issue · 62 comments
The 64-bit fiat-c files rely on uint128, which doesn't work in MSVC. Instead, MSVC has its own intrinsics. (Irritatingly, they're different between x86_64 and aarch64. There's _umul128 on x86_64 and then __umulh on aarch64.)
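For concreteness, here is a minimal sketch (not fiat output; the helper name is made up) of how the same u64 × u64 → u128 multiply is typically spelled per toolchain:

```c
#include <stdint.h>

/* mul64x64_128 is an illustrative helper name, not something fiat-crypto emits. */
#if defined(_MSC_VER) && defined(_M_X64)
#include <intrin.h>
static void mul64x64_128(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi) {
    *lo = _umul128(a, b, hi);          /* x64 MSVC: one intrinsic yields both halves */
}
#elif defined(_MSC_VER) && defined(_M_ARM64)
#include <intrin.h>
static void mul64x64_128(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi) {
    *lo = a * b;                       /* low half is an ordinary multiplication */
    *hi = __umulh(a, b);               /* arm64 MSVC: separate intrinsic for the high half */
}
#else
static void mul64x64_128(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi) {
    unsigned __int128 t = (unsigned __int128)a * b;   /* GCC/Clang: native wide type */
    *lo = (uint64_t)t;
    *hi = (uint64_t)(t >> 64);
}
#endif
```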
@andres-erbsen mentioned that --split-multiret exists, which would get most of the way there. If we could regenerate fiat-c with that, then we could probably, at least as a start, just patch out the resulting multiplication intrinsic and see how it works. And then perhaps, from there, figure out how to make the output do it automatically.
(Some years ago, before Fiat outputted nice standalone files, Andres and I tried this and we instead crashed MSVC. But that was an older MSVC and Fiat's output has since changed, so I'm hopeful that it'll work now.)
On second thought, --no-wide-int is the more appropriate flag. Even so, using it makes x25519 2x slower on gcc12/zen3/linux. Using an intrinsic for adx does not seem to help. Using an intrinsic for mulx makes compilation fail with an inlining error; removing __inline__ makes x25519 slower still.
I don't have any great ideas for how to troubleshoot this; possible root causes range from utter compiler silliness to old code only compiling well due to algebraic reasoning over uint128. I'll try out some assembly code instead.
Hmm, that's irritating. I suppose we could generate both, but surely this is something any decent compiler could figure out. Do you have a copy of the new file?
Correction: code generated using --no-wide-int is on par with code generated without that flag on gcc iff addcarryx is replaced with an intrinsic and boringssl is built in release mode.
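For reference, a sketch of what "addcarryx replaced with an intrinsic" can look like, assuming the usual shape of fiat-generated addcarryx (names and the carry type are simplified here; on GCC/Clang the same intrinsic is also reachable via immintrin.h):

```c
#include <stdint.h>

#if defined(_MSC_VER) && defined(_M_X64)
#include <intrin.h>
/* Sum goes through out1, carry-out through out2; arg1 is the carry-in. */
static void fiat_addcarryx_u64(uint64_t *out1, uint8_t *out2,
                               uint8_t arg1, uint64_t arg2, uint64_t arg3) {
    *out2 = _addcarry_u64(arg1, arg2, arg3, out1);
}
#else
/* The existing uint128-based form that the intrinsic would replace
   (assumes a compiler with __int128). */
static void fiat_addcarryx_u64(uint64_t *out1, uint8_t *out2,
                               uint8_t arg1, uint64_t arg2, uint64_t arg3) {
    unsigned __int128 t = (unsigned __int128)arg2 + arg3 + arg1;
    *out1 = (uint64_t)t;
    *out2 = (uint8_t)(t >> 64);
}
#endif
```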
Assuming we want to match https://boringssl.googlesource.com/boringssl/+/aa31748bc84f0cd499e4b0337bf929b132aab1cc, we could add a --msvc flag that enables an adjustment to fiat-crypto/src/Stringification/C.v (lines 116 to 166 in 457cb5e) to include
#include <intrin.h>
#if defined(_M_X64)
#include <immintrin.h>
#endif
and emits the relevant definitions for the primitives (presumably this is easiest to do if I add a new primitive ident for #if).
Hm, but it's not entirely clear how this should be structured. Maybe we should add support for each backend to describe a collection of "here's the string body of this function", to be used if the user requests --primitives-via-intrinsics or something?
Also, I'm not wedded to the current structure if something else would be more convenient on the fiat side.
The main nuisance is that, when Andres and I tested it, generating MSVC-style code[*] for u128-capable targets produced dramatically worse code for Clang and GCC. It seems compilers are not very good at switching the u64 pairs back into a u128, and having them separate impedes some kind of analysis. :-(
Though it occurs to me we never tried teaching MSVC to fit the u128 APIs. That is, on MSVC, we could define u128 as a struct { uint64_t lo, hi; } and then implement that with its intrinsics. The question is how good MSVC is at promoting structs to registers. Maybe something to try. Also it's a little subtle because the abstraction would need to capture that we never truly compute u128 x u128, only u64 x u64 -> u128.
[*] That is, we tried implementing the "intrinsics" using u128 and then splitting the u128s into pairs of u64s to use a single abstraction. Naively, one would think this works because a u128 needs to be split into u64s by the compiler anyway!
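For reference, a rough sketch of that struct-based direction on MSVC x64 (names and layout are illustrative, not a drop-in):

```c
#include <stdint.h>
#include <intrin.h>

typedef struct { uint64_t lo, hi; } u128;   /* illustrative; not an existing fiat type */

/* The only multiplication the abstraction needs: u64 x u64 -> u128. */
static u128 u128_mul_u64_u64(uint64_t a, uint64_t b) {
    u128 r;
    r.lo = _umul128(a, b, &r.hi);           /* low half returned, high half via pointer */
    return r;
}

static u128 u128_add_u64(u128 a, uint64_t b) {
    u128 r;
    unsigned char c = _addcarry_u64(0, a.lo, b, &r.lo);
    _addcarry_u64(c, a.hi, 0, &r.hi);       /* propagate the carry into the high limb */
    return r;
}
```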
That is, on MSVC, we could define u128 as a struct { uint64_t lo, hi; } and then implement that with its intrinsics.
This is what we do in libsecp256k1. See files https://github.com/bitcoin-core/secp256k1/blob/master/src/int128_struct_impl.h, https://github.com/bitcoin-core/secp256k1/blob/master/src/int128_struct.h and https://github.com/bitcoin-core/secp256k1/blob/master/src/int128.h.
This uint128 struct implementation was formally proven correct in VST by @roconnor-blockstream (at least when it was initially added), see bitcoin-core/secp256k1#1000 (comment). However, we don't have the proofs in our repo, and we don't check them continuously or adapt them to code changes. Perhaps @roconnor-blockstream maintains an up-to-date version?
Edit: Maybe adding code that uses a generic interface such as the one in https://github.com/bitcoin-core/secp256k1/blob/master/src/int128.h would be a good first step, even without including a (formally verified) implementation. As I understand it, it would not really worsen the guarantees of fiat-crypto, because the conversion to strings is not covered by the proofs anyway. This would help us move forward with the integration in libsecp256k1.
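For illustration, here is roughly how one accumulate-multiply step looks when written against the native type versus against such an interface (helper names as in int128.h; a sketch, not generated output):

```c
#include <stdint.h>
#include "int128.h"   /* provides secp256k1_uint128 and the helpers used below */

/* d += (u128)a * b, returning the low limb: native __int128 version. */
static uint64_t step_native(uint64_t d_lo, uint64_t a, uint64_t b) {
    unsigned __int128 d = d_lo;
    d += (unsigned __int128)a * b;
    return (uint64_t)d;
}

/* The same step written against the generic interface. */
static uint64_t step_generic(uint64_t d_lo, uint64_t a, uint64_t b) {
    secp256k1_uint128 d;
    secp256k1_u128_load(&d, 0, d_lo);       /* hi = 0, lo = d_lo */
    secp256k1_u128_accum_mul(&d, a, b);     /* d += (u128)a * b */
    return secp256k1_u128_to_u64(&d);
}
```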
Running fiat-crypto with --no-wide-int (like here) generates code without uint128. However, this code is slower than code that uses uint128 on some compilers. Further, the way to tease the compiler into producing decently fast code seems to be a rabbithole; see here for an incomplete solution. Following that example, we could have a three-level system:
- assembly for x86-64 with ADX, BMI, BMI2
- uint128-based code as currently generated, for compilers that support it
- code with fiat-crypto --no-wide-int (uint128 inlined) for other compilers. This code would rely on a handwritten file defining operations on 64-bit integers in compiler- and architecture-specific ways (see the sketch after this list): using multi-return intrinsics such as *out1 = _umul128(arg1, arg2, out2) whenever available, falling back to single-return intrinsics such as *out2 = __umulh(arg1, arg2) if needed, and falling back to polyfills implemented using uint64_t if absolutely necessary (WIP here).
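A sketch of what the multiplication helper in such a handwritten file might look like, with the three fallback tiers (the macro guards and helper name are indicative only):

```c
#include <stdint.h>

#if defined(_MSC_VER) && defined(_M_X64)
#include <intrin.h>
static void mulx_u64(uint64_t *out1, uint64_t *out2, uint64_t a, uint64_t b) {
    *out1 = _umul128(a, b, out2);           /* multi-return intrinsic */
}
#elif defined(_MSC_VER) && defined(_M_ARM64)
#include <intrin.h>
static void mulx_u64(uint64_t *out1, uint64_t *out2, uint64_t a, uint64_t b) {
    *out1 = a * b;
    *out2 = __umulh(a, b);                  /* single-return intrinsic for the high half */
}
#else
static void mulx_u64(uint64_t *out1, uint64_t *out2, uint64_t a, uint64_t b) {
    /* uint64_t-only polyfill: 32x32 partial products, no data-dependent branches */
    uint64_t al = a & 0xffffffffu, ah = a >> 32, bl = b & 0xffffffffu, bh = b >> 32;
    uint64_t ll = al * bl, lh = al * bh, hl = ah * bl, hh = ah * bh;
    uint64_t mid = (ll >> 32) + (lh & 0xffffffffu) + (hl & 0xffffffffu);
    *out1 = (mid << 32) | (ll & 0xffffffffu);
    *out2 = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
}
#endif
```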
I would be happy to assist with all required fiat-crypto adaptations and proofs of polyfills, but I would very much appreciate it if somebody else did the job of hunting down appropriate incantations to get fast full addition, subtraction, multiplication, and cmov for 64-bit (and 32-bit?) numbers for different targets. I propose we'd then
- check a huge file with ifdefs into fiat-crypto (or perhaps multiple ifdef-included files would be better?)
- have non-uint128-based implementations generated by fiat-crypto #include this file
- copy-paste all three files to libsecp256k1 and other users of fiat-crypto-generated code
Here's a mashup of fiat-crypto secp256k1 C code with uint128 only used to implement full 64-bit arithmetic: https://gist.github.com/andres-erbsen/7dd3ce006f7361840225f081bcdd9ad7 This should be a good starting point for experimentation with intrinsics and such.
That is, on MSVC, we could define u128 as a struct { uint64_t lo, hi; } and then implement that with its intrinsics.
Ah that could also be worth trying out. We know that Clang and GCC generate worse code when MSVC's free-floating u64s are implemented in terms of u128, but we didn't try the converse. If u128 is implemented using a struct in just MSVC, presumably that won't impact GCC/Clang and hopefully MSVC can promote the struct to registers. Then we could do it in one file.
Indeed, polyfilling u128 sounds good as well. In case we do both, --no-wide-int would be truly optional for templates that don't otherwise need intrinsics, whereas saturated arithmetic would still need intrinsics / other polyfills.
If u128 is implemented using a struct in just MSVC
Restricting this to MSVC is what we did, and it works rather well. __int128 is the "intrinsic" way on gcc/clang, so there's no need to use something else for these compilers.
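For reference, the dispatch itself can be tiny; a minimal sketch (the macro test and typedef names are illustrative, not the ones secp256k1 actually uses):

```c
#include <stdint.h>

#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 u128;             /* gcc/clang: the native wide type */
#else
typedef struct { uint64_t lo, hi; } u128;   /* MSVC: struct backed by intrinsics */
#endif
```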
This should be a good starting point for experimentation with intrinsics and such.
At least the docs are good: https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170 I think we could help here in general, but I don't have the time right now.
Here's a mashup of fiat-crypto secp256k1 C code with uint128 only used to implement full 64-bit arithmetic: gist.github.com/andres-erbsen/7dd3ce006f7361840225f081bcdd9ad7 This should be a good starting point for experimentation with intrinsics and such.
I think these are all relevant intrinsics on x64 MSVC for this code:
- _addcarry_u64, or _addcarryx_u64 on ADX
- _subborrow_u64
- __umulh with _umul128, or _mulx_u64 on BMI2
If the goal is polyfilling u128, these are interesting, too (see the sketch below):
- __shiftleft128, __shiftright128
And maybe these are interesting in general, I don't know:
- _shlx_u64 on BMI2
- _shrx_u64 on BMI2
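For example, a right shift of a two-limb u128 by 0 < n < 64 could be polyfilled roughly like this (struct and names illustrative):

```c
#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128;

#if defined(_MSC_VER) && defined(_M_X64)
#include <intrin.h>
static u128 u128_shr(u128 a, unsigned char n) {   /* requires 0 < n < 64 */
    u128 r;
    r.lo = __shiftright128(a.lo, a.hi, n);  /* low limb gets bits shifted in from hi */
    r.hi = a.hi >> n;
    return r;
}
#else
static u128 u128_shr(u128 a, unsigned char n) {   /* requires 0 < n < 64 */
    u128 r;
    r.lo = (a.lo >> n) | (a.hi << (64 - n));
    r.hi = a.hi >> n;
    return r;
}
#endif
```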
Indeed, polyfilling u128 sounds good as well.
Would it be feasible to generate a mashup of secp256k1 C that uses functions like u128_mul, u128_add instead of operators on the native type? Then we could experiment with implementing it, using intrinsics and polyfills. I think our existing u128 code is a good starting point. Currently, it has only the ops we need in our code. For example, it doesn't have addition because our code is written to use accum_mul only. But the missing ops should be easy to add.
Would it be feasible to generate a mashup of secp256k1 C that uses functions like u128_mul, u128_add instead of operators on the native type?
This should be pretty easy on the Fiat Crypto codegen side: add an option --explicit-wide-int-functions that enables an extra rewriting pass that rewrites double-width operations to Z.add_modulo, etc., and add these functions to the collection of primitive functions that get searched for / processed / printed.
One detai
Bah wrong button
One detail to keep in mind: the natural mul function for u128 is not u128 x u128 -> u128, but u64 x u64 -> u128. That's the operation you get from the CPU and what MSVC gives you intrinsics for. (Indeed, we're currently relying on GCC and Clang to recognize the lower bound on the casted inputs to generate fast and constant-time code.)
Addition is a bit more flexible, but if there are a few variants where the inputs have tighter bounds, it might help the compiler along slightly, depending on how smart MSVC is. That's probably a place to iterate on.
Also, since u128 will be abstracted, the casts will also need to go through functions.
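For example (hypothetical helper names), the casts that today appear as (uint128)x and (uint64_t)x would become calls like:

```c
#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128;   /* illustrative abstract type */

static u128     u128_from_u64(uint64_t x) { u128 r; r.lo = x; r.hi = 0; return r; }
static uint64_t u64_from_u128(u128 x)     { return x.lo; }   /* truncating cast */
static uint64_t u128_hi_u64(u128 x)       { return x.hi; }   /* (uint64_t)(x >> 64) */
```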
Hi,
This should be pretty easy on the Fiat Crypto codegen side: add an option --explicit-wide-int-functions that enables an extra rewriting pass that rewrites double-width operations to Z.add_modulo, etc, and add these functions to the collection of primitive functions that get searched for / processed / printed.
I'd just like to kindly bump this and ask what the status of the rewrite pass --explicit-wide-int-functions would be. Jason, did you want to take care of this?
I feel that, if we want to advance on bitcoin-core/secp256k1#1319 (using Fiat-Crypto generated C code in Bitcoin Core), we would need this.
Thanks!
No, I haven't had a chance to work on this, and don't expect to have time soon. @andres-erbsen are you working on this?
No, and I don't understand your suggestion. What does Z.add_modulo have to do with it?
I sketched up a bedrock2-style version that passes uint128 by address at https://godbolt.org/z/qe5veE4dE and it seems that clang allocates a needless stack frame in that style. Passing uint128 by pointer (or probably by value) does not have that issue, but that is not straightforward to support in Bedrock2. Thus the choice we make here potentially has implications for the goal of replacing the unverified C backend with an (improved) Bedrock2 backend.
No, and I don't understand your suggestion. What does Z.add_modulo have to do with it?
Probably Z.add_modulo is wrong, and I should have written Z.add_carry_full or whatever.
I am further leaning against designing fiat-crypto around uint128 after learning more about it. Yes, it's currently 10% faster than our next best alternative on clang, but it's also nonstandard without a stable ABI and would require legwork to support in a more rigorous Bedrock2-based C backend. @dderjoel would you be willing to benchmark and investigate how well the approach based on mulx-style intrinsics works for secp256k1?
Here's a mashup of fiat-crypto secp256k1 C code with uint128 only used to implement full 64-bit arithmetic: https://gist.github.com/andres-erbsen/7dd3ce006f7361840225f081bcdd9ad7 This should be a good starting point for experimentation with intrinsics and such.
Sure, more than happy to benchmark on Monday. Which implementation should I then use for the #error implement me placeholders? Just remove the macro?
And then against which one do we benchmark? The default Fiat version, i.e. the one with uint128s? I believe that would then be the "fiat_c" column in bitcoin-core/secp256k1#1261 (comment)
Here's the workbench I am using to get to the bottom of the instruction-scheduling issue. It includes a grabbag of ifdefs for full addition and multiplication. _subborrow_u64 should be a thing if you need it.
Please use your judgement about what target and compiler to benchmark with; I imagine cases where the asm would be used instead may be less important.
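For completeness, a sketch of the subtraction counterpart of the addcarry case above (illustrative name; computes arg2 - arg3 - arg1, with a uint64_t-only fallback):

```c
#include <stdint.h>

#if defined(_MSC_VER) && defined(_M_X64)
#include <intrin.h>
static void fiat_subborrowx_u64(uint64_t *out1, uint8_t *out2,
                                uint8_t arg1, uint64_t arg2, uint64_t arg3) {
    *out2 = _subborrow_u64(arg1, arg2, arg3, out1);   /* arg2 - arg3 - arg1 */
}
#else
static void fiat_subborrowx_u64(uint64_t *out1, uint8_t *out2,
                                uint8_t arg1, uint64_t arg2, uint64_t arg3) {
    uint64_t d = arg2 - arg3;
    uint8_t b = (arg2 < arg3);            /* borrow from the subtraction */
    *out1 = d - arg1;
    *out2 = b | (d < (uint64_t)arg1);     /* borrow from subtracting the carry-in */
}
#endif
```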
Gday,
So if I understand correctly, we want something like this, which is essentially the code from your gist with the functions fiat_secp256k1_dettman_mulx_u64 and fiat_secp256k1_dettman_addcarryx_u64 filled in from your workbench.
It seems to compile successfully on all the compilers you've had in your workbench.
I've run that now through my benchmarking suite; the averages over my ten machines are below.
The interesting columns are probably fiat_c (file here) and fiat_c_narrow_int (file here).
Using gcc 12.1 with the default -O2 -std=c89:
implementation | default_asm | default_c | default_c52 | fiat_c | fiat_c_narrow_int | fiat_cryptopt |
---|---|---|---|---|---|---|
ecmult_gen | 15.9553 | 14.8564 | 15.7973 | 15.0034 | 16.7601 | 14.3581 |
ecmult_const | 30.134 | 28.0089 | 30.3232 | 28.1999 | 32.2136 | 26.4355 |
ecmult_1p | 23.8541 | 21.8671 | 23.6126 | 22.0921 | 25.1787 | 20.6224 |
ecmult_0p_g | 17.0685 | 15.6651 | 16.9493 | 15.7031 | 17.8215 | 14.7483 |
ecmult_1p_g | 13.8681 | 12.7258 | 13.7497 | 12.8885 | 14.7225 | 12.1106 |
field_half | 0.00254854 | 0.00250171 | 0.00246726 | 0.00246905 | 0.0024302 | 0.00249464 |
field_normalize | 0.00743748 | 0.00739759 | 0.00735383 | 0.00729524 | 0.00731106 | 0.00743008 |
field_normalize_weak | 0.00298519 | 0.00294793 | 0.00292643 | 0.00289695 | 0.00291171 | 0.00295628 |
field_sqr | 0.014061 | 0.0125477 | 0.0137904 | 0.0121888 | 0.0155662 | 0.0120083 |
field_mul | 0.0170919 | 0.0142766 | 0.0156943 | 0.0144284 | 0.0184914 | 0.0145763 |
field_inverse | 1.40989 | 1.39391 | 1.39632 | 1.38617 | 1.38785 | 1.63071 |
field_inverse_var | 0.916405 | 0.907523 | 0.909371 | 0.902676 | 0.905265 | 0.918417 |
field_is_square_var | 1.20048 | 1.201 | 1.20416 | 1.18992 | 1.19752 | 1.2016 |
field_sqrt | 3.87876 | 3.54522 | 3.83995 | 3.46558 | 4.22192 | 3.36333 |
with clang 15.0.7 and -O2 -std=c89:
implementation | default_asm | default_c | default_c52 | fiat_c | fiat_c_narrow_int | fiat_cryptopt |
---|---|---|---|---|---|---|
ecmult_gen | 14.6136 | 14.8385 | 15.5622 | 14.4397 | 14.4943 | 13.3405 |
ecmult_const | 28.9408 | 29.0521 | 30.6375 | 28.0475 | 28.5102 | 26.248 |
ecmult_1p | 22.7747 | 22.5253 | 23.7695 | 21.6406 | 22.1033 | 20.7139 |
ecmult_0p_g | 16.4172 | 16.1748 | 16.9559 | 15.5158 | 15.8491 | 14.7478 |
ecmult_1p_g | 13.3338 | 13.1595 | 13.8111 | 12.7316 | 13.0205 | 12.1511 |
field_half | 0.00240605 | 0.00242857 | 0.00233401 | 0.00234214 | 0.00236749 | 0.00241453 |
field_normalize | 0.00700024 | 0.00692887 | 0.00695646 | 0.00697312 | 0.00697815 | 0.0069435 |
field_normalize_weak | 0.0028239 | 0.00280764 | 0.00282323 | 0.00279338 | 0.00279837 | 0.00280257 |
field_sqr | 0.0136569 | 0.0131805 | 0.0138059 | 0.0130322 | 0.0122454 | 0.0121549 |
field_mul | 0.0169676 | 0.0171089 | 0.0166209 | 0.0158057 | 0.0154979 | 0.0142874 |
field_inverse | 1.459 | 1.44117 | 1.43697 | 1.42892 | 1.43199 | 1.62015 |
field_inverse_var | 0.961401 | 0.95807 | 0.956101 | 0.952315 | 0.956608 | 0.965142 |
field_is_square_var | 1.26571 | 1.25726 | 1.25681 | 1.24756 | 1.25254 | 1.26814 |
field_sqrt | 3.773 | 3.5024 | 3.77622 | 3.5255 | 3.4172 | 3.36683 |
with gcc 12.1 and -O2 -march=native -mtune=native:
implementation | default_asm | default_c | default_c52 | fiat_c | fiat_c_narrow_int | fiat_cryptopt |
---|---|---|---|---|---|---|
ecmult_gen | 16.5911 | 16.0718 | 17.0091 | 16.1057 | 17.4993 | 15.1004 |
ecmult_const | 30.9681 | 30.3632 | 32.0019 | 29.9378 | 33.1768 | 27.6134 |
ecmult_1p | 24.3542 | 23.7422 | 24.999 | 23.1871 | 25.9921 | 21.3932 |
ecmult_0p_g | 17.6799 | 17.0362 | 17.9861 | 16.6104 | 18.6029 | 15.2307 |
ecmult_1p_g | 14.2199 | 13.8549 | 14.6071 | 13.575 | 15.106 | 12.4385 |
field_half | 0.00267538 | 0.00245599 | 0.00245538 | 0.00243217 | 0.00240507 | 0.00247985 |
field_normalize | 0.00809459 | 0.0079052 | 0.00798932 | 0.00796996 | 0.00790512 | 0.00809351 |
field_normalize_weak | 0.00300635 | 0.00289828 | 0.0029111 | 0.00292142 | 0.00288426 | 0.00300672 |
field_sqr | 0.013964 | 0.0140265 | 0.0154971 | 0.0135017 | 0.0169648 | 0.0122041 |
field_mul | 0.0173665 | 0.0158165 | 0.0169365 | 0.0171269 | 0.0192244 | 0.0148267 |
field_inverse | 1.47174 | 1.43223 | 1.43721 | 1.45215 | 1.42826 | 1.70683 |
field_inverse_var | 0.928208 | 0.899495 | 0.901036 | 0.908532 | 0.898208 | 0.9305 |
field_is_square_var | 1.2196 | 1.20073 | 1.19762 | 1.2069 | 1.19571 | 1.22918 |
field_sqrt | 3.94505 | 3.93775 | 4.29111 | 3.79486 | 4.62069 | 3.44887 |
with clang 15.0.7 and -O2 -march=native -mtune=native:
tbd (something segfaults)
Thank you for the numbers! My immediate impression is that clang is overall the fastest here (unsurprisingly), and with clang, using native uint128 saves <3% and is slower in some unimportant cases. It's also not the first time that clang segfaults on fiat-crypto code.
I am ignoring the benchmarks of individual fiat-crypto-generated functions, as measuring small stuff meaningfully is hard and we have larger functions here. Then on field_inverse the difference is ~0.2%, suggesting that field square is pretty much in good shape. (I would have guessed squaring is the function where naive scheduling would do the best given our code, but mul doesn't look that bad either, so I am hesitant to conclude anything here.)
As for GCC, I don't know. It's clearly faster with uint128, but I have not investigated this and it's possible another implementation of mulx/adc would do better. I am also not that inclined to investigate given that non-cpu-targeting GCC and clang beat cpu-targeting GCC -- doing worse when given more assumptions is a bit silly to begin with.
All in all I am leaning towards saying that uint128 is not worth supporting.
Oh, it looks like clang trunk vectorizes something here, so -fno-slp-vectorize might be worth a shot.
Oh, it looks like clang trunk vectorizes something here, so -fno-slp-vectorize might be worth a shot.
Do you mean the switch could make clang not segfault?
I wasn't guessing that; just the vector-register usage in the generated code looks silly.
Do you have a repro for the clang segfault? I might be able to get a clang expert to look at it.
It is actually part of the cryptopt-generated code. It does not segfault when I do not specify the -march= flag.
git clone https://github.com/dderjoel/secp256k1
cd secp256k1
git checkout only-asm
./autogen.sh
CC=clang CFLAGS='-march=skylake' ./configure --with-asm=x86_64
make clean bench_ecmult
./bench_ecmult
It segfaults at the ret sandwiched between two vzerouppers:
0x555555585c52 <secp256k1_ecmult_odd_multiples_table+5298>: pop r15
0x555555585c54 <secp256k1_ecmult_odd_multiples_table+5300>: pop rbp
0x555555585c55 <secp256k1_ecmult_odd_multiples_table+5301>: vzeroupper
=> 0x555555585c58 <secp256k1_ecmult_odd_multiples_table+5304>: ret
0x555555585c59 <secp256k1_ecmult_odd_multiples_table+5305>: vzeroupper
I believe it is because I use rbp in the asm, for two reasons:
- Using CC=gcc CFLAGS='-march=skylake -O3 -mtune=skylake' ./configure --with-asm=x86_64, it complains that I use bp in the assembly:
In file included from src/secp256k1.c:27,
                 from src/bench_ecmult.c:8:
src/field_impl.h: In function 'secp256k1_fe_sqrt':
src/field_impl.h:158:1: error: bp cannot be used in 'asm' here
  158 | }
      | ^
make: *** [Makefile:1214: src/bench_ecmult-bench_ecmult.o] Error 1
Using only gcc -O2 -march=skylake -mtune=skylake or gcc -O3 -mtune=skylake, it neither complains nor segfaults.
- I have seen earlier (cannot reproduce it anymore) that the generated code does not respect rbp being clobbered. That is here: in line 24, rbp is used for some value, but clang replaces q0 with some [rbp + 0x8] or so... (I'm not 100% sure if it was q0 or q1 or q2.)
Oh, you mean our benchmark segfaults, not clang itself! That could as well be a bug on our side...
Oh, you mean our benchmark segfaults, not clang itself! That could as well be a bug on our side...
Just updated the comment. That also means that we should not merge the code at the moment. There are no tests in the CI using -march=skylake with clang, are there? Or a test with gcc -O3 -march=skylake?
Yep, I think that gcc warning is onto something. Perhaps the =&m temporaries get encoded as rbp-relative references, which are bogus when rbp is temporarily repurposed for arithmetic use? I imagine that'd be a requirement for the conflicting use case of rsp-modifying asm. Concretely, for example here, imagine %q0 being ~~-0x40(%%rsp)~~ -0x40(%%rbp).
Also, does libsecp256k1 support building with -fno-omit-frame-pointer?
Concretely, for example here, imagine %q0 being -0x40(%%rsp).
that should be okay, no? It's wrong if it's based off rbp, e.g. %q0 being -0x40(%%rbp). Or am I misunderstanding?
Yep, I think this is llvm/llvm-project#58528.
When I compile with -march=skylake -O0 -g and run this in gdb, the disas command tells me that the crash is in
=> 0x000055555556104c <+156>: mov %rdi,-0x38(%rbp)
which corresponds to the line
"mov %%rdi,%q0\n"
in the source. (This line: https://github.com/dderjoel/secp256k1/blob/c010c156424d5c06f9b6d16a839e04bbea5b31c1/src/third_party/field_5x52_asm_impl.h#L35)
Look at the "addresses" of the variables here:
Program received signal SIGSEGV, Segmentation fault.
0x000055555556104c in secp256k1_fe_mul_inner (r=<error reading variable: Cannot access memory at address 0xffffffffffffffb0>,
a=<error reading variable: Cannot access memory at address 0xffffffffffffffa8>,
b=<error reading variable: Cannot access memory at address 0xffffffffffffffa0>) at src/third_party/field_5x52_asm_impl.h:13
Indeed, it seems that using rbp for arithmetic is not possible here, even if one lists it as clobbered.
(Happy to move this discussion to the PR; it's a bit off-topic here.)
Okay, thanks for investigating. (extended answer moved to PR)
But let me quickly come back to the topic of uint128s. Based on Andres'
All in all I am leaning towards saying that uint128 is not worth supporting.
I feel we want something like the mashup as the default, and, if one wants, --wide-int to emit code with uint128s? Do we want me to run again to get more performance numbers, especially with clang -O3 -march=native? (I'll fix my scripts to exclude the assembly for now then.)
(Sorry for getting off topic again)
For bitcoin-core/secp256k1#1319, I think we would then want code like the one in the current benchmark, but generated entirely from the fiat pipeline, right?
Yeah, that sounds good; we can keep uint128 behind a flag in cases where it's already implemented, but off by default. The Bedrock2 backend never used uint128 outside its mulx function. In the glorious future where the Bedrock2-based backend becomes the default, we may consider dropping the wholly unverified C backend.
I would appreciate performance numbers, but no rush. I'm out for a much-needed vacation starting this weekend so probably I won't look at anything until two weeks from now.
If I were doing bitcoin-core/secp256k1#1319, I would rerun fiat-crypto with the command at the top of the benchmark file (here), diff the output with the benchmark file, and then flag any differences for manual review (I hope it's just the ifdef'ed functions). It's not like we can prove anything useful about intrinsics anyway.
All in all I am leaning towards saying that uint128 is not worth supporting.
Yeah that sounds good, we can keep uint128 behind a flag in cases its already implemented, but off by default.
Hm, I see. I can't speak for the other contributors, but I think this makes the fiat-crypto C output much less attractive for libsecp256k1. The reasons are:
- We have a formally verified polyfill for uint128_t using a struct (for MSVC), and we use uint128 in other places in our code. (Namely, in the scalar field implementation. This could in principle also be replaced by fiat-crypto output in the long term, but that needs to be considered separately.)
- We care about performance on GCC, and the benchmarks show that we easily lose > 10% for EC multiplication.
I think one possible way forward for us is to use fiat-crypto's C output (with uint128) and rewrite it manually to use our generic uint128 interface. Since the backend is not formally verified, this would not forego any formal guarantees. The result of the rewriting should be equivalent to our code. In fact, if it turns out to be equivalent, then we don't even need to replace our code. (That sounds a bit silly or even sad then, but it still would give us a lot of confidence that our code implements a correct algorithm.)
Thank you for the feedback. I am thinking whether there's something we can do along the lines of what you suggested but less manual. The Coq code that prints the C you would be rewriting is here and if only it had types there it could just be changed to print uint128 operations as function calls. For multiplication it would probably suffice to check whether one argument has a cast to uint128, and maybe a similar recursive check would work for uint128 add, but I am not sure. If this sounds like it would do the thing for you, I am happy to give it a try.
On what architectures do you care about performance with GCC?
Thank you for the feedback. I am thinking whether there's something we can do along the lines of what you suggested but less manual. The Coq code that prints the C you would be rewriting is here and if only it had types there it could just be changed to print uint128 operations as function calls. For multiplication it would probably suffice to check whether one argument has a cast to uint128, and maybe a similar recursive check would work for uint128 add, but I am not sure. If this sounds like it would do the thing for you, I am happy to give it a try.
Okay, yes, this is what I brought up earlier, and this would, of course, be optimal for us.
On what architectures do you care about performance with GCC?
This is probably not a very focused answer, but I think the story here is that we aim for decent performance on GCC and clang. Official Bitcoin Core builds use GCC for Linux and Windows, and clang for macOS, which is something we keep in mind in libsecp256k1, but we really try to offer a portable library that works well more or less everywhere, and we don't want to force our users to use a specific compiler. Architectures where we really care about performance are x86_64 and maybe ARM64. Hardware wallets are also interesting, but the landscape there is a bit more complicated (some are ARM32, for example).
Ok I started a prototype to print generic uint128 code at andres-erbsen@3626fec and kicked off a compile. This exact change is not enough, but if it does the right thing for the handled case, more of the same should be straightforward. I forget what the expected compilation time for secp256k1 in fiat-crypto is, but seconds wasn't enough. (The point in comment you link to still misses me, but alas, I think I now understand what you would like to happen here.)
The performance answer is along the lines of what I was hoping for, thanks. Just to clarify: is performance with GCC on x86-64 important or not given that there is also an assembly implementation?
I'm back with more numbers. I've fixed the segfault in fiat_cryptopt and could run all the benchmarks across my machines again.
I'll be comparing here {gcc,clang} {-O3,-O2} -march=native -mtune=native, and the default compilations {gcc,clang}, which should result in -O2, with a more portable binary.
Spoiler: with clang, fiat_c_narrow_int is actually a tad faster than the current C implementation.
clang (default)
implementation | default_asm | default_c | default_c52 | fiat_c | fiat_c_narrow_int | fiat_cryptopt |
---|---|---|---|---|---|---|
ecmult_gen | 14.4 | 14.8685 | 15.509 | 14.4466 | 14.2407 | 13.1947 |
ecmult_const | 28.6379 | 28.9282 | 30.4886 | 28.0935 | 28.1623 | 26.1167 |
ecmult_1p | 22.6069 | 22.4785 | 23.6166 | 21.6802 | 22.1214 | 20.6979 |
ecmult_0p_g | 16.4805 | 16.2687 | 17.058 | 15.6537 | 15.9589 | 15.0235 |
ecmult_1p_g | 13.2241 | 13.1847 | 13.802 | 12.7043 | 12.893 | 12.2081 |
field_half | 0.00245578 | 0.0023995 | 0.00228864 | 0.00237679 | 0.0024416 | 0.00238009 |
field_normalize | 0.00694667 | 0.006916 | 0.00698636 | 0.00692382 | 0.0069511 | 0.00692881 |
field_normalize_weak | 0.0028189 | 0.0028261 | 0.00281602 | 0.00278509 | 0.00279863 | 0.00279448 |
field_sqr | 0.0135209 | 0.0127022 | 0.0138531 | 0.0128885 | 0.0122077 | 0.0120565 |
field_mul | 0.0168353 | 0.0164905 | 0.0167452 | 0.0157152 | 0.0155041 | 0.0144297 |
field_inverse | 1.43893 | 1.42658 | 1.43585 | 1.42449 | 1.43356 | 1.61813 |
field_inverse_var | 0.95799 | 0.951122 | 0.952589 | 0.949808 | 0.953455 | 0.960287 |
field_is_square_var | 1.25882 | 1.25678 | 1.25474 | 1.24586 | 1.25356 | 1.26345 |
field_sqrt | 3.73555 | 3.4951 | 3.78312 | 3.51957 | 3.40714 | 3.33102 |
clang -O2 -march=native -mtune=native
implementation | default_asm | default_c | default_c52 | fiat_c | fiat_c_narrow_int | fiat_cryptopt |
---|---|---|---|---|---|---|
ecmult_gen | 15.635 | 15.5033 | 16.0575 | 15.0982 | 14.5795 | 14.0249 |
ecmult_const | 29.3735 | 28.4725 | 29.9029 | 27.3628 | 27.5755 | 26.5836 |
ecmult_1p | 23.3621 | 22.3728 | 23.4829 | 21.4641 | 22.0046 | 21.3766 |
ecmult_0p_g | 17.0755 | 16.1995 | 16.9391 | 15.384 | 16.6517 | 15.6403 |
ecmult_1p_g | 13.6394 | 13.0701 | 13.6493 | 12.6048 | 12.8295 | 12.477 |
field_half | 0.00518533 | 0.00509934 | 0.00501634 | 0.00509551 | 0.00509858 | 0.00517301 |
field_normalize | 0.00478045 | 0.004775 | 0.00477195 | 0.00474356 | 0.00473628 | 0.00472487 |
field_normalize_weak | 0.00283434 | 0.00282166 | 0.00284423 | 0.00280772 | 0.00283141 | 0.00282065 |
field_sqr | 0.0136087 | 0.0142888 | 0.0147477 | 0.0134873 | 0.0117804 | 0.0120515 |
field_mul | 0.0169551 | 0.0169624 | 0.0176455 | 0.0165582 | 0.0147279 | 0.0144852 |
field_inverse | 1.55314 | 1.52646 | 1.51292 | 1.50915 | 1.51043 | 1.64765 |
field_inverse_var | 0.923946 | 0.922127 | 0.923942 | 0.916541 | 0.916457 | 0.927074 |
field_is_square_var | 1.21189 | 1.22217 | 1.21449 | 1.21058 | 1.20822 | 1.22949 |
field_sqrt | 3.81432 | 3.85748 | 4.02828 | 3.72946 | 3.3191 | 3.35218 |
clang -O3 -march=native -mtune=native
implementation | default_asm | default_c | default_c52 | fiat_c | fiat_c_narrow_int | fiat_cryptopt |
---|---|---|---|---|---|---|
ecmult_gen | 15.5047 | 15.2393 | 15.731 | 14.99 | 14.5696 | 14.1213 |
ecmult_const | 29.3673 | 28.0945 | 29.2459 | 27.2205 | 27.6556 | 26.6799 |
ecmult_1p | 23.3116 | 22.1069 | 23.0035 | 21.4319 | 22.085 | 21.3718 |
ecmult_0p_g | 17.012 | 15.8731 | 16.55 | 15.304 | 16.2868 | 15.6925 |
ecmult_1p_g | 13.6059 | 13.0403 | 13.5035 | 12.587 | 12.8417 | 12.4987 |
field_half | 0.00522527 | 0.00511302 | 0.00503948 | 0.00498669 | 0.00505752 | 0.00513952 |
field_normalize | 0.00475579 | 0.00475721 | 0.00475133 | 0.00471258 | 0.00473416 | 0.00479262 |
field_normalize_weak | 0.00282931 | 0.00284143 | 0.00280667 | 0.00279738 | 0.00282548 | 0.0028323 |
field_sqr | 0.0136073 | 0.014192 | 0.0146883 | 0.0134972 | 0.01179 | 0.012144 |
field_mul | 0.0169033 | 0.0169013 | 0.0177466 | 0.0165226 | 0.0146773 | 0.0144688 |
field_inverse | 1.51547 | 1.52616 | 1.51278 | 1.50301 | 1.50377 | 1.65712 |
field_inverse_var | 0.917597 | 0.920323 | 0.912464 | 0.910969 | 0.915148 | 0.932548 |
field_is_square_var | 1.21427 | 1.21174 | 1.20388 | 1.19847 | 1.19944 | 1.23266 |
field_sqrt | 3.82321 | 3.10106 | 3.30623 | 3.68093 | 3.30375 | 3.39914 |
gcc (default)
implementation | default_asm | default_c | default_c52 | fiat_c | fiat_c_narrow_int | fiat_cryptopt |
---|---|---|---|---|---|---|
ecmult_gen | 15.8998 | 15.0468 | 15.8134 | 15.0307 | 16.8996 | 14.7032 |
ecmult_const | 30.1216 | 28.1942 | 30.3036 | 28.2912 | 32.3142 | 27.2834 |
ecmult_1p | 23.8669 | 21.9609 | 23.6295 | 22.2062 | 25.2611 | 21.267 |
ecmult_0p_g | 17.0692 | 15.8632 | 16.9321 | 15.8502 | 17.8889 | 15.1963 |
ecmult_1p_g | 13.9171 | 12.8349 | 13.7314 | 12.8807 | 14.7508 | 12.4131 |
field_half | 0.00260147 | 0.00239082 | 0.00246679 | 0.0025392 | 0.00246015 | 0.00242672 |
field_normalize | 0.00745444 | 0.00746962 | 0.00734761 | 0.00730362 | 0.00734994 | 0.00747002 |
field_normalize_weak | 0.00297391 | 0.0029576 | 0.0029316 | 0.00292127 | 0.00290907 | 0.00294705 |
field_sqr | 0.0140496 | 0.0126285 | 0.0137609 | 0.0124719 | 0.0155721 | 0.0122619 |
field_mul | 0.0170276 | 0.0143555 | 0.0156142 | 0.0146069 | 0.0185516 | 0.0147299 |
field_inverse | 1.40303 | 1.40325 | 1.39395 | 1.38897 | 1.38745 | 1.62885 |
field_inverse_var | 0.925005 | 0.917206 | 0.911152 | 0.905561 | 0.908435 | 0.918524 |
field_is_square_var | 1.2015 | 1.20861 | 1.19965 | 1.19366 | 1.19798 | 1.20488 |
field_sqrt | 3.87766 | 3.57096 | 3.83902 | 3.44814 | 4.24594 | 3.42435 |
gcc -O2 -march=native -mtune=native
implementation | default_asm | default_c | default_c52 | fiat_c | fiat_c_narrow_int | fiat_cryptopt |
---|---|---|---|---|---|---|
ecmult_gen | 16.721 | 16.2147 | 17.1447 | 16.1082 | 17.4143 | 15.2807 |
ecmult_const | 31.2689 | 30.1828 | 32.2569 | 29.9705 | 32.9838 | 27.977 |
ecmult_1p | 24.591 | 23.4126 | 25.1881 | 23.2472 | 25.6367 | 21.8179 |
ecmult_0p_g | 17.4537 | 16.817 | 17.8592 | 16.6341 | 18.4228 | 15.6129 |
ecmult_1p_g | 14.3136 | 13.6362 | 14.5647 | 13.5935 | 15.0641 | 12.8132 |
field_half | 0.00264791 | 0.00252462 | 0.00238327 | 0.00236645 | 0.00232041 | 0.00241018 |
field_normalize | 0.00810289 | 0.00792896 | 0.00805239 | 0.00794354 | 0.00782179 | 0.00812656 |
field_normalize_weak | 0.00299978 | 0.00292424 | 0.00292019 | 0.00294033 | 0.00288756 | 0.00299396 |
field_sqr | 0.0142147 | 0.0139958 | 0.0154915 | 0.013651 | 0.0169165 | 0.0122637 |
field_mul | 0.0173362 | 0.0159168 | 0.0171377 | 0.016962 | 0.0190466 | 0.0147364 |
field_inverse | 1.47347 | 1.44537 | 1.447 | 1.44393 | 1.42127 | 1.69394 |
field_inverse_var | 0.929955 | 0.905406 | 0.90717 | 0.906503 | 0.893355 | 0.930748 |
field_is_square_var | 1.2179 | 1.20598 | 1.20458 | 1.20179 | 1.18497 | 1.21568 |
field_sqrt | 3.94708 | 3.95934 | 4.33316 | 3.78405 | 4.60921 | 3.468 |
gcc -O3 -march=native -mtune=native
implementation | default_asm | default_c | default_c52 | fiat_c | fiat_c_narrow_int | fiat_cryptopt |
---|---|---|---|---|---|---|
ecmult_gen | 16.7701 | 16.1389 | 16.8432 | 16.0805 | 17.706 | 15.2464 |
ecmult_const | 30.9871 | 29.0426 | 31.3515 | 29.4948 | 32.6941 | 27.7546 |
ecmult_1p | 24.7098 | 23.1923 | 24.5407 | 23.0531 | 25.5556 | 21.5714 |
ecmult_0p_g | 17.757 | 16.7931 | 17.6227 | 16.5752 | 18.3798 | 15.6301 |
ecmult_1p_g | 14.3999 | 13.6311 | 14.2793 | 13.4965 | 14.913 | 12.5407 |
field_half | 0.00255814 | 0.00250539 | 0.00235872 | 0.00242583 | 0.00232508 | 0.00246923 |
field_normalize | 0.00709768 | 0.00690524 | 0.00689225 | 0.00685556 | 0.0068617 | 0.00708038 |
field_normalize_weak | 0.00301215 | 0.0029099 | 0.00291935 | 0.00287837 | 0.00288088 | 0.00299388 |
field_sqr | 0.0140762 | 0.012191 | 0.0142837 | 0.0168347 | 0.0147925 | 0.0123209 |
field_mul | 0.017737 | 0.0141453 | 0.0154513 | 0.0162763 | 0.0193284 | 0.0148816 |
field_inverse | 1.45047 | 1.44391 | 1.44917 | 1.41797 | 1.41224 | 1.66926 |
field_inverse_var | 0.896581 | 0.876363 | 0.873395 | 0.863774 | 0.866151 | 0.906099 |
field_is_square_var | 1.25105 | 1.2159 | 1.21284 | 1.19329 | 1.20306 | 1.24462 |
field_sqrt | 3.91907 | 3.51509 | 3.791 | 4.35562 | 4.15031 | 3.43451 |
Just to clarify: is performance with GCC on x86-64 important or not given that there is also an assembly implementation?
Good point. Yeah, if you ask me, I think it's reasonable to care a bit less about the performance of the C code on x86-64 given that we'll have assembly there.
Ok I started a prototype to print generic uint128 code at andres-erbsen@3626fec and kicked off a compile. This exact change is not enough, but if it does the right thing for the handled case, more of the same should be straightforward.
Yep, looks great. As far as I can understand the diff, this is what I had in mind.
Hi everyone, just a quick follow-up question on what the status is here.
If I recall correctly, Andres should be back, and he has created some changes to print uint128 code. What was the result of that? And how would we proceed? Can I help with anything? (I'll have some disruption in access to my arsenal of machines mid-September, if we want to have some benchmarks for that code.)
I am back and I should work on it more. That build never finished, so it'll be a fun debugging session.
master...andres-erbsen:fiat-crypto:uint128-extern generates C code whose compilation only errors on implicit declaration of function 'u128_mul_u64_u64' and so on. No proofs, no testing. I wouldn't be too surprised if it works anyway.
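(A hedged sketch of the declarations such output needs before it compiles; the function names below are the ones that appear later in this thread, and the typedef is only illustrative.)

```c
#include <stdint.h>

typedef struct { uint64_t lo, hi; } fiat_secp256k1_dettman_uint128;   /* illustrative definition */

fiat_secp256k1_dettman_uint128 u128_mul_u64_u64(uint64_t a, uint64_t b);
fiat_secp256k1_dettman_uint128 u128_add_u128_u128(fiat_secp256k1_dettman_uint128 a,
                                                  fiat_secp256k1_dettman_uint128 b);
fiat_secp256k1_dettman_uint128 u128_add_u64_u128(uint64_t a, fiat_secp256k1_dettman_uint128 b);
uint64_t u64_shr_u128(fiat_secp256k1_dettman_uint128 a, unsigned int n);
uint64_t u64_and_u128_u64(fiat_secp256k1_dettman_uint128 a, uint64_t b);
```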
Alright, cool. If I remember correctly, we wanted to use this implementation for field mul in combination with the already verified primitives (u128 ← u64*u64, etc.), right?
I'm trying to incorporate that, and then see what works and what does not.
The first issue I see is that bitcoin's mul is secp256k1_u128_mul(secp256k1_uint128 *r, uint64_t a, uint64_t b), and in Andres' version it is fiat_secp256k1_dettman_uint128 r = u128_mul_u64_u64(uint64_t a, uint64_t b); Could we change that to match bitcoin's?
I thought I could for now just use search and replace to get the result symbol in there as the first argument, but that is a bit too tricky with all those implicit values in e.g. u128_add_u128_u128(&x4, u128_add_u128_u128(u128_mul_u64_u64((arg1[0]), (arg2[3])), u128_add_u128_u128(u128_mul_u64_u64((arg1[1]), (arg2[2])), u128_add_u128_u128(u128_mul_u64_u64((arg1[2]), (arg2[1])), u128_mul_u64_u64((arg1[3]), (arg2[0]))))), u128_mul_u64_u64(x3, UINT64_C(0x1000003d10)));
Then I see that in Bitcoin's version we only have the rshift as an in-place function. Is there any uint128 copy primitive?
(I could think of getting hi and lo with
uint64_t secp256k1_u128_to_u64(const secp256k1_uint128 *a);
uint64_t secp256k1_u128_hi_u64(const secp256k1_uint128 *a);
and then setting a new u128 with
void secp256k1_u128_load(secp256k1_uint128 *r, uint64_t hi, uint64_t lo);
but that seems a bit verbose, and I don't know what the compiler will do.)
Also, I do not see a u64_and_u128_u64(uint64_t *r, u128 a, u64 b). Would
void u64_and_u128_u64(uint64_t *r, const secp256k1_uint128 *a, u64 b){
uint64_t t1 = secp256k1_u128_to_u64(a);
*r = t1 & b;
}
work?
Then I see that in Bitcoin's version we only have the rshift as an in-place function. Is there any uint128 copy primitive?
You can use the assignment operator to copy:
secp256k1_uint128 copy = orig;
Also, I do not see a u64_and_u128_u64(uint64_t *r, u128 a, u64 b). Would
void u64_and_u128_u64(uint64_t *r, const secp256k1_uint128 *a, u64 b){ uint64_t t1 = secp256k1_u128_to_u64(a); *r = t1 & b; }
work?
looks correct to me
How about the following?
secp256k1_uint128 u128_mul_u64_u64(uint64_t a, uint64_t b) {
  secp256k1_uint128 r;
  secp256k1_u128_mul(&r, a, b);
  return r;
}
AFAICT, libsecp256k1 never returns structures from functions.
I'm not entirely sure if this is deliberate or not, but I can add, FWIW, that VST is unable to reason about functions that return structures or take structures as arguments by copy or copying structures by assignment, none of which happen in libsecp256k1 that I am aware of.
VST being https://vst.cs.princeton.edu/, the tool I've been using to reason about C code.
My (fallible) recollection is that the VST limitation originates from a similar limitation CompCert used to have. Given how simple these wrappers are, I think it is fine to not prove anything about them for now. Is there another reason not to pass structs by value?
The derailing of VST would be more than just the wrappers; any function that calls these wrappers also couldn't be reasoned about.
That said, libsecp256k1 didn't conform to this restriction for the sake of VST; it was like that before I even started. Nor do I necessarily expect that libsecp256k1 maintain these restrictions just for the sake of VST, though it might be worth being at least aware of them.
Yeah, I don't think you'd want to use that wrapper anywhere outside fiat-crypto-generated code. Generating struct-by-pointer from the fiat-crypto C backend seems like it could be a rabbithole, so I'd rather not look into that unless there is a specific reason. As discussed above, we already have uint64s-by-pointer code and now the struct-by-value branch here. Adding struct-by-pointer support would be easier (and have proofs) in the bedrock2-based backend, but that's not mature enough yet for me to want to propose its use here.
Okay, I've got some polyfills:
inline secp256k1_uint128 u128_mul_u64_u64(uint64_t a, uint64_t b) {
  secp256k1_uint128 r;
  secp256k1_u128_mul(&r, a, b);
  return r;
}
inline uint64_t u64_shr_u128(secp256k1_uint128 a, unsigned int n) {
  secp256k1_uint128 r = a;
  secp256k1_u128_rshift(&r, n);
  return secp256k1_u128_to_u64(&r);
}
inline uint64_t u64_and_u128_u64(secp256k1_uint128 a, uint64_t b) {
  return secp256k1_u128_to_u64(&a) & b;
}
inline secp256k1_uint128 u128_add_u64_u128(uint64_t a, secp256k1_uint128 b) {
  secp256k1_uint128 r = b;
  secp256k1_u128_accum_u64(&r, a);
  return r;
}
inline secp256k1_uint128 u128_add_u128_u128(secp256k1_uint128 a, secp256k1_uint128 b) {
  secp256k1_uint128 r = a;
  // adding low b
  secp256k1_u128_accum_u64(&r, secp256k1_u128_to_u64(&b));
  // adding high b
  // ???
  return r;
}
But I am missing something to add two u128s: u128_add_u128_u128. I can only see secp256k1_u128_accum_u64 to add the low 64 bits. Am I missing something?
u128_add_u128_u128 is never performed in the current libsecp256k1 implementation.
Edit: I stand corrected.
Here's how libsecp256k1 adds uint128
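(For reference, a hedged reconstruction of a by-value u128 addition on top of that interface; not the exact code behind the link.)

```c
static inline secp256k1_uint128 u128_add_u128_u128(secp256k1_uint128 a, secp256k1_uint128 b) {
    uint64_t alo = secp256k1_u128_to_u64(&a), ahi = secp256k1_u128_hi_u64(&a);
    uint64_t blo = secp256k1_u128_to_u64(&b), bhi = secp256k1_u128_hi_u64(&b);
    uint64_t lo = alo + blo;
    uint64_t carry = lo < alo;                       /* carry out of the low limb */
    secp256k1_uint128 r;
    secp256k1_u128_load(&r, ahi + bhi + carry, lo);  /* load takes hi, then lo */
    return r;
}
```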
I've done it in https://github.com/dderjoel/secp256k1/blob/only-c/src/field_5x52_int128_impl.h#L60-L82 and updated the PR bitcoin-core/secp256k1#1319. Even all the tests run smoothly.