ashvardanian/less_slow.cpp

Inline Assembly version of a Pipeline

ashvardanian opened this issue · 0 comments

We can rewrite the whole "pipeline" as a single C 99 version:

void pipeline_kernel_c99(unsigned long start, unsigned long end, unsigned long *sum_out, unsigned long *count_out) {
    unsigned long sum = 0;
    unsigned long count = 0;
    unsigned long max_power_of_three = 12157665459056928801ull;

    for (unsigned long value = start; value <= end; ++value) {
        // Check if the value is a power of two
        if (value > 0 && (value & (value - 1)) == 0) continue;
        // Check if the value is a power of three
        if (value > 0 && max_power_of_three % value == 0) continue;

        // Prime factorization
        unsigned long n = value;
        unsigned long factor = 2;

        // Handle factor 2 separately
        while (n % 2 == 0) {
            sum += 2;
            count += 1;
            n /= 2;
        }

        // Handle odd factors
        factor = 3;
        while (factor * factor <= n) {
            while (n % factor == 0) {
                sum += factor;
                count += 1;
                n /= factor;
            }
            factor += 2;
        }

        // If n is still greater than 1, it is a prime factor
        if (n > 1) {
            sum += n;
            count += 1;
        }
    }

    *sum_out = sum;
    *count_out = count;
}

GCC 14.2 produces the following Assembly:

pipeline_kernel_c99(unsigned long, unsigned long, unsigned long*, unsigned long*):
        push    rbp
        mov     rbp, rcx
        push    rbx
        mov     rbx, rdx
        cmp     rsi, rdi
        jb      .L15
        mov     r9, rdi
        mov     r10, rsi
        xor     edi, edi
        xor     r8d, r8d
        movabs  r11, -6289078614652622815
        jmp     .L14
.L7:
        add     r9, 1
        cmp     r10, r9
        jb      .L2
.L14:
        xor     ecx, ecx
        test    r9, r9
        je      .L9
        lea     rax, [r9-1]
        test    rax, r9
        je      .L7
        mov     rax, r11
        xor     edx, edx
        div     r9
        test    rdx, rdx
        je      .L7
        mov     rcx, r9
        test    r9b, 1
        jne     .L39
.L9:
        shr     rcx
        add     r8, 2
        add     rdi, 1
        test    cl, 1
        je      .L9
        cmp     rcx, 8
        jbe     .L10
.L5:
        mov     esi, 3
.L11:
        mov     rax, rcx
        xor     edx, edx
        div     rsi
        test    rdx, rdx
        jne     .L13
.L12:
        mov     rax, rcx
        xor     edx, edx
        add     r8, rsi
        add     rdi, 1
        div     rsi
        xor     edx, edx
        mov     rcx, rax
        div     rsi
        test    rdx, rdx
        je      .L12
.L13:
        add     rsi, 2
        mov     rax, rsi
        imul    rax, rsi
        cmp     rcx, rax
        jnb     .L11
.L10:
        cmp     rcx, 1
        je      .L7
.L6:
        add     r9, 1
        add     r8, rcx
        add     rdi, 1
        cmp     r10, r9
        jnb     .L14
.L2:
        mov     QWORD PTR [rbx], r8
        mov     QWORD PTR [rbp+0], rdi
        pop     rbx
        pop     rbp
        ret
.L39:
        cmp     r9, 8
        ja      .L5
        jmp     .L6
.L15:
        xor     edi, edi
        xor     r8d, r8d
        mov     QWORD PTR [rbx], r8
        mov     QWORD PTR [rbp+0], rdi
        pop     rbx
        pop     rbp
        ret

It seems like this can be further optimized. Who's up for the task?