samkaufman/morello

Constants in generated code

Closed this issue · 3 comments

As I was working on the benchmarking code generation feature, I realized that running main.rs with matmul 2 produced 2 and 4 as magic numbers in the generated code, which might be confusing. To improve the readability and maintainability of the codegen logic, I believe it would be beneficial to define a constant for the size as the code snippet below.

@samkaufman What do you think?

No Constants:

#include <inttypes.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <arm_neon.h>

struct timespec ts_diff(struct timespec start, struct timespec end)
{
    struct timespec temp;
    if ((end.tv_nsec - start.tv_nsec) < 0)
    {
        temp.tv_sec = end.tv_sec - start.tv_sec - 1;
        temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
    }
    else
    {
        temp.tv_sec = end.tv_sec - start.tv_sec;
        temp.tv_nsec = end.tv_nsec - start.tv_nsec;
    }
    return temp;
}

__attribute__((noinline))
void kernel(
        uint32_t *restrict aa,
        uint32_t *restrict ab,
        uint32_t *restrict ac
) {
    uint32_t ad[4] __attribute__((aligned (128)));
    memset((void *)(&ad[0]), 0, 16);
    uint32_t ae[4] __attribute__((aligned (128)));
    for (int af = 0; af < 2; af++) {
        for (int ag = 0; ag < 2; ag++) {
            ae[af + 2 * ag] = ab[af + 2 * ag];
        }
    }
    uint32_t ah[4] __attribute__((aligned (128)));
    for (int ai = 0; ai < 2; ai++) {
        for (int aj = 0; aj < 2; aj++) {
            ah[ai + 2 * aj] = aa[ai + 2 * aj];
        }
    }
    for (int ak = 0; ak < 2; ak++) {
        for (int al = 0; al < 2; al++) {
            for (int am = 0; am < 2; am++) {
                ad[al + 2 * am] += ah[ak + 2 * am] * ae[2 * ak + al];  /* Mult */
            }
        }
    }
    for (int an = 0; an < 2; an++) {
        for (int ao = 0; ao < 2; ao++) {
            ac[an + 2 * ao] = ad[an + 2 * ao];
        }
    }
}

int main() {
    uint32_t *restrict ap;
    posix_memalign((void **)&ap, 128, 4*sizeof(uint32_t));
    for (size_t idx = 0; idx < 4; idx++) {
        ap[idx] = (uint32_t)rand();
    }
    uint32_t *restrict aq;
    posix_memalign((void **)&aq, 128, 4*sizeof(uint32_t));
    for (size_t idx = 0; idx < 4; idx++) {
        aq[idx] = (uint32_t)rand();
    }
    uint32_t *restrict ar;
    posix_memalign((void **)&ar, 128, 4*sizeof(uint32_t));
    for (size_t idx = 0; idx < 4; idx++) {
        ar[idx] = (uint32_t)rand();
    }

    // Inlined kernel follows. This is for warm-up.
    kernel(ap + 0, aq + 0, ar + 0);

    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);
#pragma clang loop unroll(disable)
    for (unsigned long bench_itr = 0; bench_itr < 10; ++bench_itr) {
        kernel(ap + 0, aq + 0, ar + 0);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    struct timespec delta = ts_diff(start, end);
    printf("cpu: %llds %lldns\n", (long long)delta.tv_sec, (long long)delta.tv_nsec);

    free(ap);
    free(aq);
    free(ar);

    return 0;
}

With Constants:

#include <inttypes.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <arm_neon.h>

#define SIZE 2

struct timespec ts_diff(struct timespec start, struct timespec end)
{
    struct timespec temp;
    if ((end.tv_nsec - start.tv_nsec) < 0)
    {
        temp.tv_sec = end.tv_sec - start.tv_sec - 1;
        temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
    }
    else
    {
        temp.tv_sec = end.tv_sec - start.tv_sec;
        temp.tv_nsec = end.tv_nsec - start.tv_nsec;
    }
    return temp;
}

__attribute__((noinline))
void kernel(
        uint32_t *restrict aa,
        uint32_t *restrict ab,
        uint32_t *restrict ac
) {
    uint32_t ad[SIZE*SIZE] __attribute__((aligned (128)));
    memset((void *)(&ad[0]), 0, 16);
    uint32_t ae[SIZE*SIZE] __attribute__((aligned (128)));
    for (int af = 0; af < SIZE; af++) {
        for (int ag = 0; ag < SIZE; ag++) {
            ae[af + SIZE * ag] = ab[af + SIZE * ag];
        }
    }
    uint32_t ah[SIZE*SIZE] __attribute__((aligned (128)));
    for (int ai = 0; ai < SIZE; ai++) {
        for (int aj = 0; aj < SIZE; aj++) {
            ah[ai + SIZE * aj] = aa[ai + SIZE * aj];
        }
    }
    for (int ak = 0; ak < SIZE; ak++) {
        for (int al = 0; al < SIZE; al++) {
            for (int am = 0; am < SIZE; am++) {
                ad[al + SIZE * am] += ah[ak + SIZE * am] * ae[SIZE * ak + al];  /* Mult */
            }
        }
    }
    for (int an = 0; an < SIZE; an++) {
        for (int ao = 0; ao < SIZE; ao++) {
            ac[an + SIZE * ao] = ad[an + SIZE * ao];
        }
    }
}

int main() {
    uint32_t *restrict ap;
    posix_memalign((void **)&ap, 128, SIZE*SIZE*sizeof(uint32_t));
    for (size_t idx = 0; idx < SIZE*SIZE; idx++) {
        ap[idx] = (uint32_t)rand();
    }
    uint32_t *restrict aq;
    posix_memalign((void **)&aq, 128, SIZE*SIZE*sizeof(uint32_t));
    for (size_t idx = 0; idx < SIZE*SIZE; idx++) {
        aq[idx] = (uint32_t)rand();
    }
    uint32_t *restrict ar;
    posix_memalign((void **)&ar, 128, SIZE*SIZE*sizeof(uint32_t));
    for (size_t idx = 0; idx < SIZE*SIZE; idx++) {
        ar[idx] = (uint32_t)rand();
    }

    // Inlined kernel follows. This is for warm-up.
    kernel(ap + 0, aq + 0, ar + 0);

    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);
#pragma clang loop unroll(disable)
    for (unsigned long bench_itr = 0; bench_itr < 10; ++bench_itr) {
        kernel(ap + 0, aq + 0, ar + 0);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    struct timespec delta = ts_diff(start, end);
    printf("cpu: %llds %lldns\n", (long long)delta.tv_sec, (long long)delta.tv_nsec);

    free(ap);
    free(aq);
    free(ar);

    return 0;
}

How would you generate constants for all dimensions in, say, a composition of two convolutional layers? How would you name them?

How would you generate constants for all dimensions in, say, a composition of two convolutional layers? How would you name them?

@samkaufman I apologize if my suggestion was not helpful. Is it feasible to only produce reasonable constants (e.g., what I showed in my previous example)? I am still studying your code and there are many that I do not comprehend yet. This issue pertains to readability and maintainability, so it is not necessary to prioritize it. I also wanted to inquire about the possibility of this issue since I am unsure how to implement my suggestion. If you believe it is not significant, please feel free to close this issue. I am sorry for any inconvenience.

Closing as stale.