Constants in generated code
Closed this issue · 3 comments
As I was working on the benchmarking code generation feature, I realized that running main.rs
with matmul 2
produced 2
and 4
as magic numbers in the generated code, which might be confusing. To improve the readability and maintainability of the codegen logic, I believe it would be beneficial to define a constant for the size as the code snippet below.
@samkaufman What do you think?
No Constants:
#include <inttypes.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <arm_neon.h>
struct timespec ts_diff(struct timespec start, struct timespec end)
{
struct timespec temp;
if ((end.tv_nsec - start.tv_nsec) < 0)
{
temp.tv_sec = end.tv_sec - start.tv_sec - 1;
temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
}
else
{
temp.tv_sec = end.tv_sec - start.tv_sec;
temp.tv_nsec = end.tv_nsec - start.tv_nsec;
}
return temp;
}
__attribute__((noinline))
void kernel(
uint32_t *restrict aa,
uint32_t *restrict ab,
uint32_t *restrict ac
) {
uint32_t ad[4] __attribute__((aligned (128)));
memset((void *)(&ad[0]), 0, 16);
uint32_t ae[4] __attribute__((aligned (128)));
for (int af = 0; af < 2; af++) {
for (int ag = 0; ag < 2; ag++) {
ae[af + 2 * ag] = ab[af + 2 * ag];
}
}
uint32_t ah[4] __attribute__((aligned (128)));
for (int ai = 0; ai < 2; ai++) {
for (int aj = 0; aj < 2; aj++) {
ah[ai + 2 * aj] = aa[ai + 2 * aj];
}
}
for (int ak = 0; ak < 2; ak++) {
for (int al = 0; al < 2; al++) {
for (int am = 0; am < 2; am++) {
ad[al + 2 * am] += ah[ak + 2 * am] * ae[2 * ak + al]; /* Mult */
}
}
}
for (int an = 0; an < 2; an++) {
for (int ao = 0; ao < 2; ao++) {
ac[an + 2 * ao] = ad[an + 2 * ao];
}
}
}
int main() {
uint32_t *restrict ap;
posix_memalign((void **)&ap, 128, 4*sizeof(uint32_t));
for (size_t idx = 0; idx < 4; idx++) {
ap[idx] = (uint32_t)rand();
}
uint32_t *restrict aq;
posix_memalign((void **)&aq, 128, 4*sizeof(uint32_t));
for (size_t idx = 0; idx < 4; idx++) {
aq[idx] = (uint32_t)rand();
}
uint32_t *restrict ar;
posix_memalign((void **)&ar, 128, 4*sizeof(uint32_t));
for (size_t idx = 0; idx < 4; idx++) {
ar[idx] = (uint32_t)rand();
}
// Inlined kernel follows. This is for warm-up.
kernel(ap + 0, aq + 0, ar + 0);
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC, &start);
#pragma clang loop unroll(disable)
for (unsigned long bench_itr = 0; bench_itr < 10; ++bench_itr) {
kernel(ap + 0, aq + 0, ar + 0);
}
clock_gettime(CLOCK_MONOTONIC, &end);
struct timespec delta = ts_diff(start, end);
printf("cpu: %llds %lldns\n", (long long)delta.tv_sec, (long long)delta.tv_nsec);
free(ap);
free(aq);
free(ar);
return 0;
}
With Constants:
#include <inttypes.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <arm_neon.h>
#define SIZE 2
struct timespec ts_diff(struct timespec start, struct timespec end)
{
struct timespec temp;
if ((end.tv_nsec - start.tv_nsec) < 0)
{
temp.tv_sec = end.tv_sec - start.tv_sec - 1;
temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
}
else
{
temp.tv_sec = end.tv_sec - start.tv_sec;
temp.tv_nsec = end.tv_nsec - start.tv_nsec;
}
return temp;
}
__attribute__((noinline))
void kernel(
uint32_t *restrict aa,
uint32_t *restrict ab,
uint32_t *restrict ac
) {
uint32_t ad[SIZE*SIZE] __attribute__((aligned (128)));
memset((void *)(&ad[0]), 0, 16);
uint32_t ae[SIZE*SIZE] __attribute__((aligned (128)));
for (int af = 0; af < SIZE; af++) {
for (int ag = 0; ag < SIZE; ag++) {
ae[af + SIZE * ag] = ab[af + SIZE * ag];
}
}
uint32_t ah[SIZE*SIZE] __attribute__((aligned (128)));
for (int ai = 0; ai < SIZE; ai++) {
for (int aj = 0; aj < SIZE; aj++) {
ah[ai + SIZE * aj] = aa[ai + SIZE * aj];
}
}
for (int ak = 0; ak < SIZE; ak++) {
for (int al = 0; al < SIZE; al++) {
for (int am = 0; am < SIZE; am++) {
ad[al + SIZE * am] += ah[ak + SIZE * am] * ae[SIZE * ak + al]; /* Mult */
}
}
}
for (int an = 0; an < SIZE; an++) {
for (int ao = 0; ao < SIZE; ao++) {
ac[an + SIZE * ao] = ad[an + SIZE * ao];
}
}
}
int main() {
uint32_t *restrict ap;
posix_memalign((void **)&ap, 128, SIZE*SIZE*sizeof(uint32_t));
for (size_t idx = 0; idx < SIZE*SIZE; idx++) {
ap[idx] = (uint32_t)rand();
}
uint32_t *restrict aq;
posix_memalign((void **)&aq, 128, SIZE*SIZE*sizeof(uint32_t));
for (size_t idx = 0; idx < SIZE*SIZE; idx++) {
aq[idx] = (uint32_t)rand();
}
uint32_t *restrict ar;
posix_memalign((void **)&ar, 128, SIZE*SIZE*sizeof(uint32_t));
for (size_t idx = 0; idx < SIZE*SIZE; idx++) {
ar[idx] = (uint32_t)rand();
}
// Inlined kernel follows. This is for warm-up.
kernel(ap + 0, aq + 0, ar + 0);
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC, &start);
#pragma clang loop unroll(disable)
for (unsigned long bench_itr = 0; bench_itr < 10; ++bench_itr) {
kernel(ap + 0, aq + 0, ar + 0);
}
clock_gettime(CLOCK_MONOTONIC, &end);
struct timespec delta = ts_diff(start, end);
printf("cpu: %llds %lldns\n", (long long)delta.tv_sec, (long long)delta.tv_nsec);
free(ap);
free(aq);
free(ar);
return 0;
}
How would you generate constants for all dimensions in, say, a composition of two convolutional layers? How would you name them?
How would you generate constants for all dimensions in, say, a composition of two convolutional layers? How would you name them?
@samkaufman I apologize if my suggestion was not helpful. Is it feasible to only produce reasonable constants (e.g., what I showed in my previous example)? I am still studying your code and there are many that I do not comprehend yet. This issue pertains to readability and maintainability, so it is not necessary to prioritize it. I also wanted to inquire about the possibility of this issue since I am unsure how to implement my suggestion. If you believe it is not significant, please feel free to close this issue. I am sorry for any inconvenience.
Closing as stale.