543877815/uestc_Internet_plus_course_project

The cache optimization in the distributed-parallel MPI program has a bug

InmessionanteJR opened this issue · 4 comments

Suggestion: before the line marked = (char *) malloc(Block_size), add a check: if (Block_num == 0) Block_size = size;

Reason: your test case uses p=8, n=1e8, L3_cache=6e6. With those numbers, the portion of the array each process handles splits into more than one L3_cache-sized block, so nothing goes wrong.

But with a smaller n, say 1e4, each process needs only a single block. The marked array should then be allocated roughly (1e4/2 - 1)/8 elements, yet the size passed to malloc is still computed as L3_cache/4/p, which is clearly wrong.
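To make the numbers concrete: with p = 8 and n = 1e4, each process owns size = ((n - 1) / 2) / p ≈ 624 odd candidates, while Block_size = L3_cache / 4 / p = 6e6 / 4 / 8 = 187500. So Block_num = size / Block_size = 0: the process needs only a fraction of one block, yet the allocation is still sized from the cache.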

Hmm... I vaguely remember there was indeed an unresolved bug back then when running with smaller numbers.

But I no longer have the environment set up on my machine.

So I've decided to leave this issue here so everyone can see it.

This version of the code should be correct:

#include <mpi.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>     /* malloc, atoi, exit */


int main(int argc, char *argv[]) {
    int count;              /* Local prime count */
    double elapsed_time;    /* Parallel execution time */
    int first;              /* Index of first multiple */
    int global_count;       /* Global prime count */
    char *marked;           /* Portion of 2,...,'n' */
    int id;                 /* Process ID number */
    int index;              /* Index of current prime */
    int low_index;          /* Lowest index on this proc */
    int low_value;          /* Lowest value on this proc */
    int high_index;         /* Highest index on this proc */
    int high_value;         /* Highest value on this proc */
    int n;                  /* Sieving from 2, ..., 'n' */
    int p;                  /* Number of processes */
    int proc0_size;         /* Size of proc 0's subarray */
    int prime;              /* Current prime */
    int size;               /* Elements in 'marked' */
    char *sub_marked;       /* Sieve of odd values up to sqrt(n) */
    int sub_low_index;      /* Lowest index in sub array */
    int sub_low_value;      /* Lowest value in sub array */
    int sub_high_index;     /* Highest index in sub array */
    int sub_high_value;     /* Highest value in sub array */



    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &id);

    MPI_Comm_size(MPI_COMM_WORLD, &p);

    MPI_Barrier(MPI_COMM_WORLD);


    if (argc != 3) {
        if (!id) printf("Command line: %s <n> <cache_size>\n", argv[0]);
        MPI_Finalize();
        exit(1);
    }

    n = atoi(argv[1]);


    // Bail out if process 0 does not hold all the primes needed for sieving
    proc0_size = (n - 1) / p;

    if ((2 + proc0_size) < (int) sqrt((double) n)) {
        if (!id) printf("Too many processes \n");
        MPI_Finalize();
        exit(1);
    }

    elapsed_time = -MPI_Wtime(); // where the timer starts also matters

    // step 1: build 'sub_marked' array
    int sub_n = (int) sqrt((double) n);
    int sub_N = (sub_n - 1) / 2;
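    /* in every sieve array below, index i stands for the odd value 2*i + 3,
       so only the odd candidates 3, 5, 7, ... are stored */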

    /* every process builds the full sub-sieve, covering indices 0 .. sub_N - 1 */
    sub_low_index = 0;
    sub_high_index = sub_N - 1;

    sub_low_value = sub_low_index * 2 + 3;
    sub_high_value = (sub_high_index + 1) * 2 + 1;

    sub_marked = (char *) malloc(sub_n);
    if (sub_marked == NULL) {
        printf("Cannot allocate enough memory \n");
        MPI_Finalize();
        exit(1);
    }

    for (int i = 0; i < sub_n; i++) sub_marked[i] = 0;

    index = 0;

    prime = 3;
    do {
        /* prime * prime is the first value not yet marked by a smaller
           prime; stepping the index by 'prime' skips the even multiples */
        first = (prime * prime - sub_low_value) / 2;

        for (int i = first; i < sub_n; i += prime) {
            sub_marked[i] = 1;
        }
        /* advance to the next unmarked entry, i.e. the next prime */
        while (sub_marked[++index]);
        prime = index * 2 + 3;
    } while (prime * prime <= sub_n);

    // step 2: build 'marked' array with blocks
    int N = (n - 1) / 2;
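    /* the N odd candidates are split evenly: process 'id' owns indices
       low_index .. high_index, i.e. odd values low_value .. high_value */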
    low_index = id * N / p;            
    high_index = (id + 1) * N / p - 1; 

    low_value = low_index * 2 + 3; 
    high_value = (high_index + 1) * 2 + 1;
    size = (high_value - low_value) / 2 + 1;   

    int CACHE_size = atoi(argv[2]);        /* L3 cache size in bytes */

    int CACHE_int = CACHE_size / 4;        /* cache size in 4-byte ints */

    int Block_size = CACHE_int / p;        /* elements per cache-sized block */
    int Block_num = size / Block_size;     /* number of full blocks */
    int Block_remain = size % Block_size;  /* leftover elements for the last pass */


    int Block_id = 0;
    int Block_N = Block_size - 1;          /* Block_N + 1 elements per full block */

    /* bounds of block 0, kept consistent with the per-block update below */
    int Block_low_index = Block_id * (Block_N + 1) + low_index;
    int Block_high_index = (Block_id + 1) * (Block_N + 1) - 1 + low_index;
    int Block_low_value = Block_low_index * 2 + 3;
    int Block_high_value = (Block_high_index + 1) * 2 + 1;
    int Block_count;

    // allocate this process's share of the array
    if (Block_num == 0) {
        /* the whole share fits in a single block, so size the block and
           its bounds from the share itself rather than from the cache */
        Block_size = size;
        Block_high_index = high_index;
        Block_high_value = high_value;
    }
    marked = (char *) malloc(Block_size);
    if (marked == NULL) {
        printf("Cannot allocate enough memory \n");
        MPI_Finalize();
        exit(1);
    }

    count = 0;

    /* sieve this process's share block by block */
    while (Block_id <= Block_num) {

        index = 0;

        prime = 3;

        Block_count = 0;

        for (int i = 0; i < Block_size; i++) marked[i] = 0;

        /* cross out composites in this block, taking primes from sub_marked */
        do {
            /* locate the first odd multiple of 'prime' inside this block */
            if (prime * prime > Block_low_value) {
                first = (prime * prime - Block_low_value) / 2;
            } else {
                if (!(Block_low_value % prime)) first = 0;
                else if (Block_low_value % prime % 2 == 0) first = prime - ((Block_low_value % prime) / 2);
                else first = (prime - (Block_low_value % prime)) / 2;
            }

            for (int i = first; i < Block_size; i += prime) {
                marked[i] = 1;
            }

            /* pull the next prime out of the precomputed sub-sieve */
            while (sub_marked[++index]);

            prime = index * 2 + 3;

        } while (prime * prime <= Block_high_value);


        /* unmarked entries in this block are primes */
        for (int i = 0; i < Block_size; i++) {
            if (marked[i] == 0) {
                Block_count++;
            }
        }

        count += Block_count;

        /* advance to the next block */
        Block_id++;
        Block_low_index = Block_id * (Block_N + 1) + low_index;
        Block_high_index = (Block_id + 1) * (Block_N + 1) - 1 + low_index;

        Block_low_value = Block_low_index * 2 + 3;
        if (Block_id == Block_num) {
            /* final pass covers only the Block_remain leftover elements */
            Block_high_value = high_value;
            Block_high_index = high_index;
            Block_size = (Block_high_value - Block_low_value) / 2 + 1;
        } else {
            Block_high_value = (Block_high_index + 1) * 2 + 1;
        }

    }

    MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    elapsed_time += MPI_Wtime();

    free(marked);
    free(sub_marked);
    MPI_Finalize();

    if (!id) {
        printf("There are %d primes less than or equal to %d \n", global_count + 1, n);
        printf("SIEVE (%d) %10.6f\n", p, elapsed_time);
        printf("\n");
    }
    return 0;
}
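For anyone reproducing this: a typical build and run, assuming an MPI toolchain that provides mpicc and mpirun (the two arguments are n and the L3 cache size in bytes, matching the p=8, n=1e8, L3=6e6 test case above):

mpicc -o sieve sieve.c -lm
mpirun -np 8 ./sieve 100000000 6000000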