分布式并行的mpi程序的cache优化有一个bug
InmessionanteJR opened this issue · 4 comments
InmessionanteJR commented
建议在marked = (char *) malloc(Block_size)
这句之前先判断:if(Block_num==0) Block_size=size
原因:您的测试用例用的是p=8,n=1e8,L3_cache=6e6。由于这个测试用例中每个进程所需处理的数组分成L3_cache大小的block后,超过了1个block,所以没有问题。
但是如果用小一点的n,比如1e4,此时每个进程只需用一个block,理应给marked数组分配(1e4/2-1)/8大小的空间,但是用malloc分配的marked数组还是用L3_cache/4/p计算出的,显然有问题。
543877815 commented
嗯....我依稀记得当时跑数字较小的程序的时候确实有bug没有解决
543877815 commented
但是现在我电脑上已经没有环境了
543877815 commented
所以,我决定把这个issue放在这里让大家也能看到
InmessionanteJR commented
这个代码应该是没有问题的:
#include <mpi.h>
#include <math.h>
#include <stdio.h>
int main(int argc, char *argv[]) {
int count; /* Local prime count */
double elapsed_time; /* Parallel execution time */
int first; /* Index of first multiple */
int global_count; /* Global prime count */
char *marked; /* Portion of 2,...,'n' */
int id; /* Process ID number */
int index; /* Index of current prime */
int low_index; /* Lowest index on this proc */
int low_value; /* Lowest value on this proc */
int high_index; /* Highest index on this proc */
int high_value; /* Highest value on this proc */
int n; /* Sieving from 2, ..., 'n' */
int p; /* Number of processes */
int proc0_size; /* Size of proc 0's subarray */
int prime; /* Current prime */
int size; /* Elements in 'marked' */
char *sub_marked; /* sub_mark array */
int sub_low_index; /* Lowest index on sub_array */
int sub_low_value; /* Lowest array on sub_array */
int sub_high_index; /* Highest index on sub_array */
int sub_high_value; /* Highest index on sub_array */
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &id);
MPI_Comm_size(MPI_COMM_WORLD, &p);
MPI_Barrier(MPI_COMM_WORLD);
if (argc != 3) {
if (!id) printf("Command line: %s <m> \n", argv[0]);
MPI_Finalize();
exit(1);
}
n = atoi(argv[1]);
// Bail out if all the primes used for sieving are not all held by process 0
proc0_size = (n - 1) / p;
if ((2 + proc0_size) < (int) sqrt((double) n)) {
if (!id) printf("Too many processes \n");
MPI_Finalize();
exit(1);
}
elapsed_time = -MPI_Wtime(); //开始计时的位置也是有讲究的
// step 1: build 'sub_marked' array
int sub_n = (int) sqrt((double) n);
int sub_N = (sub_n - 1) / 2;
sub_low_index = 0;
sub_high_index = sub_N / p - 1;
sub_low_value = sub_low_index * 2 + 3;
sub_high_value = (sub_high_index + 1) * 2 + 1;
sub_marked = (char *) malloc(sub_n);
if (sub_marked == NULL) {
printf("Cannot allocate enough memory \n");
MPI_Finalize();
exit(1);
}
for (int i = 0; i < sub_n; i++) sub_marked[i] = 0;
index = 0;
prime = 3;
do {
first = (prime * prime - sub_low_value) / 2;
for (int i = first; i < sub_n; i += prime) {
sub_marked[i] = 1;
}
while (sub_marked[++index]);
prime = index * 2 + 3;
} while (prime * prime <= sub_n);
// step 2: build 'marked' array with blocks
int N = (n - 1) / 2;
low_index = id * N / p;
high_index = (id + 1) * N / p - 1;
low_value = low_index * 2 + 3;
high_value = (high_index + 1) * 2 + 1;
size = (high_value - low_value) / 2 + 1;
int CACHE_size = atoi(argv[2]);
int CACHE_int = CACHE_size / 4;
int Block_size = CACHE_int / p ;
int Block_num = size / Block_size;
int Block_remain = size % Block_size;
int Block_id = 0;
int Block_N = Block_size - 1;
int Block_low_index = Block_id * Block_N + low_index;
int Block_high_index = (Block_id + 1) * Block_N - 1 + low_index;
int Block_low_value = Block_low_index * 2 + 3;
int Block_high_value = (Block_high_index + 1) * 2 + 1;
int Block_count;
// allocate this process 's share of the array
if(Block_num==0) Block_size=size;
marked = (char *) malloc(Block_size);
if (marked == NULL) {
printf("Cannot allocate enough memory \n");
MPI_Finalize();
exit(1);
}
count = 0;
while (Block_id <= Block_num) {
index = 0;
prime = 3;
Block_count = 0;
for (int i = 0; i < Block_size; i++) marked[i] = 0;
do {
if (prime * prime > Block_low_value) {
first = (prime * prime - Block_low_value) / 2;
} else {
if (!(Block_low_value % prime)) first = 0;
else if (Block_low_value % prime % 2 == 0) first = prime - ((Block_low_value % prime) / 2);
else first = (prime - (Block_low_value % prime)) / 2;
}
for (int i = first; i < Block_size; i += prime) {
marked[i] = 1;
}
while (sub_marked[++index]);
prime = index * 2 + 3;
} while (prime * prime <= Block_high_value);
for (int i = 0; i < Block_size; i++) {
if (marked[i] == 0) {
Block_count++;
}
}
count += Block_count;
Block_id++;
Block_low_index = Block_id * Block_N + Block_id + low_index;
Block_high_index = (Block_id + 1) * Block_N + (Block_id + 1)- 1 + low_index;
Block_low_value = Block_low_index * 2 + 3;
if (Block_id == Block_num) {
Block_high_value = high_value;
Block_high_index = high_index;
Block_size = (Block_high_value - Block_low_value) / 2 + 1;
} else {
Block_high_value = (Block_high_index + 1) * 2 + 1;
}
}
MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
elapsed_time += MPI_Wtime();
MPI_Finalize();
if (!id) {
printf("There are %d primes less than or equal to %d \n", global_count + 1, n);
printf("SIEVE (%d) %10.6f\n", p, elapsed_time);
printf("\n");
}
return 0;
}