I have the following implementation:
#include <mpi.h>

#include <cstdlib>
#include <iostream>
#include <limits>
#include <vector>

// Results and forest_fire are defined in the full code (linked below).

int main(int argc, char **argv)
{
    int n_runs = 100; // Number of runs per probability
    int seed = 1;
    int arraySize = 400;

    // Initialise the random number generator using a fixed seed for reproducibility.
    srand(seed);

    MPI_Init(nullptr, nullptr);
    int rank, n_procs;
    MPI_Comm_size(MPI_COMM_WORLD, &n_procs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // Initialise the probability step and results vectors.
    // We have 21 probabilities between 0 and 1 (inclusive).
    double prob_step = 0.05;
    std::vector<double> avg_steps_over_p(21, 0);
    std::vector<double> trans_avg_steps_over_p(21, 0);
    std::vector<int> min_steps_over_p(21, 0);
    std::vector<int> trans_min_steps_over_p(21, 0);
    std::vector<int> max_steps_over_p(21, 0);
    std::vector<int> trans_max_steps_over_p(21, 0);
    std::vector<double> prob_reached_end(21, 0);
    std::vector<double> trans_prob_reached_end(21, 0);

    // Loop over probabilities (round-robin across ranks) and compute the number of
    // steps before the model burns out, averaged over n_runs.
    for (int i = rank; i < 21; i += n_procs)
    {
        double prob = i * prob_step;
        int min_steps = std::numeric_limits<int>::max();
        int max_steps = 0;
        for (int i_run = 0; i_run < n_runs; ++i_run)
        {
            Results result = forest_fire(arraySize, prob);
            avg_steps_over_p[i] += result.stepCount;
            if (result.fireReachedEnd) ++prob_reached_end[i];
            if (result.stepCount < min_steps) min_steps = result.stepCount;
            if (result.stepCount > max_steps) max_steps = result.stepCount;
        }
        avg_steps_over_p[i] /= n_runs;
        min_steps_over_p[i] = min_steps;
        max_steps_over_p[i] = max_steps;
        prob_reached_end[i] /= n_runs;
    }

    // Worker processes communicate their results to the master process.
    if (rank > 0)
    {
        MPI_Send(&avg_steps_over_p[0], 21, MPI_DOUBLE, 0, rank, MPI_COMM_WORLD);
        MPI_Send(&min_steps_over_p[0], 21, MPI_INT, 0, rank, MPI_COMM_WORLD);
        MPI_Send(&max_steps_over_p[0], 21, MPI_INT, 0, rank, MPI_COMM_WORLD);
        MPI_Send(&prob_reached_end[0], 21, MPI_DOUBLE, 0, rank, MPI_COMM_WORLD);
    }
    else
    {
        for (int i = 1; i < n_procs; ++i)
        {
            MPI_Status status;
            MPI_Recv(&trans_avg_steps_over_p[0], 21, MPI_DOUBLE, i, i, MPI_COMM_WORLD, &status);
            for (int j = i; j < 21; j += n_procs) {
                avg_steps_over_p[j] = trans_avg_steps_over_p[j];
            }
            MPI_Recv(&trans_min_steps_over_p[0], 21, MPI_INT, i, i, MPI_COMM_WORLD, &status);
            for (int j = i; j < 21; j += n_procs) {
                min_steps_over_p[j] = trans_min_steps_over_p[j];
            }
            MPI_Recv(&trans_max_steps_over_p[0], 21, MPI_INT, i, i, MPI_COMM_WORLD, &status);
            for (int j = i; j < 21; j += n_procs) {
                max_steps_over_p[j] = trans_max_steps_over_p[j];
            }
            MPI_Recv(&trans_prob_reached_end[0], 21, MPI_DOUBLE, i, i, MPI_COMM_WORLD, &status);
            for (int j = i; j < 21; j += n_procs) {
                prob_reached_end[j] = trans_prob_reached_end[j];
            }
        }

        // Master process outputs the final result.
        std::cout << "Probability, Avg. Steps, Min. Steps, Max. Steps, Prob. Reached End" << std::endl;
        for (int i = 0; i < 21; ++i)
        {
            double prob = i * prob_step;
            std::cout << prob << "," << avg_steps_over_p[i]
                      << "," << min_steps_over_p[i] << ","
                      << max_steps_over_p[i] << ","
                      << prob_reached_end[i] << std::endl;
        }
    }

    MPI_Finalize();
    return 0;
}
Here are the parameters I tried for the scaling analysis:
I am new to parallelisation and HPC, so forgive me if I have something wrong, but I would expect a speed-up of more than 3 when increasing the tasks per node and the CPUs per task. I haven't tried every combination yet, but I believe the behaviour here is odd, particularly when keeping the CPUs per task at 1 and increasing the tasks per node from 2 -> 3 -> 4. I know it isn't as simple as "more cores = more speed", but as far as I understand these runs should get faster.
Is an inefficiency in my code causing this, or is it expected behaviour? My full code, which includes the OpenMP parallelisation, is here: https://www.codedump.xyz/cpp/Y5Rr68L8Mncmx1Sd .
Many thanks.
1 Answer
1. I don't know how many operations there are in the forest_fire routine, but it had better be several tens of thousands, otherwise you don't have enough work to overcome the parallelisation overhead.
2. Rank 0 handles the worker processes strictly in order. You should use MPI_Irecv, and I wonder whether a collective operation would be preferable; a sketch of that is below, after this list.
3. You index with [i], which is a strided operation. As I pointed out in your other question, that wastes space: each process should allocate only the space it actually needs. See the second sketch below.
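On the second point, here is a rough sketch of what a collective could look like. This is not your code: the helper name gather_strided is made up, and it assumes the vectors stay laid out exactly as in your posting, i.e. each index is written by exactly one rank and stays zero everywhere else, so an element-wise MPI_Reduce with MPI_SUM reassembles the whole vector on rank 0 in one call per array and replaces the entire send/receive block.

#include <mpi.h>
#include <vector>

// Hypothetical helper: combine a result vector onto rank 0 in one collective call.
// Each index is non-zero on exactly one rank (its owner) and zero on all others,
// so summing element-wise across ranks reproduces the fully assembled vector.
template <typename T>
void gather_strided(std::vector<T> &v, MPI_Datatype type, int rank)
{
    if (rank == 0)
        MPI_Reduce(MPI_IN_PLACE, v.data(), static_cast<int>(v.size()), type, MPI_SUM, 0, MPI_COMM_WORLD);
    else
        MPI_Reduce(v.data(), nullptr, static_cast<int>(v.size()), type, MPI_SUM, 0, MPI_COMM_WORLD);
}

// Usage, replacing the whole if (rank > 0) { MPI_Send ... } else { MPI_Recv ... } block:
//   gather_strided(avg_steps_over_p, MPI_DOUBLE, rank);
//   gather_strided(min_steps_over_p, MPI_INT, rank);
//   gather_strided(max_steps_over_p, MPI_INT, rank);
//   gather_strided(prob_reached_end, MPI_DOUBLE, rank);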
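And on the third point, a rough, self-contained sketch of a contiguous block decomposition: each rank allocates only its own slice of the 21 probabilities and rank 0 reassembles the slices in order with MPI_Gatherv, so no rank carries the full, mostly-zero arrays and nothing is strided. Again, this is not your code; the per-probability work is replaced by a placeholder assignment, it only shows the data layout.

#include <mpi.h>
#include <iostream>
#include <vector>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, n_procs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &n_procs);

    const int n_probs = 21;
    // counts[r] = how many probabilities rank r handles; displs[r] = its first global index.
    std::vector<int> counts(n_procs), displs(n_procs);
    for (int r = 0, offset = 0; r < n_procs; ++r) {
        counts[r] = n_probs / n_procs + (r < n_probs % n_procs ? 1 : 0);
        displs[r] = offset;
        offset += counts[r];
    }

    // Each rank allocates only its local slice.
    std::vector<double> local_avg(counts[rank], 0.0);
    for (int local = 0; local < counts[rank]; ++local) {
        double prob = (displs[rank] + local) * 0.05;
        local_avg[local] = prob; // placeholder for the real per-probability work (the forest_fire runs)
    }

    // Rank 0 gathers the contiguous slices back into one ordered vector.
    std::vector<double> all_avg;
    if (rank == 0) all_avg.resize(n_probs);
    MPI_Gatherv(local_avg.data(), counts[rank], MPI_DOUBLE,
                all_avg.data(), counts.data(), displs.data(), MPI_DOUBLE,
                0, MPI_COMM_WORLD);

    if (rank == 0)
        for (int i = 0; i < n_probs; ++i)
            std::cout << i * 0.05 << "," << all_avg[i] << std::endl;

    MPI_Finalize();
    return 0;
}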