C++: Poor scaling when running code in parallel with MPI and OpenMP

7d7tgy0s · asked on 2022-12-15 · in Other

I have the following implementation:

#include <mpi.h>

#include <cstdlib>
#include <iostream>
#include <limits>
#include <vector>

// Results and forest_fire() are defined in the full code linked below.
int main(int argc, char **argv)
{
    int n_runs = 100; // Number of runs
    int seed = 1; 
    int arraySize = 400;
    /////////////////////////////////////////////////////////////////////
   
    // initialise the random number generator using a fixed seed for reproducibility
    srand(seed); 

    MPI_Init(nullptr, nullptr);

    int rank, n_procs;
    MPI_Comm_size(MPI_COMM_WORLD, &n_procs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // Initialise the probability step and results vectors.
    // We have 21 probabilities between 0 and 1 (inclusive).
    double prob_step = 0.05;
    std::vector<double> avg_steps_over_p(21,0);
    std::vector<double> trans_avg_steps_over_p(21,0);
    std::vector<int> min_steps_over_p(21,0);
    std::vector<int> trans_min_steps_over_p(21,0);
    std::vector<int> max_steps_over_p(21,0);
    std::vector<int> trans_max_steps_over_p(21,0);
    std::vector<double> prob_reached_end(21,0);
    std::vector<double> trans_prob_reached_end(21,0);

    // Loop over probabilities and compute the number of steps before the model burns out,
    // averaged over n_runs.
    for (int i = rank; i < 21; i+=n_procs)
    {
        double prob = i*prob_step;

        int min_steps = std::numeric_limits<int>::max();
        int max_steps = 0;

        for (int i_run = 0; i_run < n_runs; ++i_run)
        {
            Results result = forest_fire(arraySize, prob);
            
            avg_steps_over_p[i] += result.stepCount;

            if (result.fireReachedEnd) ++prob_reached_end[i];
            if (result.stepCount < min_steps) min_steps = result.stepCount;
            if (result.stepCount > max_steps) max_steps = result.stepCount;
        }

        avg_steps_over_p[i] /= n_runs;
        min_steps_over_p[i] = min_steps;
        max_steps_over_p[i] = max_steps;
        prob_reached_end[i] = 1.0*prob_reached_end[i] / n_runs;
    }

    // Worker processes communicate their results to the master process.
    if (rank > 0)
    {
        MPI_Send(&avg_steps_over_p[0], 21, MPI_DOUBLE, 0, rank, MPI_COMM_WORLD);
        MPI_Send(&min_steps_over_p[0], 21, MPI_INT, 0, rank, MPI_COMM_WORLD);
        MPI_Send(&max_steps_over_p[0], 21, MPI_INT, 0, rank, MPI_COMM_WORLD);
        MPI_Send(&prob_reached_end[0], 21, MPI_DOUBLE, 0, rank, MPI_COMM_WORLD);
    } else
    {
        for (int i = 1; i < n_procs; ++i)
        {
            MPI_Status status;
            MPI_Recv(&trans_avg_steps_over_p[0], 21, MPI_DOUBLE, i, i, MPI_COMM_WORLD, &status);
            for (int j = i; j < 21; j += n_procs) {
                 avg_steps_over_p[j] = trans_avg_steps_over_p[j];
            }
            MPI_Recv(&trans_min_steps_over_p[0], 21, MPI_INT, i, i, MPI_COMM_WORLD, &status);
            for (int j = i; j < 21; j += n_procs) {
                 min_steps_over_p[j] = trans_min_steps_over_p[j];
            }

            MPI_Recv(&trans_max_steps_over_p[0], 21, MPI_INT, i, i, MPI_COMM_WORLD, &status);
            for (int j = i; j < 21; j += n_procs) {
                 max_steps_over_p[j] = trans_max_steps_over_p[j];
            }

            MPI_Recv(&trans_prob_reached_end[0], 21, MPI_DOUBLE, i, i, MPI_COMM_WORLD, &status);
            for (int j = i; j < 21; j += n_procs) {
                 prob_reached_end[j] = trans_prob_reached_end[j];
            }
        }

        // Master process outputs the final result.
        std::cout << "Probability, Avg. Steps, Min. Steps, Max Steps" << std::endl;
        for (int i = 0; i < 21; ++i)
        {
            double prob = i * prob_step;
            std::cout << prob << "," << avg_steps_over_p[i]
                      << "," << min_steps_over_p[i] << "," 
                      << max_steps_over_p[i] << "," 
                      << prob_reached_end[i] << std::endl;
        }
    }

    MPI_Finalize();
    return 0;
}

I have tried the following parameters: scaling analysis
I am new to parallelisation and HPC, so forgive me if I have something wrong, but I expected a speedup greater than 3 when increasing the tasks per node and the CPUs per task. I haven't tried every combination yet, but I believe the behaviour here is odd, particularly when keeping the CPUs per task at 1 and increasing the tasks per node from 2 -> 3 -> 4. I know it isn't as simple as higher core usage = higher speed, but as far as I understand these changes should give a speedup.
Is this caused by an inefficiency in my code, or is this the expected behaviour? My full code, which includes the OpenMP parallelisation, is here: https://www.codedump.xyz/cpp/Y5Rr68L8Mncmx1Sd
Many thanks.

bweufnob1#

1. I don't know how many operations there are in the forest_fire routine, but it had better be tens of thousands, otherwise you don't have enough work to overcome the parallelisation overhead.
2. Rank 0 handles all the worker processes sequentially. You should use MPI_Irecv. I wonder whether collective operations would be preferable. (A sketch of this follows the list.)
3. You index with [i], which is a strided operation. As I pointed out on your other question, this wastes space. Each process should allocate only the space that process needs. (See the second sketch below.)
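
A minimal sketch of point 2, keeping the question's strided layout and reusing rank, n_procs and avg_steps_over_p from the code above; only one of the four vectors is shown, the other three exchanges would follow the same pattern:

    if (rank > 0)
    {
        MPI_Send(avg_steps_over_p.data(), 21, MPI_DOUBLE, 0, rank, MPI_COMM_WORLD);
    } else
    {
        // One temporary buffer and one request per worker, all posted up front
        // so the receives can complete in any order.
        std::vector<std::vector<double>> recv_bufs(n_procs, std::vector<double>(21, 0.0));
        std::vector<MPI_Request> requests(n_procs - 1);
        for (int src = 1; src < n_procs; ++src)
        {
            MPI_Irecv(recv_bufs[src].data(), 21, MPI_DOUBLE, src, src,
                      MPI_COMM_WORLD, &requests[src - 1]);
        }

        // Block once for all workers instead of serialising on each MPI_Recv.
        MPI_Waitall(n_procs - 1, requests.data(), MPI_STATUSES_IGNORE);

        // Merge the strided entries owned by each worker.
        for (int src = 1; src < n_procs; ++src)
            for (int j = src; j < 21; j += n_procs)
                avg_steps_over_p[j] = recv_bufs[src][j];
    }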
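
And a sketch of point 3, again only for the averages: give each rank a contiguous block of the 21 probabilities, keep the results in a rank-local vector, and gather them on rank 0 with a single collective. This replaces the corresponding allocation, loop and communication in the code above; names such as local_avg, counts and displs are illustrative, while prob_step, rank and n_procs are the variables already defined there:

    const int n_probs = 21;

    // Split the 21 probabilities into nearly equal contiguous blocks;
    // the first `extra` ranks get one element more.
    int base     = n_probs / n_procs;
    int extra    = n_probs % n_procs;
    int my_count = base + (rank < extra ? 1 : 0);
    int my_start = rank * base + (rank < extra ? rank : extra);

    // Each rank only stores what it computes.
    std::vector<double> local_avg(my_count, 0.0);
    for (int k = 0; k < my_count; ++k)
    {
        double prob = (my_start + k) * prob_step;
        // ... accumulate the n_runs results for this probability into local_avg[k] ...
    }

    // Receive counts and offsets, needed on the root by MPI_Gatherv.
    std::vector<int> counts(n_procs), displs(n_procs);
    for (int r = 0; r < n_procs; ++r)
    {
        counts[r] = base + (r < extra ? 1 : 0);
        displs[r] = r * base + (r < extra ? r : extra);
    }

    // Only the root holds the full 21-element result.
    std::vector<double> avg_steps_over_p;
    if (rank == 0) avg_steps_over_p.resize(n_probs);

    MPI_Gatherv(local_avg.data(), my_count, MPI_DOUBLE,
                avg_steps_over_p.data(), counts.data(), displs.data(), MPI_DOUBLE,
                0, MPI_COMM_WORLD);

With the existing strided layout, a single MPI_Reduce with MPI_SUM onto rank 0 would also work, since each index is written by exactly one rank and stays zero everywhere else, but the block layout above additionally removes the wasted per-rank storage.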
