c++ 为什么在向量上“调整大小-删除”比“删除-删除”更快?

3pvhb19x  于 2023-06-07  发布在  其他
关注(0)|答案(1)|浏览(110)

在C++中,当涉及到从容器中删除几个元素时,有一个'erase-remove' idiom,并且有关于另一种“resize-remove”方式的讨论,例如here。人们说'erase-remove'比'resize-remove'更好,但根据我的测试,后者在向量上(稍微)更快。那么,当涉及到向量时,我应该使用“resize-remove”吗?
下面是我的基准测试代码:

#include <benchmark/benchmark.h>

#include <algorithm>
#include <functional>
#include <iostream>
#include <random>
#include <vector>

using namespace std;

constexpr size_t N_ELEMS = 1000000;
constexpr int MAX_VAL = N_ELEMS / 10;
constexpr int THRESH = MAX_VAL / 5 * 3;

static vector<int> generate_input() {
  vector<int> nums(N_ELEMS);

  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<> dist(0, N_ELEMS);

  std::generate(nums.begin(), nums.end(), std::bind(dist, std::ref(gen)));

  return std::move(nums);
}

static void bm_erase_remove(benchmark::State &state) {
  for (auto _ : state) {
    state.PauseTiming();
    auto nums = generate_input();
    state.ResumeTiming();
    nums.erase(std::remove_if(nums.begin(), nums.end(),
                              [](int x) { return x < THRESH; }),
               nums.end());
    benchmark::DoNotOptimize(nums);
  }
}
BENCHMARK(bm_erase_remove);

static void bm_resize_remove(benchmark::State &state) {
  for (auto _ : state) {
    state.PauseTiming();
    auto nums = generate_input();
    state.ResumeTiming();

    nums.resize(std::distance(
        nums.begin(), std::remove_if(nums.begin(), nums.end(),
                                     [](int x) { return x < THRESH; })));
    benchmark::DoNotOptimize(nums);
  }
}
BENCHMARK(bm_resize_remove);

BENCHMARK_MAIN();

输出:

$ g++ main.cpp -lbenchmark -O3 -pthread
$ ./a.out 
2023-05-24T20:07:22+08:00
Running ./a.out
Run on (16 X 3193.91 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
Load Average: 0.16, 0.14, 0.16
-----------------------------------------------------------
Benchmark                 Time             CPU   Iterations
-----------------------------------------------------------
bm_erase_remove      822789 ns       759162 ns          838
bm_resize_remove     818217 ns       754749 ns          935

使用clang++时,差异更大:

$ clang++ main.cpp -lbenchmark -O3 -pthread
$ ./a.out
Load Average: 0.25, 0.18, 0.17
-----------------------------------------------------------
Benchmark                 Time             CPU   Iterations
-----------------------------------------------------------
bm_erase_remove     1165085 ns      1074667 ns          611
bm_resize_remove     958856 ns       884584 ns          782

额外信息:

  • g++的版本是13.1.1clang++的版本是15.0.7
  • 我在WSL上使用Arch Linux,内核版本是5.15.90.1-microsoft-standard-WSL2
  • CPU型号为AMD Ryzen 7 6800H with Radeon Graphics
    **更新:**有趣的是,当我单独运行基准测试时(使用benchmark_filter选项),结果是等效的。这是因为缓存吗?如果是,缓存机制是如何工作的?
    **UPDATE(2023/5/25):**如果两个BENCHMARK语句被交换,则显示完全相反的结果。
ar5n3qh5

ar5n3qh51#

它们的性能差不多,这是因为std::remove_if是唯一一个修改数组的函数,而不同之处在于其他函数,std::vector::resize创建了一个realloc来适应新的大小(std::distance只返回大小,所以它可以忽略不计),而std::vector::erase只是采用了容器,使它稍微快一点。
正如@Peter Cordes “实际上保证不会重新分配” 所指出的那样,如果新的大小较小,std::vector::resize不会调整向量的大小,因此区别应该是erase(vs,clang)执行的额外移动(vs,clang)和resize(vs,clang)不执行的。
为了能够测量差异,generate_input()需要为所有测试返回相同的向量,在您的实现中,每个调用都返回一个新的随机向量,这使得无法区分运行差异和向量变化。
因此,@463035818_is_not_a_number提出了一个有趣的观点,但出于与前面相同的原因,这些函数之间没有区别。这并不意味着情况是一样的,对于一个结构体来说,一个坏分支的代价要大得多,这使得std::remove_if成为优化的主要目标,请参见下面的windows图。
所有图均来自50次运行,绿色是最佳运行和平均值之间的范围,红色是平均值到最差结果的范围,这是为了显示运行间的方差。
i5-1155G7 + 3200MHz RAM on manjaro with clang-13 and -O3
1600X + 1067MHz RAM on windows 10 (22H2) with vs2022 and /O2
1600X + 1067MHz RAM on windows 10 (22H2) with clang-16 and -O3 -pthread
1600X + 1067MHz RAM on ubuntu 22 with clang-13 and -O3
在vs中,似乎有一个优化 bug(?),这使得simple_remove的表现不如u32和u64。

#include <cstdint>
#include <random>
#include <cstring>
#include <vector>
#include <tuple>

// #include <typeinfo> // For clang

#include <benchmark/benchmark.h>

#pragma warning(disable:4267)  // Conversion errors
#pragma warning(disable:4244)  // Conversion errors

constexpr int N_ELEMS   = 500;
constexpr int RAND_SEED = 8888;

template <typename T>
struct Filler {

    T * ptr = nullptr;

    int64_t padd [sizeof(T)];

    Filler() {
        ptr = new T(0);
        memset(padd, 0, sizeof(T) * 8);
    }

    Filler(const T num){
        ptr = new T(num);
        for (int64_t& e : padd){
            e = num;
        }
    }

    Filler(const Filler& other){
        ptr = new T(*other.ptr);
        memcpy(padd, other.padd, sizeof(T) * 8);
    }

    ~Filler() {
        delete ptr;
    }

    Filler& operator=(Filler const& other){
        memcpy(padd, other.padd, sizeof(T) * 8);
        *ptr = *other.ptr;
        return *this;
    }

    inline bool operator <  (const Filler& other) { return *ptr <  *other.ptr; }
    inline bool operator <= (const Filler& other) { return *ptr <= *other.ptr; }
    inline bool operator >  (const Filler& other) { return *ptr >  *other.ptr; }
    inline bool operator >= (const Filler& other) { return *ptr >= *other.ptr; }
    inline bool operator == (const Filler& other) { return *ptr == *other.ptr; }
    inline bool operator != (const Filler& other) { return *ptr != *other.ptr; }

    inline bool operator <  (const T other) { return *ptr <  other; }
    inline bool operator <= (const T other) { return *ptr <= other; }
    inline bool operator >  (const T other) { return *ptr >  other; }
    inline bool operator >= (const T other) { return *ptr >= other; }
    inline bool operator == (const T other) { return *ptr == other; }
    inline bool operator != (const T other) { return *ptr != other; }

};

static size_t THRESH;   

template <typename T>
struct Foo {

    static std::vector<T> generate_input(size_t max = 0) {
        static size_t dist_max = 0;
        static std::vector<T> nums;

        if (nums.empty() || max){

            if (max) {
                THRESH = max / 2;
                dist_max = max;
            }

            std::mt19937 gen(RAND_SEED);
            std::uniform_int_distribution<uint64_t> dist(0, dist_max);

            for (auto& n : nums = std::vector<T>(N_ELEMS)){
                n = T(dist(gen));
            }
        }
        return nums;
    }

    static void just_remove(benchmark::State &state) {
      for (auto _ : state) {

        state.PauseTiming(); 
        std::vector<T> nums = generate_input();
        state.ResumeTiming();

        std::ignore = std::remove_if(
            nums.begin(), nums.end(),
            [](T x) { return x < THRESH; }
        );

        benchmark::DoNotOptimize(nums);
      }
    }

    static void erase_remove(benchmark::State &state) {
      for (auto _ : state) {

        state.PauseTiming();
        std::vector<T> nums = generate_input();
        state.ResumeTiming();

        nums.erase(
            std::remove_if(
                nums.begin(), nums.end(),
                [](T x) { return x < THRESH; }
            ),
            nums.end()
        );

        benchmark::DoNotOptimize(nums);
      }
    }

    static void resize_remove(benchmark::State &state) {
      for (auto _ : state) {

        state.PauseTiming(); 
        std::vector<T> nums = generate_input();
        state.ResumeTiming();

        nums.resize(
          std::distance(
            nums.begin(), 
            std::remove_if(
                nums.begin(), nums.end(),
                [](T x) { return x < THRESH; }
            )
          )
        );

        benchmark::DoNotOptimize(nums);
      }
    }

    static void simple_remove(benchmark::State &state) {
      for (auto _ : state) {

        state.PauseTiming();
        std::vector<T> nums = generate_input();
        state.ResumeTiming();

        T * n = &nums.front();
        T * m = &nums.front();

        const T thresh = T(THRESH);
        const T * back = &nums.back();
        do {

            if (*m >= thresh){
                *(n++) = std::move(*m);
            }

        } while (m++ < back);

        nums.resize(n - &nums.front());

        benchmark::DoNotOptimize(nums);
      }
    }

    static void simple_remove_unroll(benchmark::State &state) {
      for (auto _ : state) {

        state.PauseTiming();
        std::vector<T> nums = generate_input();
        state.ResumeTiming();

        T * n = &nums.front();
        T * m = &nums.front();
        const T thresh = T(THRESH);
        const T * back = &nums.back();

        switch (nums.size() % 4) {
            case 3:
                if (*m >= thresh){
                    *(n++) = std::move(*(m++));
                } else {
                    m++;
                }
            case 2:
                if (*m >= thresh){
                    *(n++) = std::move(*(m++));
                } else {
                    m++;
                }
            case 1:
                if (*m >= thresh){
                    *(n++) = std::move(*(m++));
                } else {
                    m++;
                }
        }

        do {

            if (*(m + 0) >= thresh){ *(n++) = std::move(*(m + 0)); }
            if (*(m + 1) >= thresh){ *(n++) = std::move(*(m + 1)); }
            if (*(m + 2) >= thresh){ *(n++) = std::move(*(m + 2)); }
            if (*(m + 3) >= thresh){ *(n++) = std::move(*(m + 3)); }

            m += 4;

        } while (m < back);

        nums.resize(n - &nums.front());

        benchmark::DoNotOptimize(nums);
      }
    }

};

template<typename T>
void benchmark_batch(size_t max_num) {

    std::string type = typeid(T).name();
    Foo<T>::generate_input(max_num);

    benchmark::RegisterBenchmark(
        std::string("just_remove/") + type, 
        Foo<T>::just_remove
    );
    benchmark::RegisterBenchmark(
        std::string("erase_remove/") + type, 
        Foo<T>::erase_remove
    );
    benchmark::RegisterBenchmark(
        std::string("resize_remove/") + type, 
        Foo<T>::resize_remove
    );
    benchmark::RegisterBenchmark(
        std::string("simple_remove/") + type, 
        Foo<T>::simple_remove
    );
    benchmark::RegisterBenchmark(
        std::string("simple_remove_unroll/") + type, 
        Foo<T>::simple_remove_unroll
    );
}

int main(int argc, char** argv) {
 
    benchmark_batch<uint8_t>(INT8_MAX); 
    benchmark_batch<uint32_t>(INT32_MAX);
    benchmark_batch<uint64_t>(INT64_MAX);

    benchmark_batch<Filler<uint8_t>>(INT8_MAX);
    benchmark_batch<Filler<uint32_t>>(INT32_MAX);
    benchmark_batch<Filler<uint64_t>>(INT64_MAX);

    benchmark::Initialize(&argc, argv);
    benchmark::RunSpecifiedBenchmarks();
    benchmark::Shutdown();

    return 0;
}

以及重建情节的代码。

0..49 | % { .\Release\test_cxx.exe --benchmark_min_warmup_time=0.1 --benchmark_format=csv > "./data/run_$_.csv" }
Get-ChildItem ./data | Select-Object -ExpandProperty FullName | Import-Csv | Export-Csv .\benchmark.csv -NoTypeInformation -Append
import matplotlib.ticker as tck
import matplotlib.pyplot as plt
import csv

def read_data():
    data = {}
    test_len = 0
    with open('./build/benchmark.csv') as csvfile:
        reader  = csv.DictReader(csvfile)
        for row in reader :
            test_len += 1
            name = row['name'];
            if not name in data:
                data[name] = {
                    'min': {
                        'iterations': float(row['iterations']),
                        'real_time': float(row['real_time']),
                        'cpu_time': float(row['cpu_time']),
                    }, 
                    'max': {
                        'iterations': float(row['iterations']),
                        'real_time': float(row['real_time']),
                        'cpu_time': float(row['cpu_time']),
                    }, 
                    'avg': {
                        'iterations': float(row['iterations']),
                        'real_time': float(row['real_time']),
                        'cpu_time': float(row['cpu_time']),
                    },
                }
            else:
                for k in ['iterations', 'real_time', 'cpu_time']:
                    data[name]['avg'][k] += float(row[k])
                    if float(row[k]) < float(data[name]['min'][k]):
                        data[name]['min'][k] = float(row[k])
                    if float(row[k]) > float(data[name]['max'][k]):
                        data[name]['max'][k] = float(row[k])

        test_len /= len(data.keys())
        for k in data:
            for kk in data[k]['avg']:
                data[k]['avg'][kk] /= test_len
    return data

def plot_data(data, key):
    labels = []
    values = {
        'max': [], 'avg': [], 'min': [],
    }
    labels_struct = []
    values_struct = {
        'max': [], 'avg': [], 'min': [],
    }

    for k in list(data.keys()):
        if 'struct' in k or '6' in k:
            labels_struct.append(k.replace('/', '\n').replace('struct ', ''))
            values_struct['min'].append(data[k]['min'][key])
            values_struct['max'].append(data[k]['max'][key])
            values_struct['avg'].append(data[k]['avg'][key])
        else:
            labels.append(k.replace('/', '\n'))
            values['min'].append(data[k]['min'][key])
            values['max'].append(data[k]['max'][key])
            values['avg'].append(data[k]['avg'][key])

    return labels, values, labels_struct, values_struct

thickness = 0.8
benckmark_value = 'iterations'
colors = ['#1dad2b', '#af1c23', '#e0e0e0']

if __name__ == '__main__':

    data = read_data()
    labels, values, labels_struct, values_struct = plot_data(data, benckmark_value)

    fig = plt.figure(layout="constrained")
    spec = fig.add_gridspec(ncols=2, nrows=1)
    int_formater = lambda x, p: format(int(x), ',')

    ax0 = fig.add_subplot(spec[0, 0])
    ax0.set_ylabel(benckmark_value)
    ax0.set_title('std::vector<T>')
    ax0.set_xticklabels(labels, rotation=90)
    ax0.get_yaxis().set_major_formatter(tck.FuncFormatter(int_formater))

    ax1 = fig.add_subplot(spec[0, 1])
    ax1.set_ylabel(benckmark_value)
    ax1.set_title('std::vector<Filler<T>>')
    ax1.set_xticklabels(labels_struct, rotation=90)
    ax1.get_yaxis().set_major_formatter(tck.FuncFormatter(int_formater))

    for i, (k, v) in enumerate(values.items()):
        ax0.bar(labels, v, thickness, color=colors[i])

    for i, (k, v) in enumerate(values_struct.items()):
        ax1.bar(labels_struct, v, thickness, color=colors[i])

    plt.show()

相关问题