#include <time.h> // --- time
#include <stdlib.h> // --- srand, rand
#include <iostream>
#include <thrust\host_vector.h>
#include <thrust\device_vector.h>
#include <thrust\sort.h>
#include <thrust\iterator\zip_iterator.h>
#include <thrust\unique.h>
#include <thrust/binary_search.h>
#include <thrust\adjacent_difference.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
//#define VERBOSE
#define NO_HISTOGRAM
/********/
/* MAIN */
/********/
int main() {
const int N = 1048576;
//const int N = 20;
//const int N = 128;
TimingGPU timerGPU;
// --- Initialize random seed
srand(time(NULL));
thrust::host_vector<int> h_code(N);
for (int k = 0; k < N; k++) {
// --- Generate random numbers between 0 and 9
h_code[k] = (rand() % 10);
}
thrust::device_vector<int> d_code(h_code);
//thrust::device_vector<unsigned int> d_counting(N);
thrust::sort(d_code.begin(), d_code.end());
h_code = d_code;
timerGPU.StartCounter();
#ifdef NO_HISTOGRAM
// --- The number of d_cumsum bins is equal to the maximum value plus one
int num_bins = d_code.back() + 1;
thrust::device_vector<int> d_code_unique(num_bins);
thrust::unique_copy(d_code.begin(), d_code.end(), d_code_unique.begin());
thrust::device_vector<int> d_counting(num_bins);
thrust::upper_bound(d_code.begin(), d_code.end(), d_code_unique.begin(), d_code_unique.end(), d_counting.begin());
#else
thrust::device_vector<int> d_cumsum;
// --- The number of d_cumsum bins is equal to the maximum value plus one
int num_bins = d_code.back() + 1;
// --- Resize d_cumsum storage
d_cumsum.resize(num_bins);
// --- Find the end of each bin of values - Cumulative d_cumsum
thrust::counting_iterator<int> search_begin(0);
thrust::upper_bound(d_code.begin(), d_code.end(), search_begin, search_begin + num_bins, d_cumsum.begin());
// --- Compute the histogram by taking differences of the cumulative d_cumsum
//thrust::device_vector<int> d_counting(num_bins);
//thrust::adjacent_difference(d_cumsum.begin(), d_cumsum.end(), d_counting.begin());
#endif
printf("Timing GPU = %f\n", timerGPU.GetCounter());
#ifdef VERBOSE
thrust::host_vector<int> h_counting(d_counting);
printf("After\n");
for (int k = 0; k < N; k++) printf("code = %i\n", h_code[k]);
#ifndef NO_HISTOGRAM
thrust::host_vector<int> h_cumsum(d_cumsum);
printf("\nCounting\n");
for (int k = 0; k < num_bins; k++) printf("element = %i; counting = %i; cumsum = %i\n", k, h_counting[k], h_cumsum[k]);
#else
thrust::host_vector<int> h_code_unique(d_code_unique);
printf("\nCounting\n");
for (int k = 0; k < N; k++) printf("element = %i; counting = %i\n", h_code_unique[k], h_counting[k]);
#endif
#endif
}
// Get most frequent element
// Get index of the maximum frequency
int num_keys = new_end.first - output_keys.begin();
thrust::device_vector<int>::iterator iter = thrust::max_element(output_freqs.begin(), output_freqs.begin() + num_keys);
unsigned int index = iter - output_freqs.begin();
int most_frequent_key = output_keys[index];
int most_frequent_val = output_freqs[index]; // Frequencies
4条答案
按热度按时间fslejnso1#
您可以通过首先对数字进行排序,然后进行关键字缩减来实现直方图。
最直接的方法是使用
thrust::sort
,然后使用thrust::reduce_by_key
。它通常也比基于原子的临时装箱快得多。这是一个example。hi3rlvi22#
我想您可以在CUDA示例中找到帮助,特别是直方图示例。它们是GPU计算SDK的一部分。你可以在www.example.com找到它http://developer.nvidia.com/cuda-cc-sdk-code-samples#histogram,他们甚至有一份解释算法的白皮书。
hlswsv353#
我在比较重复问题thrust count occurence中提出的两种方法,即:
1.使用
thrust::counting_iterator
和thrust::upper_bound
,遵循直方图Thrust示例;1.使用
thrust::unique_copy
和thrust::upper_bound
。下面,请看一个完整的例子。
第一种方法被证明是最快的。在NVIDIA GTX 960卡上,我对许多
N = 1048576
数组元素进行了以下计时:请注意,没有严格的必要显式计算相邻差,因为如果需要,可以在内核处理期间手动完成此操作。
ve7v8dk24#
正如其他人所说,您可以使用
sort & reduce_by_key
方法来计算频率。在我的例子中,我需要获取数组的模式(最大频率/出现次数),所以这里是我的解决方案:1 -首先,我们创建两个新数组,一个包含输入数据的副本,另一个填充1以稍后减少它(求和):
然后,我们对键进行排序,因为
reduce_by_key
函数需要对数组进行排序。3 -稍后,我们为(唯一)密钥及其频率创建两个输出向量:
4 -最后,我们通过键执行约简:
5 -...如果我们愿意,我们可以得到最频繁的元素