I am using thrust::reduce inside a functor which is an argument in thrust::transform_reduce. The situation looks like a nested thrust algorithm. The compilation succeeds but it runs with error:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): cudaEventSynchronize in future::wait: an illegal memory access was encountered
Aborted (core dumped)
The code is as followed:
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <cmath>
#include <boost/concept_check.hpp>
struct aFuntor : public thrust::unary_function<int, int>
{
aFuntor(int* av__, int* bv__, const int& N__) : av_(av__), bv_(bv__), N_(N__) {};
__host__ __device__
int operator()(const int& idx)
{
thrust::device_ptr<int> av_dpt = thrust::device_pointer_cast(av_);
int res = thrust::reduce(av_dpt, av_dpt+N_);
return res;
}
int* av_;
int* bv_;
int N_;
};
int main(void)
{
int N = 5;
std::vector<int> av = {0,1,3,5};
std::vector<int> bv = {0,10,20,30};
thrust::device_vector<int> av_d(N);
thrust::device_vector<int> bv_d(N);
av_d = av; bv_d = bv;
// initial value of the reduction
int init=0;
// binary operations
thrust::plus<int> bin_op;
int res =
thrust::transform_reduce(thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(N-1),
aFuntor(thrust::raw_pointer_cast(av_d.data()),
thrust::raw_pointer_cast(bv_d.data()),
N),
init,
bin_op);
std::cout << "result is: " << res << std::endl;
return 0;
}
does thrust support this kind of nested structure? or there isn't any way around except having to redesign my algorithm? AFAIK there are algorithms that are difficult to expose parallelism?
Thank you in advance!