Perform harmonic summing of a PowerSeries object.
To keep the simplicity of using only the PowerSeries class, rather than implementing a PowerSeries2D-like class with a single contiguous storage buffer, the output of this method is a std::vector of PowerSeries objects. To use these on the device, it is necessary to first create a thrust::device_vector of device pointers, whose underlying storage can itself be passed as a raw pointer to any function. Here this is done by first creating a thrust::host_vector, filling it with the raw pointer cast of each output series' data buffer, and finally copying that host_vector to the device, where it can be used.
105 PUSH_NVTX_RANGE(
"cuda_Hrms_process",0);
106 PANDA_LOG_DEBUG <<
"GPU ID: "<<gpu.device_id();
108 PUSH_NVTX_RANGE(
"cuda_Hrms_process_prepare",1);
110 thrust::host_vector<T*> output_ptrs_host(output.size());
112 for (
int idx=0; idx<output.size(); ++idx)
114 auto& series = output[idx];
115 double hnum = (double)(1<<(idx+1));
117 series.resize(input.size());
120 series.degrees_of_freedom(input.degrees_of_freedom()*hnum);
123 series.frequency_step((input.frequency_step().value()/hnum) * data::hz);
126 output_ptrs_host[idx] = thrust::raw_pointer_cast(series.data());
130 thrust::device_vector<T*> output_ptrs_device = output_ptrs_host;
131 auto input_ptr = thrust::raw_pointer_cast(input.data());
132 auto output_ptrs = thrust::raw_pointer_cast(output_ptrs_device.data());
133 thrust::counting_iterator<unsigned> begin(0);
134 thrust::counting_iterator<unsigned> end = begin + input.size();
137 PUSH_NVTX_RANGE(
"cuda_Hrms_process_execute_kernels",2);
142 switch (output.size())
145 thrust::for_each(thrust::cuda::par, begin, end, detail::HarmonicSumFunctor<T,1>(input_ptr,output_ptrs));
148 thrust::for_each(thrust::cuda::par, begin, end, detail::HarmonicSumFunctor<T,2>(input_ptr,output_ptrs));
151 thrust::for_each(thrust::cuda::par, begin, end, detail::HarmonicSumFunctor<T,3>(input_ptr,output_ptrs));
154 thrust::for_each(thrust::cuda::par, begin, end, detail::HarmonicSumFunctor<T,4>(input_ptr,output_ptrs));
157 thrust::for_each(thrust::cuda::par, begin, end, detail::HarmonicSumFunctor<T,5>(input_ptr,output_ptrs));
160 panda::Error(
"Invalid number of sums requested of Hrms.");