Cheetah - SKA - PSS - Prototype Time Domain Search Pipeline
Hrms.cu
1 #include "cheetah/hrms/cuda/Hrms.cuh"
2 #include "cheetah/data/Units.h"
3 #include "cheetah/cuda_utils/cuda_thrust.h"
4 #include "cheetah/cuda_utils/cuda_errorhandling.h"
5 #include "cheetah/cuda_utils/nvtx.h"
6 #include "panda/Error.h"
7 #include "panda/Log.h"
8 
9 namespace ska {
10 namespace cheetah {
11 namespace hrms {
12 namespace cuda {
13 namespace detail {
14 
26 template <typename T, unsigned max_harms>
28 {
29  public:
36  HarmonicSumFunctor(T const* input, T** output);
37 
42  inline __host__ __device__
43  void operator() (unsigned idx) const;
44 
45  private:
46  T const* _input;
47  T** _output;
48 };
49 
50 template <typename T, unsigned max_harms>
52  : _input(input)
53  , _output(output)
54 {
55 }
56 
62 template <typename T, unsigned max_harms>
63 inline __host__ __device__ void HarmonicSumFunctor<T, max_harms>::operator() (unsigned idx) const
64 {
65  unsigned int ii, jj, harm;
66  float fharm;
67  T val = _input[idx];
68 
69  //#pragma unroll -- these want to be unrolled
70  for (jj=0; jj<max_harms; jj++)
71  {
72  harm = 1<<(jj+1);
73  fharm = 1.0f/harm;
74 
75  //#pragma unroll -- these want to be unrolled
76  for (ii=1;ii<harm;ii+=2)
77  {
78  float factor = ii*fharm;
79  val += _input[(unsigned int) (idx*factor + 0.5f)];
80  }
81  _output[jj][idx] = val;
82  }
83 }
84 
85 } // namespace detail
86 
87 
88 template <typename T, typename Alloc>
89 void Hrms::process(ResourceType& gpu,
91  std::vector<data::PowerSeries<cheetah::Cuda,T,Alloc>>& output)
92 {
105  PUSH_NVTX_RANGE("cuda_Hrms_process",0);
106  PANDA_LOG_DEBUG << "GPU ID: "<<gpu.device_id();
107 
108  PUSH_NVTX_RANGE("cuda_Hrms_process_prepare",1);
109  //container for device pointers
110  thrust::host_vector<T*> output_ptrs_host(output.size());
111 
112  for (int idx=0; idx<output.size(); ++idx)
113  {
114  auto& series = output[idx];
115  double hnum = (double)(1<<(idx+1));
116  //resize the outputs to the correct size
117  series.resize(input.size());
118 
119  //recalculate the dof
120  series.degrees_of_freedom(input.degrees_of_freedom()*hnum);
121 
122  //set the outputs metadata
123  series.frequency_step((input.frequency_step().value()/hnum) * data::hz);
124 
125  //store device pointer in host_vector
126  output_ptrs_host[idx] = thrust::raw_pointer_cast(series.data());
127  }
128 
129  //copy the host array of device pointers to the device
130  thrust::device_vector<T*> output_ptrs_device = output_ptrs_host;
131  auto input_ptr = thrust::raw_pointer_cast(input.data());
132  auto output_ptrs = thrust::raw_pointer_cast(output_ptrs_device.data());
133  thrust::counting_iterator<unsigned> begin(0);
134  thrust::counting_iterator<unsigned> end = begin + input.size();
135 
136  POP_NVTX_RANGE; // cuda_Hrms_process_prepare
137  PUSH_NVTX_RANGE("cuda_Hrms_process_execute_kernels",2);
138  //Here we use a switch case based on the number of PowerSeries objects in
139  //the output vector. This determines the number of harmonic sums to be performed.
140  //The switch case then allows us to dispatch to a partial specialisation of the
141  //harmonic summing functor.
142  switch (output.size())
143  {
144  case 1:
145  thrust::for_each(thrust::cuda::par, begin, end, detail::HarmonicSumFunctor<T,1>(input_ptr,output_ptrs));
146  break;
147  case 2:
148  thrust::for_each(thrust::cuda::par, begin, end, detail::HarmonicSumFunctor<T,2>(input_ptr,output_ptrs));
149  break;
150  case 3:
151  thrust::for_each(thrust::cuda::par, begin, end, detail::HarmonicSumFunctor<T,3>(input_ptr,output_ptrs));
152  break;
153  case 4:
154  thrust::for_each(thrust::cuda::par, begin, end, detail::HarmonicSumFunctor<T,4>(input_ptr,output_ptrs));
155  break;
156  case 5:
157  thrust::for_each(thrust::cuda::par, begin, end, detail::HarmonicSumFunctor<T,5>(input_ptr,output_ptrs));
158  break;
159  default:
160  panda::Error("Invalid number of sums requested of Hrms.");
161  }
162  POP_NVTX_RANGE; // cuda_Hrms_process_execute_kernels
163  POP_NVTX_RANGE; // cuda_Hrms_process
164 }
165 
166 } //cuda
167 } //hrms
168 } //cheetah
169 } //ska
FourierFrequencyType const & frequency_step() const
Retrieve the frequency step of the series.
void process(ResourceType &gpu, data::PowerSeries< cheetah::Cuda, T, Alloc > const &input, std::vector< data::PowerSeries< cheetah::Cuda, T, Alloc >> &output)
Perform harmonic summing of a PowerSeries object.
Definition: Hrms.cu:89
A thrust functor for harmonic summing.
Definition: Hrms.cu:27
Some limits and constants for FLDO.
Definition: Brdz.h:35
__host__ __device__ void operator()(unsigned idx) const
Definition: Hrms.cu:63
Class for power series (detected FrequencySeries).
Definition: PowerSeries.h:58
std::size_t size() const
the size of the series
Definition: Series.cpp:109
double degrees_of_freedom() const
Retreive the (assumed) degrees of freedom of the data distribution.
Definition: PowerSeries.cpp:58
HarmonicSumFunctor(T const *input, T **output)
Construct a new instance.
Definition: Hrms.cu:51