Cheetah - SKA - PSS - Prototype Time Domain Search Pipeline
DedispersionStrategy.cpp
1 /*
2  * The MIT License (MIT)
3  *
4  * Copyright (c) 2016 The SKA organisation
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "cheetah/cuda_utils/cuda_errorhandling.h"
25 #include "cheetah/data/TimeFrequency.h"
26 #include "cheetah/data/Units.h"
27 #include "panda/Resource.h"
28 #include "panda/Log.h"
29 #include <memory>
30 #include <algorithm>
31 #include <limits>
32 
33 namespace ska {
34 namespace cheetah {
35 namespace ddtr {
36 namespace astroaccelerate{
37 
38 template<typename NumericalRep, typename OptimizationParameterT>
39 DedispersionStrategy<NumericalRep , OptimizationParameterT>::DedispersionStrategy(const data::TimeFrequency<Cpu,NumericalRep>& chunk, const ddtr::DedispersionTrialPlan& plan, std::size_t gpu_memory)
40 {
41  _nsamp = chunk.number_of_spectra();
42  _tsamp = chunk.sample_interval();
43  _nchans = chunk.number_of_channels();
44  auto t = chunk.low_high_frequencies();
45  _fch1 = t.second;
46  _foff = (t.second-t.first)/((double)_nchans);
47  _dm_constant = plan.dm_constant();
48  for(auto it = plan.begin_range(); it!=plan.end_range(); ++it)
49  {
50  _user_dm_low.push_back(it->dm_start());
51  _user_dm_high.push_back(it->dm_end());
52  _user_dm_step.push_back(it->dm_step());
53  _in_bin.push_back(1);
54  _out_bin.push_back(1);
55  }
56  _range = _user_dm_low.size();
57  _SPS_mem_requirement = memory_requirement_of_SPS();
58  _max_ndms = 0;
59  make_strategy(gpu_memory);
60 }
61 
62 template<typename NumericalRep, typename OptimizationParameterT>
63 DedispersionStrategy<NumericalRep , OptimizationParameterT>::~DedispersionStrategy()
64 {
65 }
66 
67 template<typename NumericalRep, typename OptimizationParameterT>
69 {
70  return _range;
71 }
72 
73 template<typename NumericalRep, typename OptimizationParameterT>
74 std::vector<typename DedispersionStrategy<NumericalRep , OptimizationParameterT>::Dm> DedispersionStrategy<NumericalRep , OptimizationParameterT>::user_dm_low() const
75 {
76  return _user_dm_low;
77 }
78 
79 template<typename NumericalRep, typename OptimizationParameterT>
80 std::vector<typename DedispersionStrategy<NumericalRep , OptimizationParameterT>::Dm> DedispersionStrategy<NumericalRep , OptimizationParameterT>::user_dm_high() const
81 {
82  return _user_dm_high;
83 }
84 
85 template<typename NumericalRep, typename OptimizationParameterT>
86 std::vector<typename DedispersionStrategy<NumericalRep , OptimizationParameterT>::Dm> DedispersionStrategy<NumericalRep , OptimizationParameterT>::user_dm_step() const
87 {
88  return _user_dm_step;
89 }
90 
91 template<typename NumericalRep, typename OptimizationParameterT>
93 {
94  return _in_bin;
95 }
96 
97 template<typename NumericalRep, typename OptimizationParameterT>
99 {
100  return _out_bin;
101 }
102 
103 template<typename NumericalRep, typename OptimizationParameterT>
105 {
106  return _maxshift;
107 }
108 
109 template<typename NumericalRep, typename OptimizationParameterT>
110 std::vector<typename DedispersionStrategy<NumericalRep , OptimizationParameterT>::Dm> DedispersionStrategy<NumericalRep , OptimizationParameterT>::dm_low() const
111 {
112  return _dm_low;
113 }
114 
115 template<typename NumericalRep, typename OptimizationParameterT>
116 std::vector<typename DedispersionStrategy<NumericalRep , OptimizationParameterT>::Dm> DedispersionStrategy<NumericalRep , OptimizationParameterT>::dm_high() const
117 {
118  return _dm_high;
119 }
120 
121 template<typename NumericalRep, typename OptimizationParameterT>
122 std::vector<typename DedispersionStrategy<NumericalRep , OptimizationParameterT>::Dm> DedispersionStrategy<NumericalRep , OptimizationParameterT>::dm_step() const
123 {
124  return _dm_step;
125 }
126 
127 template<typename NumericalRep, typename OptimizationParameterT>
129 {
130  return _dm_step;
131 }
132 
133 template<typename NumericalRep, typename OptimizationParameterT>
135 {
136  return _ndms;
137 }
138 
139 template<typename NumericalRep, typename OptimizationParameterT>
141 {
142  return _max_ndms;
143 }
144 
145 template<typename NumericalRep, typename OptimizationParameterT>
147 {
148  return _total_ndms;
149 }
150 
151 template<typename NumericalRep, typename OptimizationParameterT>
152 typename DedispersionStrategy<NumericalRep , OptimizationParameterT>::Dm DedispersionStrategy<NumericalRep , OptimizationParameterT>::max_dm() const
153 {
154  return _max_dm;
155 }
156 
157 template<typename NumericalRep, typename OptimizationParameterT>
159 {
160  return _t_processed;
161 }
162 
163 template<typename NumericalRep, typename OptimizationParameterT>
165 {
166  return sizeof(NumericalRep)*8;
167 }
168 
169 template<typename NumericalRep, typename OptimizationParameterT>
171 {
172  return _nifs;
173 }
174 
175 template<typename NumericalRep, typename OptimizationParameterT>
177 {
178  _nifs = value;
179 }
180 
181 template<typename NumericalRep, typename OptimizationParameterT>
182 typename DedispersionStrategy<NumericalRep , OptimizationParameterT>::TimeType DedispersionStrategy<NumericalRep , OptimizationParameterT>::tsamp() const
183 {
184  return _tsamp;
185 }
186 
187 template<typename NumericalRep, typename OptimizationParameterT>
188 typename DedispersionStrategy<NumericalRep , OptimizationParameterT>::FrequencyType DedispersionStrategy<NumericalRep , OptimizationParameterT>::fch1() const
189 {
190  return _fch1;
191 }
192 
193 template<typename NumericalRep, typename OptimizationParameterT>
194 typename DedispersionStrategy<NumericalRep , OptimizationParameterT>::FrequencyType DedispersionStrategy<NumericalRep , OptimizationParameterT>::foff() const
195 {
196  return _foff;
197 }
198 
199 template<typename NumericalRep, typename OptimizationParameterT>
201 {
202  return _nsamp;
203 }
204 
205 template<typename NumericalRep, typename OptimizationParameterT>
207 {
208  return _nchans;
209 }
210 
211 template<typename NumericalRep, typename OptimizationParameterT>
213 {
214  return _num_tchunks;
215 }
216 
217 template<typename NumericalRep, typename OptimizationParameterT>
219 {
220  return((std::size_t) (5.5*sizeof(float) + 2*sizeof(ushort)));
221 }
222 
223 // Most of the make_strategy code left unchanged. Remember the originla
224 // code is written for unsigned short. I have made changes where ever possible to
225 // using the Numericalrep template parameter.
226 template<typename NumericalRep, typename OptimizationParameterT>
228 {
229 
230  _gpu_memory = gpu_memory;
231  // This method relies on defining points when nsamps is a multiple of
232  // nchans - bin on the diagonal or a fraction of it.
233 
234  unsigned int maxshift_high = 0;
235  _maxshift = 0;
236  float n;
237 
238  _dm_low.resize(_range,0);
239  _dm_high.resize(_range,0);
240  _dm_step.resize(_range,0);
241  _ndms.resize(_range,0);
242  _dmshifts.resize(_nchans,0);
243 
244  for (unsigned int c = 0; c < _nchans; c++)
245  {
246  _dmshifts[c] = std::abs((float)(_dm_constant.value() * ((1.0 / pow(_fch1.value()-_foff.value(), 2.0f )) - (1.0 / pow(_fch1.value(), 2.0f)))));
247  }
248 
249  for (unsigned int i = 0; i < _range; i++)
250  {
251  modff(( ( (int) ( ( _user_dm_high[i] - _user_dm_low[i] ) / _user_dm_step[i] ) + OptimizationParameterT::SDIVINDM ) / OptimizationParameterT::SDIVINDM ), &n);
252  _ndms[i] = (unsigned int) ( (unsigned int) n * OptimizationParameterT::SDIVINDM );
253  if (_max_ndms < _ndms[i])
254  _max_ndms = _ndms[i];
255  _total_ndms = _total_ndms + _ndms[i];
256  }
257 
258  _dm_low[0] = _user_dm_low[0];
259  _dm_high[0] = _ndms[0] * _user_dm_step[0];
260  _dm_step[0] = _user_dm_step[0];
261 
262  float modulo = (float)(2 * OptimizationParameterT::SNUMREG * OptimizationParameterT::SDIVINT);
263  for (unsigned int i = 1; i < _range; i++)
264  {
265  _dm_low[i] = _dm_high[i - 1];
266  _dm_high[i] = _dm_low[i] + _ndms[i] * _user_dm_step[i];
267  _dm_step[i] = _user_dm_step[i];
268 
269  if (_in_bin[i - 1] > 1) {
270  _maxshift = (int) ceil(( ( _dm_low[i - 1].value() + _dm_step[i - 1].value() * _ndms[i - 1] ) * _dmshifts[_nchans - 1] ) / _tsamp.value());
271  _maxshift = (int)ceil(((float)_maxshift / (float)_in_bin[i - 1] + modulo) / modulo);
272  _maxshift = (int)(_maxshift * modulo * _in_bin[i - 1]);
273  if (_maxshift > maxshift_high)
274  maxshift_high = _maxshift;
275  }
276  }
277 
278  if (_in_bin[_range - 1] > 1) {
279  _maxshift = (unsigned int) ceil(( ( _dm_low[_range - 1].value() + _dm_step[_range - 1].value() * _ndms[_range - 1] ) * _dmshifts[_nchans - 1] ) / _tsamp.value());
280  _maxshift = (int)ceil(((float)_maxshift / (float)_in_bin[_range - 1] + modulo) / modulo);
281  _maxshift = (unsigned int)(_maxshift * modulo * _in_bin[_range - 1]);
282  if (_maxshift > maxshift_high)
283  maxshift_high = _maxshift;
284  }
285 
286  if (maxshift_high == 0) {
287  maxshift_high = (unsigned int) ceil(( ( _dm_low[_range - 1].value() + _dm_step[_range - 1].value() * ( _ndms[_range - 1] ) ) * _dmshifts[_nchans - 1] ) / _tsamp.value());
288  }
289  _max_dm = ceil(_dm_high[_range - 1]);
290  _maxshift = ( maxshift_high + ( OptimizationParameterT::SNUMREG * sizeof(NumericalRep) * OptimizationParameterT::SDIVINT ) );
291 
292  if (_maxshift >= _nsamp) {
293  PANDA_LOG_ERROR<<"ERROR!! Your maximum DM trial exceeds the number of samples you have.\nReduce your maximum DM trial \n";
294  exit(1);
295  }
296 
297  unsigned int max_tsamps;
298 
299  // Allocate memory to store the t_processed ranges:
300  _t_processed.resize(_range);
301 
302 
303  if (_nchans < _max_ndms ) {
304  // This means that we can cornerturn into the allocated output buffer
305  // without increasing the memory needed
306 
307  // Maximum number of samples we can fit in our GPU RAM is then given by:
308  max_tsamps = (unsigned int) ( (gpu_memory) / ( sizeof(NumericalRep)*_nchans + sizeof(float)*(_max_ndms) +
309  (size_t)(_SPS_mem_requirement*OptimizationParameterT::MIN_DMS_PER_SPS_RUN )));
310 
311  // Check that we dont have an out of range maxshift:
312  if (_maxshift > max_tsamps) {
313  PANDA_LOG_ERROR<<"\nERROR!! Your GPU doesn't have enough memory for this number of dispersion trials. \n"
314  <<"Reduce your maximum dm or increase the size of your dm step";
315  exit(0);
316  }
317 
318  // Next check to see if nsamp fits in GPU RAM:
319  if (_nsamp < max_tsamps) {
320  // We have case 1)
321  // Allocate memory to hold the values of nsamps to be processed
322  unsigned int local_t_processed = (unsigned int) floor(( (float) ( _nsamp - _maxshift ) / (float) _in_bin[_range - 1] ) / (float) ( OptimizationParameterT::SDIVINT * 2 * OptimizationParameterT::SNUMREG ));
323  local_t_processed = local_t_processed * ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG ) * _in_bin[_range - 1];
324  for (unsigned int i = 0; i < _range; i++) {
325  _t_processed[i].resize(1);
326  _t_processed[i][0] = (int) floor(( (float) ( local_t_processed ) / (float) _in_bin[i] ) / (float) ( OptimizationParameterT::SDIVINT * 2 * OptimizationParameterT::SNUMREG ));
327  _t_processed[i][0] = _t_processed[i][0] * ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG );
328  }
329  _num_tchunks = 1;
330 
331  }
332  else {
333  // We have case 3)
334  // Work out how many time samples we can fit into ram
335  int samp_block_size = max_tsamps - _maxshift;
336  //int samp_block_size = max_tsamps;
337 
338  // Work out how many blocks of time samples we need to complete the processing
339  // upto nsamp-maxshift
340  //int num_blocks = (int) floor(( (float) nsamp - ( *maxshift ) )) / ( (float) ( samp_block_size ) ) + 1;
341 
342  // Find the common integer amount of samples between all bins
343  int local_t_processed = (int) floor(( (float) ( samp_block_size ) / (float) _in_bin[_range - 1] ) / (float) ( OptimizationParameterT::SDIVINT * 2 * OptimizationParameterT::SNUMREG ));
344  unsigned int num_blocks;
345  if(local_t_processed == 0) {
346  num_blocks = 0;
347 
348  } else {
349  local_t_processed = local_t_processed * ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG ) * _in_bin[_range - 1];
350  num_blocks = (int) floor(( (float) _nsamp - ((float)_maxshift) )) / ( (float) ( local_t_processed ) );
351  }
352 
353  // Work out the remaining fraction to be processed
354  int remainder = _nsamp - ( num_blocks * local_t_processed ) - _maxshift;
355  remainder = (int) floor((float) remainder / (float) _in_bin[_range - 1]) / (float) ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG );
356  remainder = remainder * ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG ) * _in_bin[_range - 1];
357 
358  for (unsigned int i = 0; i < _range; i++) {
359  // Allocate memory to hold the values of nsamps to be processed
360  _t_processed[i].resize(num_blocks+1);
361  // Remember the last block holds less!
362  for (unsigned j = 0; j < num_blocks ; j++) {
363  _t_processed[i][j] = (int) floor(( (float) ( local_t_processed ) / (float) _in_bin[i] ) / (float) ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG ));
364  _t_processed[i][j] = _t_processed[i][j] * ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG );
365  }
366  // fractional bit
367  _t_processed[i][num_blocks] = (int) floor(( (float) ( remainder ) / (float) _in_bin[i] ) / (float) ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG ));
368  _t_processed[i][num_blocks] = _t_processed[i][num_blocks] * ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG );
369  }
370  _num_tchunks = num_blocks + 1;
371  }
372  }
373  else {
374  // This means that we cannot cornerturn into the allocated output buffer
375  // without increasing the memory needed. Set the output buffer to be as large as the input buffer:
376 
377  // Maximum number of samples we can fit in our GPU RAM is then given by:
378  max_tsamps = (unsigned int) ( ( gpu_memory ) / ( _nchans * ( sizeof(float) + sizeof(unsigned short) )+ _SPS_mem_requirement*OptimizationParameterT::MIN_DMS_PER_SPS_RUN ));
379 
380  // Check that we dont have an out of range maxshift:
381  if (_maxshift > max_tsamps) {
382  PANDA_LOG_ERROR<<"\nERROR!! Your GPU doens't have enough memory for this number of dispersion trials."
383  <<"\nReduce your maximum dm or increase the size of your dm step";
384  exit(0);
385  }
386 
387  // Next check to see if nsamp fits in GPU RAM:
388  if (_nsamp < max_tsamps) {
389  // We have case 2)
390  // Allocate memory to hold the values of nsamps to be processed
391  int local_t_processed = (int) floor(( (float) ( _nsamp - _maxshift ) / (float) _in_bin[_range - 1] ) / (float) ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG ));
392  local_t_processed = local_t_processed * ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG ) * _in_bin[_range - 1];
393  for (unsigned int i = 0; i < _range; i++) {
394  _t_processed[i].resize(1);
395  _t_processed[i][0] = (int) floor(( (float) ( local_t_processed ) / (float) _in_bin[i] ) / (float) ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG ));
396  _t_processed[i][0] = _t_processed[i][0] * ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG );
397  }
398  _num_tchunks = 1;
399 
400  }
401  else {
402  // We have case 4)
403  // Work out how many time samples we can fit into ram
404  unsigned int samp_block_size = max_tsamps - _maxshift;
405 
406  // Work out how many blocks of time samples we need to complete the processing
407  // upto nsamp-maxshift
408  //int num_blocks = (int) floor(( (float) nsamp - (float) ( *maxshift ) ) / ( (float) samp_block_size ));
409 
410  // Find the common integer amount of samples between all bins
411  unsigned int local_t_processed = (unsigned int) floor(( (float) ( samp_block_size ) / (float) _in_bin[_range - 1] ) / (float) ( OptimizationParameterT::SDIVINT * 2 * OptimizationParameterT::SNUMREG ));
412  local_t_processed = local_t_processed * ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG ) * _in_bin[_range - 1];
413 
414  // samp_block_size was not used to calculate remainder instead there is local_t_processed which might be different
415  unsigned int num_blocks = (unsigned int) floor(( (float) _nsamp - (float) _maxshift ) / ( (float) local_t_processed ));
416 
417  // Work out the remaining fraction to be processed
418  unsigned int remainder = _nsamp - ( num_blocks * local_t_processed ) - _maxshift;
419 
420  for (unsigned int i = 0; i < _range; i++) {
421  // Allocate memory to hold the values of nsamps to be processed
422  _t_processed[i].resize(num_blocks + 1);
423  // Remember the last block holds less!
424  for (unsigned int j = 0; j < num_blocks; j++) {
425  _t_processed[i][j] = (int) floor(( (float) ( local_t_processed ) / (float) _in_bin[i] ) / (float) ( OptimizationParameterT::SDIVINT * 2 * OptimizationParameterT::SNUMREG ));
426  _t_processed[i][j] = _t_processed[i][j] * ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG );
427  }
428  // fractional bit
429  _t_processed[i][num_blocks] = (int) floor(( (float) ( remainder ) / (float) _in_bin[i] ) / (float) ( OptimizationParameterT::SDIVINT * 2 * OptimizationParameterT::SNUMREG ));
430  _t_processed[i][num_blocks] = _t_processed[i][num_blocks] * ( OptimizationParameterT::SDIVINT * sizeof(NumericalRep) * OptimizationParameterT::SNUMREG );
431  }
432  _num_tchunks = num_blocks + 1;
433 
434  }
435  }
436 
437  _dedispersed_time_samples = 0;
438  for (unsigned t = 0; t < _num_tchunks; ++t)
439  {
440  _dedispersed_time_samples += (_t_processed[0][t] * _in_bin[0]);
441  }
442 }
443 
444 
445 }//astroacclerate
446 }//sps
447 }//cheetah
448 }//ska
unsigned int nifs() const
The number of IF channels.
unsigned int total_ndms() const
The total number of dm.
FrequencyType foff() const
Frequency channel width.
std::vector< unsigned int > ndms() const
An array containing the number of dms for each range.
std::vector< Dm > user_dm_step() const
An array containing dm step of each dm range, specified by the user.
unsigned int num_tchunks() const
The number of chunks the data are divided in.
unsigned int maxshift() const
Value used to make sure that dms from dm_low to dm_high are used.
static constexpr unsigned int nbits()
The number of bits of the input data.
std::vector< float > dmshifts() const
An array containing a constant associated with each channel to perform dedispersion algorithm...
std::vector< Dm > dm_step() const
An array containing the step size of each dm range.
std::vector< Dm > dm_high() const
An array containing the highest bound of each dm range.
unsigned int range() const
The number of dm ranges.
FrequencyType fch1() const
Frequency of the first channel.
Some limits and constants for FLDO.
Definition: Brdz.h:35
std::vector< Dm > dm_low() const
An array containing the lowest bound of each dm range.
unsigned int max_ndms() const
The maximum number of dm.
unsigned int nsamp() const
The number of time samples.
std::vector< Dm > user_dm_low() const
An array containing lowest band of each dm range, specified by the user.
std::vector< std::vector< int > > t_processed() const
The number of time samples required to search for a dm in each dm range.
std::vector< Dm > user_dm_high() const
An array containing highest band of each dm range, specified by the user.
unsigned int nchans() const
The number of frequency channels.