Cheetah - SKA - PSS - Prototype Time Domain Search Pipeline
FldoCuda.cpp
1 /*
2  * The MIT License (MIT)
3  *
4  * Copyright (c) 2016 The SKA organisation
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "cheetah/fldo/cuda/Fldo.h"
25 #include "cheetah/fldo/cuda/Config.h"
26 #include "cheetah/fldo/cuda/detail/FldoCuda.h"
27 #include "cheetah/fldo/cuda/detail/Optimization.h"
28 #include "cheetah/cuda_utils/cuda_errorhandling.h"
29 #include <climits>
30 #include <fstream>
31 
32 namespace ska {
33 namespace cheetah {
34 namespace fldo {
35 namespace cuda {
36 
37 #ifdef ENABLE_CUDA
38 
39 template<typename NumericalT>
40 FldoCuda<NumericalT>::FldoCuda(fldo::Config const& config)
41  : _config(config)
42 {
43  //cuda configuration
44  fldo::cuda::Config const& cuda_config = _config.cuda_algo_config();
45  //initialize class data members
46  //max number of phases used in folding
47  _mfold_phases = cuda_config.phases();
48  _nsubbands = cuda_config.nsubbands();
49  _nsubints = cuda_config.nsubints();
50  //enable_split flag to enable/disable phase splitting in folding algorithm
51  _enable_split = cuda_config.enable_split();
52 }
53 
54 template<typename NumericalT>
55 std::shared_ptr<data::Ocld> FldoCuda<NumericalT>::operator()(ResourceType& gpu,
56  std::vector<std::shared_ptr<TimeFrequencyType>> const& input_data,
57  data::Scl const& candidates_list)
58 {
59 
60  // local copy formatted as espected by folder routines
61  std::vector<std::shared_ptr<TimeFrequencyType>> data;
62  data::Scl input_candidates = candidates_list;
63 
64  // standard control if no data in input return NULL
65  if ((0 == input_data.size()) || (0 == input_data[0]->number_of_spectra() )) {
66  PANDA_LOG_ERROR << "Fldo: data buffer size == 0";
67  return data::Ocld::make_shared();
68  }
69 
70  // Control on candidate list size
71  PANDA_LOG_DEBUG << "Fldo: Total candidates number == " << input_candidates.size();
72  PANDA_LOG << "Fldo: Total candidates number == " << input_candidates.size();
73 
74  // Candidates number > 0
75  if (0 == input_candidates.size() ) {
76  PANDA_LOG_ERROR << "Fldo: candidate buffer size == 0";
77  return data::Ocld::make_shared();
78  }
79 
80  // control and re-formatting of the input data
81  //
82  // Folding routine assumes input data are organized as a a vector of _nsubints
83  // components, each containing (approx) the same number of measures.
84  // data_reshape accepts a generic vector, and if correct, returns it
85  // unchanged. If not the case, it reshapes it as required.
86  //
87 
88  {
89  // compute total data sample size along the axis time
90  uint64_t totalsize = 0;
91  uint64_t minsize = ULLONG_MAX;
92  uint64_t maxsize = 0;
93  for (int i = 0; i < (int) input_data.size(); i++) {
94  totalsize += input_data[i]->number_of_spectra();
95  // verify if all chunks are the same. The chunk size =
96  // numer_of_time_sample_in_chunk * numer_of_channels
97  if (minsize > input_data[i]->data_size()){
98  minsize = input_data[i]->data_size();
99  }
100  if (maxsize < input_data[i]->data_size()) {
101  maxsize = input_data[i]->data_size();
102  }
103  }
104 
105  //check if the configured number of channels matches with the one inside data
106  //NOTE: we assume all elements of input_data have the same channels number
107  _nchannels = input_data[0]->number_of_channels();
108  if(_nchannels < fldo::cuda::min_frequencies || _nchannels > fldo::cuda::max_frequencies) {
109  PANDA_LOG_ERROR << "number of channels outside range for this algorithm ("
110  << min_frequencies << ", " << max_frequencies << "): " << _nchannels;
111  return data::Ocld::make_shared();
112  }
113 
114  //calculate the total number of samples
115  uint64_t nsamples = totalsize * _nchannels;
116 
117  // we want the data number to be a multiple also of the fldo::max_prebin value
118  if ((nsamples % (_nchannels * _nsubints * fldo::cuda::max_prebin)) != 0) {
119  nsamples = (uint64_t)((nsamples/(((float)_nsubints *_nchannels) * fldo::cuda::max_prebin)))
120  * (_nsubints*_nchannels * fldo::cuda::max_prebin);
121  if (nsamples < 1) {
122  PANDA_LOG_ERROR << "Not enough data to process at max rebin value" << fldo::cuda::max_prebin;
123  return data::Ocld::make_shared();
124  }
125  }
126  _nsamp = nsamples;
127 
128  // if maxsize == minsize we have the correct input data structure.
129  // if also input_data.size() == _nsubints all work is done.
130  if ((maxsize == minsize) && (input_data.size() == _nsubints) && (_nsamp/(_nsubints) == minsize)) {
131  PANDA_LOG_DEBUG << "Fldo: input data already formatted (" << totalsize << " measures)";
132  // dumb (and fast) copy
133  data = input_data;
134  } else {
135  // we build the correctly formatted data
136  PANDA_LOG_DEBUG << "Fldo: input data not correctly formatted (" << totalsize << " measures)";
137 
138  const size_t measures = _nsamp / _nsubints / _nchannels; // number of time-samples in a sub-integration
139  PANDA_LOG_DEBUG << "measures: " << measures;
140  // here we are sure all components of input_data have the same channels number
141  const size_t channels = _nchannels;
142  int input_ind = 0; // index inside input_data[]
143  int input_insideind = 0; // index inside input_data[]-><data>
144  size_t destination_start = 0; // index inside data[]->data()
145  for (int i = 0; i < (int) _nsubints; i++) {
146  // we build data[i] containing a full sub integration of
147  // length measures
148  data.emplace_back(new TimeFrequencyType(data::DimensionSize<data::Time>(measures), data::DimensionSize<data::Frequency>(channels)));
149  // start_time has to be re-calculated for each sub-int
150  // Wrong : data[i]->start_time(input_data[input_ind]->start_time());
151  // these does work. (input_data[0] is correct as input_data has no time holes).
152  time_t miot = input_data[0]->start_time().to_time_t();
153  miot += (measures*i*(input_data[0]->sample_interval().value()));
154  data[i]->start_time( miot);
155  data[i]->sample_interval(input_data[input_ind]->sample_interval());
156  data[i]->set_channel_frequencies(input_data[input_ind]->channel_frequencies().begin(),
157  input_data[input_ind]->channel_frequencies().end());
158  // we get measures numbers from input_data.
159  // NOTE we use the more efficient std::copy. This assume
160  // the current (1/2017) implementation of TimeData.
161  // A more general approach, using iterators, is much slower.
162  // TODO move this while inside data_reshape routine
163  // if we have enough data in input_data[input_ind] we
164  // copy them.
165  size_t left = measures * channels; // data still to be transferred
166  while (0 < left) {
167  size_t data_size = (input_data[input_ind]->end() - (input_data[input_ind]->begin() + input_insideind ));
168  if (left >= data_size) {
169  //PANDA_LOG_DEBUG << "(1) subint: " << i << " left: " << left << " data_size: " << data_size
170  // << " input_ind: " << input_ind << " input_insideind: " << input_insideind;
171  // we have enough data inside input_data[input_ind]
172  std::copy((input_data[input_ind]->begin() + input_insideind),
173  (input_data[input_ind]->begin() + input_insideind + data_size),
174  (data[i]->begin() + destination_start));
175  ++input_ind; // we have no more items in input_data[input_ind]
176  input_insideind = 0;
177  if (left == data_size) {
178  destination_start = 0;
179  } else {
180  destination_start += data_size; // update destination index
181  }
182  left -= data_size;
183  //PANDA_LOG_DEBUG << "(1): destination_start = " << destination_start;
184  } else {
185  // we have only current_copy data inside input_data[input_ind]
186  //PANDA_LOG_DEBUG << "(2) subint: " << i << " left: " << left << " data_size: " << data_size
187  // << " input_ind: " << input_ind << " input_insideind: " << input_insideind;
188  std::copy ( (input_data[input_ind]->begin() + input_insideind),
189  (input_data[input_ind]->begin() + input_insideind + left),
190  (data[i]->begin() + destination_start));
191  destination_start = 0; // update destination index
192  input_insideind += left; // we have no more items in input_data[input_ind]
193  left = 0;
194  }
195  }
196  }
197  }
198  }
199 
200  // compute total data sample size along the time axis
201  uint64_t totalsize = 0;
202  for (int i = 0; i < (int) data.size(); ++i) {
203  totalsize += data[i]->number_of_spectra();
204  }
205  PANDA_LOG << "Fldo: Total number of time samples " << totalsize
206  << " in " << data.size() << " chunks"
207  << " each with " << data[0]->number_of_channels()
208  << " frequency channels ";
209 
210  return run_folding(gpu, data, input_candidates);
211 }
212 
232 template<typename NumericalT>
233 void FldoCuda<NumericalT>::rebin_candidates(std::vector<util::CandidateRebin> &rebin, TimeType const tsamp, data::Scl& scl_data)
234 {
235  PANDA_LOG_DEBUG << "Call rebin_candidates";
236  //sort the candidates on period basis using lambda function (ascending order)
237  //std::sort (scl_data.begin(), scl_data.end(), [](CandidateType first, CandidateType second) {
238  // return (first.period() < second.period());
239  // }
240  // );
241  std::sort (scl_data.begin(), scl_data.end(), [](CandidateType first, CandidateType second) -> bool {
242  if (first.period() < second.period()) {
243  return true;
244  } else if (first.period() > second.period()) {
245  return false;
246  }
247  if (first.dm() < second.dm()) {
248  return true;
249  } else if (first.dm() > second.dm()) {
250  return false;
251  }
252  if (first.pdot() < second.pdot()) {
253  return true;
254  } else if (first.pdot() > second.pdot()) {
255  return false;
256  }
257  return false;
258  }
259  );
260 
261  // print candidates after sorting
262  for (data::VectorLike<std::vector<data::Scl::CandidateType>>::ConstIterator it = scl_data.begin() ;
263  it != scl_data.end(); ++it) {
264  PANDA_LOG_DEBUG << "Sorted candidate "
265  << (*it).ident()
266  << " period "
267  << (*it).period()
268  << " pdot "
269  << (*it).pdot()
270  << " dm "
271  << (*it).dm() ;
272  }
273  float rebin_period[fldo::cuda::max_nrebin];
274  //To calculate the max spin period, we use the configured _mfold_phases value.
275  //Generally its default value corresponds to 128 because it represents the better compromise
276  //between performance and period time.
277  //In particular for the max period evaluation we use _mfold_phases - 1 (or _mfold_phases -2) because
278  //the folding kernel uses for no-split case (a) e split case (b):
279  //(a) the last one phase (128 if _mfold_phases =1 128) to store the phase 0
280  //(b) the last two phases (127 and 128 if _mfold_phases = 128) to store the phases 1 and 0
281  //In this way the gpu kernel does not use the modulo operator (see FoldInputDataKernel.cu)
282 
283  int max_num_of_phases = _mfold_phases - 2;
284  // We take into account the nosplit option. In this
285  // case we have (_mfold_phases - 1) !!
286  if (_enable_split == false) {
287  max_num_of_phases = _mfold_phases - 1;
288  }
289  float max_period = max_num_of_phases * tsamp.value() * fldo::cuda::max_prebin * 0.5;
290 
291  // implement the rebinning limit on period depending on the
292  // sampling time
293  for (int index = 0; index < (int) fldo::cuda::max_nrebin - 1; index ++) {
294  rebin_period[index] = max_num_of_phases * (1 << index) * tsamp.value() * 0.5;
295  PANDA_LOG_DEBUG <<"period[ " << index << "]: " << rebin_period[index];
296  }
297  rebin_period[fldo::cuda::max_nrebin - 1] = max_period;
298  int cand_idx = 0; // the index inside the sorted list of candidates
299  // get the pointer to the candidate list beginning
300  data::VectorLike<std::vector<data::Scl::CandidateType>>::ConstIterator it = scl_data.begin() ;
301 
302  // initialize the rebin array
303  //loop on rebin values to identify the candidates whose period falls
304  //into current rebin time interval
305  //A rebin entry is valid if the element first and last are != -1
306  for (int rebin_idx = 0; rebin_idx < (int) fldo::cuda::max_nrebin; rebin_idx ++) {
307  //initialize the rebin value for each element of the vector
308  rebin[rebin_idx].rebin = 1 << rebin_idx;
309  //loop on candidates list to divide them into the different groups
310  //based on the rebin value. Each rebin group is identified by a
311  //maximum period. All the candidates with period less than this
312  //value, belong to the same rebin group.
313  while (it != scl_data.end()) {
314  //convert the candidate period in sec.
315  CandidateType::TimeType const cand_period ((*it).period());
316  // get the numerical candidate period in sec.
317  // OSS: we don't rely on CandidateType::TimeType unit. If it changes, the next line
318  // always give us the value of period in sec.
319  float const cand_period_value = boost::units::quantity_cast<float>(static_cast<boost::units::quantity<boost::units::si::time>>(cand_period));
320  if (cand_period_value >= rebin_period[rebin_idx]) {
321  break;
322  }
323  if (rebin[rebin_idx].first > -1) {
324  rebin[rebin_idx].last ++;
325  } else {
326  rebin[rebin_idx].rebin = 1 << rebin_idx;
327  rebin[rebin_idx].first = cand_idx;
328  rebin[rebin_idx].last = cand_idx;
329  }
330  //point to the next candidate
331  ++it;
332  //increment the candidate counter
333  ++cand_idx;
334  }
335  }
336  //only for check purpose.
337  for (int index = 0; index < (int) fldo::cuda::max_nrebin; index ++) {
338  PANDA_LOG_DEBUG << "rebin: " << rebin[index].rebin
339  << " first: " << rebin[index].first
340  << " last: " << rebin[index].last;
341  }
342 }
343 
359 template<typename NumericalT>
360 void FldoCuda<NumericalT>::load_constant_devmem(std::vector<util::CandidateRebin> &rebin, std::vector<double> delta_freq,
361  data::Scl const &scl_data, std::vector <int> &cand_phases)
362 {
363  //get the number of pulsar candidates
364  size_t const ncandidates = scl_data.size();
365  std::vector <float> dm(ncandidates); // vector with candidates dispersion measure
366  std::vector <double> period(ncandidates); // vector with candidates period
367  std::vector <double> pdot(ncandidates); // vector with candidates period derivative
368  std::vector <double> nu(ncandidates); // vector with candidates frequency
369  std::vector <double> nudot(ncandidates); // vector with the candidates frequency variation
370 
371  try {
372  size_t const nchan_per_subband = _nchannels/_nsubbands;
373  // we follow Chris advice
374  int index = 0; // index in candidate proprerties vector
375  //load candidates values inside the corresponding vectors
376  for (data::VectorLike<std::vector<data::Scl::CandidateType>>::ConstIterator it = scl_data.begin();
377  it != scl_data.end(); ++it) {
378  // get the candidate period numerical value in sec.
379  // OSS: we don't rely on candidate period unit. If it changes, the next line
380  // always give us the value of period in sec.
381  double const cand_period_value = boost::units::quantity_cast<double>(static_cast<boost::units::quantity<boost::units::si::time>>((*it).period()));
382  double const cand_pdot = ((*it).pdot()).value();
383  period[index] = cand_period_value;
384  pdot[index] = cand_pdot;
385  dm[index] = ((*it).dm()).value();
386  nudot[index] = -cand_pdot/(cand_period_value * cand_period_value);
387  nu[index] = 1./cand_period_value;
388  ++index; // we follow Chris advice
389  }
390  PANDA_LOG_DEBUG << "Arrays of candidates values completed";
391  //update candidates values for those of them that have a number of
392  //phases < 32. In this case the number of phase bins of the candidate is
393  //proportionally increased to get a number of final phase bins > 32.
394  for (int i = 0; i < (int) fldo::cuda::max_nrebin; ++i) {
395  if ((rebin[i].rebin == 1 << i) && (rebin[i].first > -1)) {
396  //initialize the phases array with the rebinned number of phases
397  //for each candidate
398  double period_val = 0.;
399  for (int ncand = rebin[i].first; ncand <= rebin[i].last; ncand ++) {
400  //store the period into a temporary variable
401  period_val = period[ncand];
402  cand_phases[ncand] = 2 * ((int) (period[ncand] / (_tsamp * rebin[i].rebin)));
403  PANDA_LOG_DEBUG << "load_constant_devmem: cand_phases["
404  << ncand
405  << "]: "
406  << cand_phases[ncand];
407  // only if phase split is not enabled, we set a max limit
408  // on the number of natural phases
409  if ((cand_phases[ncand] > (int)_mfold_phases - 1) && (_enable_split == false)) {
410  cand_phases[ncand] = (int)_mfold_phases - 1;
411  }
412  PANDA_LOG_DEBUG << " Candidate: " << ncand
413  << " original cand_phases: " << cand_phases[ncand]
414  << " period: " << period[ncand]
415  << " dm: " << dm[ncand]
416  << " pdot: " << pdot[ncand]
417  << " rebin: " << rebin[i].rebin;
418  nudot[ncand] = -pdot[ncand]/(period_val * period_val);
419  nu[ncand] = 1./period_val;
420  }
421  }
422  }
423  util::load_constant_data(delta_freq.data(), nu.data(), nudot.data(), dm.data(),
424  cand_phases.data(), _nchannels, nchan_per_subband, _nsubints, _tsamp, ncandidates);
425  }
426  catch (std::runtime_error &e)
427  {
428  PANDA_LOG_ERROR << "Caught an exception of an unexpected type in load_constant_mem(): "
429  << e.what();
430  throw e;
431  }
432  catch (...)
433  {
434  throw panda::Error("Catch unknown exception in load_constant_mem()");
435  PANDA_LOG_ERROR << "Catch unknown exception in load_constant_mem()";
436  }
437 }
438 
458 template<typename NumericalT>
459 void FldoCuda<NumericalT>::allocate_global_devmem(size_t ncandidates, std::vector<util::CandidateRebin> &rebin,
460  NumericalRep **d_in, NumericalRep ** h_in, float **d_outfprof,
461  float ** d_outprof, float ** d_folded, float ** d_weight)
462 {
463  try
464  {
465  // evaluate the pitch_dim for each rebin value
466  uint64_t nsamp_per_subint = _nsamp/_nsubints; // number of samples in a sub-integration
467  int nrow = nsamp_per_subint / _nchannels; // number of samples/sub-integration/channel
468  size_t pitch_size = 0; // optimal GPU dimension for memory allocation
469 
471  nrow = nsamp_per_subint / _nchannels;
472  // call cudaMallocPitch() to meet the alignment requirements for coalescing as the
473  // address is updated from row to row.
474  CUDA_ERROR_CHECK(cudaMallocPitch((void **)&(rebin[0].d_out), &pitch_size, nrow * sizeof(float), _nchannels));
475  // calculate the total dimension in bytes of the input data matrix
476  rebin[0].msize = pitch_size * _nchannels;
477  CUDA_ERROR_CHECK(cudaMemset((void*)rebin[0].d_out, 0, rebin[0].msize));
478  // get the number of row of the matrix
479  rebin[0].pitch_dim = pitch_size/sizeof(float);
480 
481  //loop on each valid rebinning value to allocate GPU memory for each rebinned matrix of input data
482  //OSS: skip the first because already allocated!!
483  for (std::vector<util::CandidateRebin>::iterator it = rebin.begin() + 1; it != rebin.end(); ++it) {
484  if ((*it).first > -1) {
485  nrow = nsamp_per_subint / (_nchannels * (*it).rebin);
486  CUDA_ERROR_CHECK(cudaMallocPitch((void **)&(*it).d_out, &pitch_size, nrow * sizeof(float), _nchannels));
487  // calculate the total dimension in bytes of the rebinned matrix
488  (*it).msize = pitch_size * _nchannels;
489  CUDA_ERROR_CHECK(cudaMemset((void*)(*it).d_out, 0, (*it).msize));
490  // get the number of row of the matrix
491  (*it).pitch_dim = pitch_size/sizeof(float);
492  PANDA_LOG_DEBUG << "Allocated " << (*it).msize
493  << " bytes for input rebinned data. "
494  << "rebin value: "
495  << (*it).rebin
496  << " original row size: "
497  << nrow
498  << " final row size: "
499  << (*it).pitch_dim;
500  }
501  }
502 
503  // map the host memory into the CUDA address space
504  uint64_t mem_size = 0;
505  //get the dimension in bytes of the host memory to allocate
506  mem_size = rebin[0].pitch_dim * _nchannels * sizeof(char);
507  PANDA_LOG_DEBUG << "host memory size to map: "
508  << mem_size
509  << " (nrow: "
510  << rebin[0].pitch_dim
511  << " x ncols: "
512  << _nchannels
513  << " x "
514  << sizeof(char)
515  << " bytes)";
516 
517  // set flag to allocate pinned host memory that is accessible to the device.
518  //NB: to call cudaHostAlloc() with cudaHostAllocMapped, the flag cudaDeviceMapHost
519  //has to be set (this is done in Fldo()).
520  CUDA_ERROR_CHECK(cudaHostAlloc((void **) h_in, mem_size, cudaHostAllocMapped));
521  // the device pointer to the memory may be obtained by calling cudaHostGetDevicePointer()
522  CUDA_ERROR_CHECK(cudaHostGetDevicePointer((void **)d_in, (void *)*h_in, 0));
523  PANDA_LOG_DEBUG << "host memory map success";
524 
525  // allocate global device memory to store profiles of candidates
526  //
527  // OSS: we use max number of phases (or bins) because the number of
528  // phases is different for each candidate. This value is calculated using the candidate
529  // period and the sampling time. We then allocate the max memory even if it's not
530  // used.
531  size_t profile_len = _mfold_phases * _nsubints * _nsubbands * sizeof(float);
532  mem_size = (uint64_t)profile_len * ncandidates;
533 
534  //CUDA_ERROR_CHECK(cudaMalloc((void **)&d_folded, mem_size ));
535  CUDA_ERROR_CHECK(cudaMalloc((void **)d_folded, mem_size ));
536  CUDA_ERROR_CHECK(cudaMemset((void*)*d_folded, 0, mem_size));
537  PANDA_LOG_DEBUG << "allocate_global_devmem: allocated "
538  << mem_size
539  << " bytes to store sub-integration folded data";
540  CUDA_ERROR_CHECK(cudaMalloc((void **)d_weight, mem_size ));
541  CUDA_ERROR_CHECK(cudaMemset((void*)*d_weight, 0, mem_size));
542  PANDA_LOG_DEBUG <<"allocate_global_devmem: allocated device memory for weights!";
543 
544  //dimension in bytes of the device memory to the profiles
545  //scrunched in freq (d_outfprof) for all candidates
546  mem_size = ncandidates * _nsubints * _mfold_phases * sizeof(float);
547  //allocate the device memory an reset its value
548  CUDA_ERROR_CHECK(cudaMalloc((void **)d_outfprof, mem_size ));
549  CUDA_ERROR_CHECK(cudaMemset((void*)*d_outfprof, 0, mem_size));
550 
551  //calculate dimension in bytes of the device memory to store the profiles
552  //scrunched in freq and time (d_outprof) for all candidates
553  mem_size = ncandidates * _mfold_phases * sizeof(float);
554  //allocate the device memory an reset its value
555  CUDA_ERROR_CHECK(cudaMalloc((void **)d_outprof, mem_size ));
556  CUDA_ERROR_CHECK(cudaMemset((void*)*d_outprof, 0, mem_size));
557  PANDA_LOG_DEBUG << "allocate_global_devmem: allocated " << mem_size
558  << " bytes to store out profiles data";
559  PANDA_LOG_DEBUG << "global memory allocation success";
560 
561  }
562  catch (std::runtime_error &e)
563  {
564  PANDA_LOG_ERROR << "Caught an exception of an unexpected type in allocate_global_devmem(): "
565  << e.what();
566  throw e;
567  }
568  catch (...)
569  {
570  throw panda::Error("Caught unknown exception in allocate_global_devmem()");
571  PANDA_LOG_ERROR << "Caught unknown exception in allocate_global_devmem()";
572  }
573 }
574 
590 template<typename NumericalT>
591 void FldoCuda<NumericalT>::free_dev_memory(NumericalRep *h_in, float *d_folded, float *d_weight, float *d_outprof, float * d_outfprof)
592 {
593  if (h_in) {
594  CUDA_ERROR_CHECK(cudaFreeHost(h_in));
595  }
596  if (d_folded) {
597  CUDA_ERROR_CHECK(cudaFree(d_folded));
598  }
599  if (d_weight) {
600  CUDA_ERROR_CHECK(cudaFree(d_weight));
601  }
602  if (d_outprof) {
603  CUDA_ERROR_CHECK(cudaFree(d_outprof));
604  }
605  if (d_outfprof) {
606  CUDA_ERROR_CHECK(cudaFree(d_outfprof));
607  }
608 }
609 
626 template<typename NumericalT>
627 void FldoCuda<NumericalT>::folding(ResourceType& gpu, std::vector<util::CandidateRebin> &rebin, NumericalRep *d_in,
628  float *d_folded, float *d_weight, int *cand_phases, int isubint, float* data_mean,
629  float *data_sigma, float& total_gpu_time, std::vector<util::GpuStream>& exec_stream)
630 {
631  float gpu_time = 0.0f; // gpu execution time
632  int first_bin_idx = 0; // index of the first valid rebinning element
633  try
634  {
635  // create cuda timer events
636  util::GpuEvent gpu_timer = util::GpuEvent();
637  //TODO: control if we really need device synchronization!
638  CUDA_ERROR_CHECK(cudaDeviceSynchronize());
639  // start the kernel timer
640  gpu_timer.gpu_event_start();
641  first_bin_idx = util::first_valid_binidx(rebin);
642  //number of time sample/sub-integration
643  uint64_t nsamp_subslot = _nsamp/(_nsubints * _nchannels);
644  PANDA_LOG_DEBUG << "folding " << nsamp_subslot << " time samples"
645  << " in isubint " << isubint;
646  //corner turn the inout data to have time samples on fast-asix
647  util::corner_turner_and_rebin(gpu.device_properties(), first_bin_idx, nsamp_subslot, _nchannels, rebin, d_in);
648  // calculate the data statistics
649  {
650  float mean0 = 0; //mean of sub-integration data
651  float sigma0= 0; //sigma of sub-integration data
652 
653  //allocate a separate kernel stream to execute statistics
654  util::GpuStream sstream = util::GpuStream();
655 
656  //execute statistics when the corner turner of the input matrix has ended. Check for
657  //errors.
658  CUDA_ERROR_CHECK(cudaStreamWaitEvent(sstream.stream(), rebin[first_bin_idx].event, 0));
659 
660  //OSS: sigma0 and mean0 are the statistics of the matrix padded with 0,
661  //so these values are to be fixed
662  util::statistics_float(rebin[first_bin_idx].d_out, _nchannels * rebin[first_bin_idx].pitch_dim,
663  &mean0, &sigma0, sstream.stream());
664  PANDA_LOG_DEBUG << "mean0: "
665  << mean0
666  << " sigma0: "
667  << sigma0;
668 
669  // The allocated d_out array is memory aligned and it's padded with 0. We need to rescale
670  // the mean value to take into account the difference between the real
671  // number of data per subslot (nsamp_subslot/prebin) and the padded one (pitch_dim)
672  float meanv = mean0 * rebin[first_bin_idx].pitch_dim * rebin[first_bin_idx].rebin/nsamp_subslot;
673  *data_mean += meanv;
674 
675  // true dimension of the data for statistix: nsamp_subslot/rebin[first_bin_idx].rebin;
676  // dimension used to calculate statistix: rebin[first_bin_idx].pitch_dim;
677  float frac = ((float)rebin[first_bin_idx].pitch_dim*rebin[first_bin_idx].rebin)/nsamp_subslot;
678 
679  // calculate the variance of the true data
680  float variance_r = frac * sigma0 * sigma0 - mean0*mean0 *(frac -1) -(meanv - mean0)*(meanv-mean0);
681  //PANDA_LOG << "mean0: "
682  PANDA_LOG_DEBUG << "mean0: "
683  << mean0
684  << " sigma0: "
685  << sigma0
686  << " meanv: "
687  << meanv
688  << " variance_r: "
689  << variance_r;
690  *data_sigma += variance_r;
691  // stop the kernel timer
692  // !! here all stream are synchronized!!
693  gpu_timer.gpu_event_stop();
694  gpu_time = gpu_timer.gpu_elapsed_time();
695  total_gpu_time += gpu_time;
696  PANDA_LOG_DEBUG << "GPU matrix statistics takes "
697  << gpu_time
698  << " msec -- "
699  << "mean0: "
700  << mean0
701  << " sigma0: "
702  << sigma0;
703  }
704 
705  //go ahead for folding...
706  int valid_nbinning = first_bin_idx;
707  PANDA_LOG_DEBUG << " valid_nbinning: "
708  << valid_nbinning;
709  //loop on all valid binning values
710  // OSS: first we load all the rebin kernels and then folding one
711  for (int nbinning = valid_nbinning; nbinning < (int) fldo::cuda::max_nrebin ; nbinning ++) {
712  if (rebin[nbinning].first == -1) {
713  // check if some higher binning values has no valid entry (i.e. candidates available)
714  continue;
715  }
716  // for all the rebinning values but the first, we need to rebin the input matrix
717  if (nbinning > valid_nbinning) {
718  CUDA_ERROR_CHECK(cudaStreamWaitEvent(rebin[nbinning].stream, rebin[valid_nbinning].event, 0));
719  util::rebin_input_data(valid_nbinning, nbinning, _nchannels, rebin);
720  valid_nbinning = nbinning;
721  }
722  }
723  valid_nbinning = first_bin_idx;
724  for (int nbinning = valid_nbinning; nbinning < (int) fldo::cuda::max_nrebin ; nbinning ++) {
725  if (rebin[nbinning].first == -1) {
726  // check if some higher binning values has no valid entry (i.e. candidates available)
727  continue;
728  }
729  //executes folding for each candidate belonging to the same rebin value
730  for (int ncand = rebin[nbinning].first; ncand <= rebin[nbinning].last; ncand ++) {
731  // OSS: the kernels for each candidate are assigned to different streams
732  int nstream = ncand % exec_stream.size();
733  // for all the rebinning we need to wait the end of input data binning
734  CUDA_ERROR_CHECK(cudaStreamWaitEvent(exec_stream[nstream].stream(), rebin[nbinning].event, 0));
735  //call kernel to do folding
736  PANDA_LOG_DEBUG << "folding kernel for candidate "
737  << ncand
738  << " sub-integration: "
739  << isubint
740  << " nstream: "
741  << nstream
742  << " rebin: "
743  << rebin[nbinning].rebin
744  << " nphases: "
745  << cand_phases[ncand];
746 
747  // get the number of samps in each sub-integration
748  uint64_t nsamp_subslot = _nsamp/(_nsubints * _nchannels * rebin[nbinning].rebin);
749  util::fold_input_data(gpu.device_properties(), d_folded, d_weight, cand_phases,
750  rebin[nbinning], ncand, isubint, _nchannels, _nsubbands, nsamp_subslot,
751  _mfold_phases, _tobs, _enable_split, exec_stream[nstream].stream());
752  }
753  }
754  //check for kernel errors
755  CUDA_ERROR_CHECK(cudaGetLastError())
756  gpu_timer.gpu_event_stop();
757  gpu_time = gpu_timer.gpu_elapsed_time();
758  total_gpu_time += gpu_time;
759  //PANDA_LOG << "folding_worker takes "
760  PANDA_LOG_DEBUG << "folding_worker takes "
761  << gpu_time
762  << " msec";
763  }
764  catch (std::runtime_error &e)
765  {
766  PANDA_LOG_ERROR << "Caught an exception of an unexpected type in folding(): " << e.what();
767  throw e;
768  }
769  catch (...)
770  {
771  PANDA_LOG_ERROR << "Caught an unknown exception in folding()";
772  throw panda::Error("Caught an unknown exception in folding()");
773  }
774 }
775 
793 template<typename NumericalT>
794 void FldoCuda<NumericalT>::profiles(size_t ncandidates,float mean, float *d_folded, float *d_weight, float *d_outfprof,
795  float* d_outprof, float& total_gpu_time, std::vector<util::GpuStream> &exec_stream)
796 {
797  try
798  {
799  //allocate timer enabled events to get the GPU execution time
800  util::GpuEvent timer = util::GpuEvent();
801 
802  float gpu_time = 0.0f; //GPU time execution
803 
804  timer.gpu_event_start();
805  util::build_scrunched_profiles(ncandidates, _mfold_phases, _nsubbands, _nsubints, mean, d_folded,
806  d_weight, d_outfprof, d_outprof, exec_stream);
807  timer.gpu_event_stop();
808  //get elapsed gup time for normalization + profile
809  gpu_time = timer.gpu_elapsed_time();
810  PANDA_LOG_DEBUG << "normalization + profile takes " << gpu_time << " ms";
811  //update the total gpu execution time
812  total_gpu_time += gpu_time;
813  }
814  catch (std::runtime_error &e)
815  {
816  PANDA_LOG_ERROR << "Caught an exception of an unexpected type in profiles(): " << e.what();
817  throw e;
818  }
819  catch (...)
820  {
821  PANDA_LOG_ERROR << "Caught an unknown exception in profiles()";
822  throw panda::Error("Caught an unknown exception in profiles()");
823  }
824 }
825 
839 template<typename NumericalT>
840 std::shared_ptr<data::Ocld> FldoCuda<NumericalT>::run_folding(ResourceType& gpu,
841  std::vector<std::shared_ptr<TimeFrequencyType>> const& data,
842  data::Scl input_candidates)
843 {
844 
845  std::shared_ptr<data::Ocld> output = data::Ocld::make_shared();
846  // get the sampling data rate from data
847  TimeType const tsamp(data[0]->sample_interval());
848  // store the numerical value of sampling time in seconds.
849  // OSS: we don't rely on TimeType unit. If it changes, the next line always give us the value of
850  // sampling time in sec.
851  _tsamp = boost::units::quantity_cast<double>(static_cast<boost::units::quantity<boost::units::si::time>>(tsamp));
852  _tobs = (_nsamp/_nchannels) * _tsamp;
853  //get the number of candidates
854  size_t const ncandidates = input_candidates.size();
855 
856  //declare the vectors used to store data into device constatnt memory
857  std::vector <double> freq_sum(_nsubbands); // vector with the sub-band mean frequency
858  std::vector <double> time_sum(_nsubints); // vector with the sub-integration mean time
859  std::vector <double> delta_freq(_nchannels); // tabulated k_dm * 1/(v0^2 - v^2) values for all channels
860  std::vector <int> cand_phases(ncandidates); // vector with the number of phases of each candidate
861 
862  // get the array with all the frequencies
863  // NB: here we suppose that all elements of the data vector contain the
864  // same number of channels and channel frequencies.
865  // No assumption on frequency offset!
866  // We take into account only the first one array of frequencies.
867  std::vector<FrequencyType> chan_freqs = data[0]->channel_frequencies();
868  double freq_chan_1 = chan_freqs[0].value() * chan_freqs[0].value();
869  for (size_t ich = 0; ich < chan_freqs.size() ; ich ++) {
870  double freq_chan_2 = chan_freqs[ich].value() * chan_freqs[ich].value();
871  delta_freq[ich] = (fldo::k_dm * (1./freq_chan_1 - 1./freq_chan_2));
872  }
873 
874  size_t nchan_per_subband = _nchannels/_nsubbands;
875  for (size_t isband = 0; isband < _nsubbands; isband ++) {
876  for (size_t ich = 0; ich < nchan_per_subband; ich++) {
877  size_t chan_id = ich + nchan_per_subband * isband;
878  freq_sum[isband] += chan_freqs[chan_id].value();
879  }
880  freq_sum[isband]/= nchan_per_subband;
881  }
882  const size_t ts_sample = _nsamp / _nsubints / _nchannels; //time samples per sub-integration
883  for (size_t isubint = 0; isubint < _nsubints; isubint ++) {
884  time_sum[isubint] = (-0.5 * _tobs + _tsamp * isubint * ts_sample +_tsamp * (ts_sample - 1) * 0.5);
885  }
886  NumericalRep *h_in = NULL; // host memory page-locked and accessible to the device
887  NumericalRep *d_in = NULL; // device pointer to mapped memory
888  float *d_folded = NULL; // pointer to device memory with folded data
889  float *d_weight = NULL; // pointer to device memory with phases weights
890  float *d_outfprof = NULL; // pointer to device memory with profiles scrunched in freq
891  float *d_outprof = NULL; // pointer to device memory with profiles scrunched in freq and time
892  float mean = 0.; // mean of the input data
893  float sigma = 0.; // sigma of the input data
894  float total_gpu_time = 0.; // total gpu time execution
895  try {
896  std::vector <util::CandidateRebin> rebin(fldo::cuda::max_nrebin);
897  //build the vector of CandidateRebin structure. Each structure
898  //contains info about candidates with the same re-binning factor.
899  rebin_candidates(rebin, tsamp, input_candidates);
900  //set the device id before allocating device memory
901  cudaSetDevice(gpu.device_id());
902  //load the constant memory
903  load_constant_devmem(rebin, delta_freq, input_candidates, cand_phases);
904  //allocate device global memory to store input and output data
905  allocate_global_devmem(ncandidates, rebin, &d_in, &h_in, &d_outfprof, &d_outprof, &d_folded, &d_weight);
906  //create 16 streams to handle kernel concurrency during candidate
907  //processing.
908  //TODO: check the number of max stream
909  //std::vector<util::GpuStream> exec_stream(ncandidates);
910  std::vector<util::GpuStream> exec_stream(32);
911 
912  // calculate the number of time samples per sub-integration
913  int nsamp_per_subint = _nsamp/_nsubints;
914  PANDA_LOG_DEBUG << "numer of samples/subint: " << nsamp_per_subint;
915  // execute the folding on data belonging to the same sub-integration
916  // TODO: overlap data transfer and kernel execution
917  for (int isubint = 0 ; isubint < (int) _nsubints ; isubint ++) {
918  PANDA_LOG_DEBUG << "Folding sub-integration " << isubint;
919  //copy input data sub-integration in GPU memory
920  std::copy(data[isubint]->begin(), data[isubint]->end() , h_in);
921  folding(gpu, rebin, d_in, d_folded, d_weight, cand_phases.data(), isubint, &mean,
922  &sigma, total_gpu_time, exec_stream);
923  }
924 
925 #ifdef NOT_USED
926  size_t memsize = _mfold_phases * _nsubints * _nsubbands * ncandidates;
928 
929  float *h_folded = nullptr;
930  CUDA_ERROR_CHECK(cudaHostAlloc((void **) &h_folded, memsize*sizeof(float), cudaHostAllocDefault));
931  memset(h_folded, 0, memsize * sizeof(float));
932  // copy the folded gpu data to RAM
933  CUDA_ERROR_CHECK(cudaMemcpy(h_folded, d_folded, memsize * sizeof(float), cudaMemcpyDeviceToHost));
934  for (size_t ncand = 0; ncand < ncandidates; ncand ++) {
935  std::fstream file_stream;
936  std::stringstream ss;
937  ss << "folded_" << ncand << ".out";
938  //file_stream.open(panda::test::test_file(ss.str()), std::ios::out);
939  file_stream.open(ss.str(), std::ios::out);
940  for (int isubint = 0; isubint < (int)_nsubints; isubint ++) {
941  file_stream << "nsubint: " << isubint << std::endl;
942  for (int isub = 0; isub < (int)_nsubbands; isub ++) {
943  file_stream << "nsubband: " << isub << std::endl;
944  for (int nbin = 0; nbin < (int)_mfold_phases; nbin ++) {
945  if (nbin < cand_phases[ncand]) {
946  file_stream << nbin <<" " << h_folded[nbin + isub * _mfold_phases +
947  isubint * _nsubbands * _mfold_phases +
948  ncand * _nsubints * _nsubbands *_mfold_phases] << std::endl;
949  }
950  }
951  }
952  }
953  }
954 #endif
955 
956 
957  // build the reduced profile
958  profiles(ncandidates, mean, d_folded, d_weight, d_outfprof, d_outprof, total_gpu_time, exec_stream);
959  PANDA_LOG << "Total GPU time of execution (ms): " << total_gpu_time;
960 
961 #ifdef NOT_USED
962  // allocate host memory to copy profile (only for test analysis)
964  memsize = _mfold_phases * ncandidates;
965  float *h_outprof = nullptr;
966  CUDA_ERROR_CHECK(cudaHostAlloc((void **) &h_outprof, memsize*sizeof(float), cudaHostAllocDefault));
967  memset(h_outprof, 0, memsize * sizeof(float));
968  CUDA_ERROR_CHECK(cudaMemcpy(h_outprof, d_outprof, memsize * sizeof(float), cudaMemcpyDeviceToHost));
969  for (size_t ncand = 0; ncand < ncandidates; ncand ++) {
970  std::fstream file_stream;
971  std::stringstream ss;
972  ss << "reduced_profile_" << ncand << ".out";
973  //file_stream.open(panda::test::test_file(ss.str()), std::ios::out);
974  file_stream.open(ss.str(), std::ios::out);
975  for (int nbin = 0; nbin < (int)_mfold_phases; nbin ++) {
976  if (nbin < cand_phases[ncand]) {
977  file_stream << nbin <<" " << h_outprof[nbin + ncand * _mfold_phases] << std::endl;
978  }
979  }
980  file_stream.close();
981  }
982 #endif
983  // optimization algoritm: it shall return the output list of candidates
984  // copy original order candidates
985  data::Scl opt_candidates = input_candidates;
986  run_optimization(input_candidates, opt_candidates, d_folded, d_weight, d_outfprof,
987  rebin, time_sum, freq_sum, total_gpu_time, exec_stream, 10, 1, 1, 1, 10, 1, sigma, 0);
988  run_optimization(input_candidates, opt_candidates, d_folded, d_weight, d_outfprof,
989  rebin, time_sum, freq_sum, total_gpu_time, exec_stream, 1, 1, 10, 1, 10, 3, sigma, 0);
990  run_optimization(input_candidates, opt_candidates, d_folded, d_weight, d_outfprof,
991  rebin, time_sum, freq_sum, total_gpu_time, exec_stream, 10, 3, 10, 3, 1, 1, sigma, 1);
992  // free the allocate device memory
993  free_dev_memory(h_in, d_folded, d_weight, d_outprof, d_outfprof);
994  }
995  catch (std::runtime_error &e)
996  {
997  free_dev_memory(h_in, d_folded, d_weight, d_outprof, d_outfprof);
998  PANDA_LOG_ERROR << "Caught an exception of an unexpected type in run_folding(): "
999  << e.what();
1000  throw e;
1001  }
1002  catch(...)
1003  {
1004  free_dev_memory(h_in, d_folded, d_weight, d_outprof, d_outfprof);
1005  PANDA_LOG_ERROR << "Caught an unknown exception in run_folding()";
1006  throw panda::Error("Caught an unknown exception in run_folding()");
1007  }
1008  return output;
1009 }
1010 
1042 template<typename NumericalT>
1043 void FldoCuda<NumericalT>::run_optimization(data::Scl const& ori_data, data::Scl & opt_data, float *d_folded,
1044  float* d_perturbed, float *d_outfprof, std::vector<util::CandidateRebin> &rebin,
1045  std::vector<double> const & time_sum, std::vector <double> const & freq_sum,
1046  float& total_gpu_time, std::vector<util::GpuStream>& exec_stream, int p_trial, int p_rf,
1047  int pdot_trial, int pdot_rf, int dm_trial, int dm_rf, float sigma, int verbose)
1048 {
1049  dim3 threadsPerBlock; // number of threads for each block
1050  dim3 blocksPerGrid; // number of blocks for each grid;
1051  util::GpuEvent gpu_timer = util::GpuEvent(); // create cuda timer events
1052 
1053  int nperiod = 0, npdot = 0, ndm = 0;
1054  nperiod = p_trial * 2 - 1; //number of period trials
1055  npdot = pdot_trial * 2 - 1; //number of pdot trials
1056  ndm = dm_trial * 2 - 1; //number of dm trials
1057  // calculate number of trials for period, pdot and dm
1058  int trials_num = nperiod * npdot * ndm; //total numer of trials for each candidate
1059  //allocate a vector to store the phase shift for each candidate and trial
1060  size_t const ncandidates = ori_data.size();
1061  size_t ph_size = ncandidates * _nsubbands * _nsubints;
1062  std::vector<float>phase_shift(ph_size);
1063  try {
1064  // start the kernel timer
1065  gpu_timer.gpu_event_start();
1066  //allocate memory to store results
1067  size_t opt_prof_size = trials_num * ncandidates * _mfold_phases;
1068  // pointer to device memory with optimized profiles scrunched in freq
1069  panda::nvidia::CudaDevicePointer<float>d_opt_profile(opt_prof_size);
1070  CUDA_ERROR_CHECK(cudaMemset(d_opt_profile(), 0, opt_prof_size * sizeof(float)));
1071  CUDA_ERROR_CHECK(cudaDeviceSynchronize());
1072  std::vector<float>trial_param(1);
1073  util::perturb_candidates(ori_data, opt_data, d_folded, d_perturbed, d_outfprof,
1074  d_opt_profile(), exec_stream, time_sum,
1075  freq_sum, trial_param, _nchannels, _nsubints, _nsubbands, _mfold_phases,
1076  _tobs, p_trial, p_rf, pdot_trial, pdot_rf, dm_trial, dm_rf);
1077  double _sigma = sqrt(sigma / _nsubints); //standard deviation of the population of raw time series
1078  int first_prebin_idx = first_valid_binidx(rebin); // find the first valid prebin index
1079  int first_prebin = rebin[first_prebin_idx].rebin; // find the first valide prebin value
1080  // Note: if the input data used to calculate the statistics have been
1081  // rebinned, we have to take into account the square root of the first
1082  // rebin value.
1083  // to get the correct sigma value, we need to scale the calculated
1084  // sigma with the first prebin value. The first prebin value defines the input
1085  // data array dimension on which the statistics is evaluated
1086  _sigma *= sqrt((float)first_prebin / _nsamp);
1087  // allocate memory to store results (sn_max and its index) for alla candidates
1088  std::vector<float> sn_max(ncandidates);
1089  std::vector<float> pulse_width(ncandidates);
1090  std::vector<int>index_max(ncandidates);
1091  // allocate memory to copy back the optimized values of 3-tuple (period, pdot, dm) for each candidate
1092  util::compute_max_sn(ncandidates, trials_num, _mfold_phases, _sigma, d_opt_profile(), sn_max, pulse_width, index_max, exec_stream);
1093  gpu_timer.gpu_event_stop();
1094  //get elapsed gup time for optimization + normalization + profile
1095  total_gpu_time += gpu_timer.gpu_elapsed_time();
1096  PANDA_LOG_DEBUG << "perturbation + S/N takes " << gpu_timer.gpu_elapsed_time() << " ms";
1097  for (size_t ncand = 0; ncand < ncandidates; ncand ++) {
1098  //get the index of optimized tuple inside the trial_param vector
1099  int idx = index_max[ncand] * 3 + ncand * trials_num * 3;
1100  data::Scl::CandidateType::MsecTimeType period_val(trial_param[idx] * boost::units::si::seconds);
1101  data::Scl::CandidateType::SecPerSecType pdot_val = trial_param[idx + 1];
1102  data::Scl::CandidateType::Dm dm_val = trial_param[idx + 2] * data::parsecs_per_cube_cm;
1103  opt_data[ncand].period(period_val);
1104  opt_data[ncand].pdot(pdot_val);
1105  opt_data[ncand].dm(dm_val);
1106  // we publish data only on last iteration
1107  if (verbose < 1) {
1108  PANDA_LOG_DEBUG << "Candidate: "<< ncand << " (" << opt_data[ncand].ident() << ")"
1109  << " index: " << index_max[ncand]
1110  << " S/N: " << sn_max[ncand]
1111  << " width: " << pulse_width[ncand]
1112  << " period: " << trial_param[idx]
1113  << " pdot: " << trial_param[idx + 1]
1114  << " dm: " << trial_param[idx + 2];
1115  } else {
1116  PANDA_LOG << "Candidate: "<< ncand << " (" << opt_data[ncand].ident() << ")"
1117  << " index: " << index_max[ncand]
1118  << " S/N: " << sn_max[ncand]
1119  << " width: " << pulse_width[ncand]
1120  << " period: " << trial_param[idx]
1121  << " pdot: " << trial_param[idx + 1]
1122  << " dm: " << trial_param[idx + 2];
1123  }
1124  }
1125  }
1126  catch (std::runtime_error &e)
1127  {
1128  PANDA_LOG_ERROR << "Caught an exception of an unexpected type in run_optimization(): "
1129  << e.what();
1130  throw e;
1131  }
1132  catch (...)
1133  {
1134  throw panda::Error("Catch unknown exception in run_optimization()");
1135  PANDA_LOG_ERROR << "Catch unknown exception in run_optimization()";
1136  }
1137 
1138 }
1139 
1152 template<typename NumericalT>
1153 void FldoCuda<NumericalT>::data_reshape(std::vector<std::shared_ptr<TimeFrequencyType>> const& input_data,
1154  std::vector<std::shared_ptr<TimeFrequencyType>> const& data)
1155 {
1156  (void) input_data;
1157  (void) data;
1158  // TODO move here the real work
1159  //data = input_data;
1160 }
1161 
1162 template class FldoCuda<uint8_t>;
1163 //template class FldoCuda<uint16_t>;
1164 
1165 #endif //ENABLE_CUDA
1166 
1167 } // namespace cuda
1168 } // namespace fldo
1169 } // namespace cheetah
1170 } // namespace ska
Some limits and constants for FLDO.
Definition: Brdz.h:35