Cheetah - SKA - PSS - Prototype Time Domain Search Pipeline
Optimization.cu
1 
2 /*
3  * The MIT License (MIT)
4  *
5  * Copyright (c) 2016 The SKA organisation
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in all
15  * copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  * SOFTWARE.
24  */
25 #include "cheetah/fldo/cuda/detail/FldoUtils.h"
26 #include "cheetah/fldo/cuda/detail/Optimization.h"
27 #include <vector>
28 
29 
30 namespace ska {
31 namespace cheetah {
32 namespace fldo {
33 namespace cuda {
34 namespace util {
35 
36 /*
37  *
38  * void phase_shift_calc(int index, float* phase_shift, double *nu, float delta_dm, float delta_nu,
39  * float delta_nudot, double *time_sum, double
40  * *freq_sum)
41  */
59 void phase_shift_calc(int ncand, int nsubbands, int nsubints, std::vector<float> &phase_shift, double period,
60  float delta_dm, float delta_nu, float delta_nudot,
61  const double* time_sum, const double* freq_sum, int index)
62 {
63  double phshift = 0.;
64 
65  for (int iband = 0; iband < nsubbands; iband ++) {
66  double time_shift = delta_dm * (1./(freq_sum[0] * freq_sum[0]) - 1./(freq_sum[iband] * freq_sum[iband])) * 1./period;
67  for (int isub = 0; isub <nsubints; isub ++) {
68  phshift = time_shift - (delta_nu * time_sum[isub] + time_sum[isub] * time_sum[isub] * delta_nudot * 0.5);
69  if (phshift < 0) {
70  phshift += (-1.*floor(phshift));
71  }
72  phase_shift[iband + isub * nsubbands + ncand * nsubbands * nsubints] = (float)phshift;
73 
74  }
75  }
76 }
77 /*
78  *
79  * void perturb_candidates(data::Scl const& ori_data, data::Scl &opt_data,
80  * float *d_folded, float* d_perturbed, float *d_outfprof,
81  * float *d_opt_profile, std::vector<util::GpuStream>& exec_stream,
82  * const std::vector<double> const& time_sum, std::vector <double> const& freq_sum,
83  * std::vector <float> &trial_param, int nchannel,
84  * int nsubints, int nsubbands, int default_max_phases, double tobs, int p_trial, int p_rf,
85  * int pdot_trial, int pdot_rf, int dm_trial, int dm_rf)
86  *
87  * @brief Build the perturbed profile of all candidates applying small perturbations to the
88  * pulsar period, dispersion and pdot. For each candidate produce the
89  * reduced profile.
90  *
91  * @param ori_data the list with pulsar candidates original values
92  * @param opt_data the list with pulsar candidates optimized values
93  * @d_folded the array in GPU global mem with the folded data for each cabdidate
94  * @d_perturbed the array in GPU global with the perturbed
95  * profiles (one for each subband, sub integration and candidate)
96  * @d_outfprof the array in GPU global mem with the perturbed profiles in freq
97  * @d_optfprof the array in GPU global mem with the final reduced profile used
98  * during the final optimization phase.
99  * @exec_stream A vector with the GPU streams where the kernels are
100  * loaded and executedd for the different candidates.
101  * @param time_sum the array with sub-integration central time
102  * @param freq_sum the array with sub-band central freq.
103  * @trial_param vector with all the 3-tuples of pulsar parameters
104  * @param p_trial number of for period optimization
105  * @param p_rf reduce factor of the period step wiidth
106  * @param pdot_trial number of steps in pdot optimization
107  * @param pdot_rf reduce factor of the pdot step width
108  * @param dm_trial number of steps in dm optimization
109  * @param dm_rf reduce factor of the dm step width
110  *
111  * @return On failure throws a runtime_error exception.
112  */
113 void perturb_candidates(data::Scl const& ori_data, data::Scl &opt_data,
114  float *d_folded, float* d_perturbed, float *d_outfprof,
115  float *d_opt_profile, std::vector<util::GpuStream>& exec_stream,
116  std::vector<double> const& time_sum, std::vector <double> const& freq_sum,
117  std::vector <float> &trial_param, int nchannel,
118  int nsubints, int nsubbands, int default_max_phases, double tobs, int p_trial, int p_rf,
119  int pdot_trial, int pdot_rf, int dm_trial, int dm_rf)
120 
121 {
122  int nperiod = 0, npdot = 0, ndm = 0;
123  float delta_dm = 0.; // dm perturbation value
124  float delta_nu = 0.; // nu perturbation value
125  float delta_nudot = 0.; // nudot perturbation value
126  float period_range= 0.000202e-3; // period range and step for optimization phase
127  float period_step = 0.00002e-3; // period step
128  float pdot_range = 0.0202e-6; // pdot range and step for optimization phase
129  float pdot_step = 0.02e-7; // pdot step
130  float dm_range = 1.0; // dm range and step for optimization phase
131  float dm_step = 0.10; // dm step
132  float trial_period = 0., trial_pdot = 0., trial_dm = 0.;
133  dim3 threadsPerBlock; //number of threads for each block
134  dim3 blocksPerGrid;
135 
136 
137  // calculate number of trials for period, pdot and dm
138  nperiod = p_trial * 2 - 1;
139  npdot = pdot_trial * 2 - 1;
140  ndm = dm_trial * 2 - 1;
141 
142  //allocate a vector to store the phase shift for each candidate and trial
143  size_t const ncandidates = ori_data.size();
144  int trials_num = nperiod * npdot * ndm; //total numer of trials for each candidate
145  size_t ph_size = ncandidates * nsubbands * nsubints;
146  std::vector<float>phase_shift(ph_size);
147  //Note: the size of the vector is = (trials_num * 3) because for each
148  //trial we register the trial period, the trial dm and trial pdot.
149  //std::vector<float>trial_param(3 * trials_num * ncandidates);
150  trial_param.resize(3 * trials_num * ncandidates);
151  int index = 0;
152 
153  // pointer to device memory to store phase shifts in optimization phase
154  // d_phase_shft is used only by this routine, so we allocate it here.
155  // It is automatically released at routine exiting.
156  panda::nvidia::CudaDevicePointer<float> d_phase_shift(ph_size);
157  //Oss: device memory allocated with cudaMalloc is not cleared
158  CUDA_ERROR_CHECK(cudaMemset((void*)d_phase_shift(), 0, ph_size* sizeof(float)));
159 
160  try {
161  for (int ip = 0; ip < nperiod; ip++) {
162  PANDA_LOG_DEBUG << " iteration " << ip << "\\" << nperiod;
163  for (int ipdot = 0; ipdot < npdot; ipdot++) {
164  for (int idm = 0; idm < ndm; idm++) {
165  for (size_t ncand = 0; ncand < ncandidates ; ncand ++) {
166  double const folded_cand_period =
167  boost::units::quantity_cast<double>
168  (static_cast<boost::units::quantity<boost::units::si::time>>((ori_data[ncand].period())));
169  double opt_cand_period =
170  boost::units::quantity_cast<double>
171  (static_cast<boost::units::quantity<boost::units::si::time>>((opt_data[ncand].period())));
172  period_step = folded_cand_period/tobs/5000.;
173  if (p_rf > 1) {
174  period_step /= p_rf;
175  }
176  // to mantain the same number of optimization points, we update the period_range
177  // (not used hereafter)
178  period_range = p_trial * period_step;
179  double const folded_cand_pdot = (ori_data[ncand].pdot()).value();
180  double opt_cand_pdot = (opt_data[ncand].pdot()).value();
181 
182  pdot_step = folded_cand_period/tobs/tobs/1000 * 2;
183  if (pdot_rf > 1) {
184  pdot_step /= pdot_rf;
185  }
186  // to mantain the same number of optimization points, we update the pdot_range
187  // (not used hereafter)
188  pdot_range = pdot_trial * pdot_step;
189 
190  float const folded_cand_dm = (ori_data[ncand].dm()).value();
191  float opt_cand_dm = (opt_data[ncand].dm()).value();
192  dm_step = folded_cand_period * 90;
193  // we report limits from Story AT4-18
194  dm_step = 80. * folded_cand_period * (folded_cand_period/1000.) ;
195  dm_step = ((dm_step > .5) ? .5: dm_step);
196  if (dm_step > .5)
197  dm_step = 0.5;
198  dm_step = ((dm_step < .001) ? .001: dm_step);
199  if (dm_step < .001)
200  dm_step = .001;
201  // to mantain the same number of optimization points, we update the dm_range
202  // (not used hereafter)
203  dm_range = dm_trial * dm_step;
204  if (dm_rf > 1) {
205  dm_step /= dm_rf;
206  }
207  // w
208  PANDA_LOG_DEBUG << " ranges " << period_range << pdot_range << dm_range;
209 
210  // the index into the p/dm and p/pdot planes. note that we use result->nperiod not n_p
211  // because of the factor of 2 above.
212  trial_period = (ip - p_trial + 1) * period_step + opt_cand_period;
213  trial_pdot = (ipdot - pdot_trial + 1) * pdot_step + opt_cand_pdot;
214  trial_dm = (idm - dm_trial + 1) * dm_step + opt_cand_dm;
215  delta_dm = (folded_cand_dm - trial_dm) * fldo::k_dm;
216  delta_nu = (1.0/trial_period) - 1./folded_cand_period;
217  delta_nudot = -trial_pdot /(trial_period * trial_period) +
218  folded_cand_pdot/(folded_cand_period*folded_cand_period);
219  phase_shift_calc(ncand, nsubbands, nsubints, phase_shift, opt_cand_period, delta_dm, delta_nu,
220  delta_nudot, time_sum.data(), freq_sum.data() ,index);
221  int shift = ncand * nsubbands * nsubints;
222  int nstream = ncand % exec_stream.size();
223  CUDA_ERROR_CHECK(cudaMemcpyAsync(d_phase_shift() + shift, &phase_shift[shift],
224  nsubbands * nsubints* sizeof(float), cudaMemcpyHostToDevice,
225  exec_stream[nstream].stream()));
226  //CUDA_ERROR_CHECK(cudaGetLastError());
227  //configure the perturbed kernel parameters
228  threadsPerBlock.x = 32; // threadIdx.x -> phase bin
229  threadsPerBlock.y = 4; // threadIdx.y -> subband
230  //small speed up but results screwed up
231  blocksPerGrid.x = nsubbands/threadsPerBlock.y; // one block per subband
232  blocksPerGrid.y = nsubints; // one block per subints
233  size_t shared_memsize = default_max_phases * sizeof(float) *threadsPerBlock.y;
234  // rotate the profiles to account for the small change in p/pdot/dm.
235  perturbate_kernel<<< blocksPerGrid, threadsPerBlock, shared_memsize,
236  exec_stream[nstream].stream() >>> (d_folded,
237  d_perturbed,
238  d_phase_shift(),
239  ncand,
240  default_max_phases,
241  index);
242  CUDA_ERROR_CHECK(cudaGetLastError());
243 
244  //reduce kernel to build the profile in freq.
245  threadsPerBlock.x = default_max_phases;
246  threadsPerBlock.y = 1;
247  blocksPerGrid.x = nsubints;
248  blocksPerGrid.y = 1;
249  profile_kernel<<< blocksPerGrid, threadsPerBlock, 0,
250  exec_stream[nstream].stream() >>> (d_perturbed,
251  d_outfprof,
252  ncand,
253  nsubbands);
254  CUDA_ERROR_CHECK(cudaGetLastError());
255  threadsPerBlock.x = default_max_phases;
256  threadsPerBlock.y = 1;
257  blocksPerGrid.x = 1; // not better if fold.nsubints;
258  blocksPerGrid.y = 1;
259  //reduce kernel to build the final profile
260  best_profile_kernel<<< blocksPerGrid, threadsPerBlock, 0,
261  exec_stream[nstream].stream()>>>(d_outfprof,
262  d_opt_profile,
263  ncand,
264  nsubints,
265  index,
266  trials_num);
267  CUDA_ERROR_CHECK(cudaGetLastError());
268  //store the trial values into the allocated array
269  //
270  // arrangment of data in trial_param array
271  // --------------------------------------------------------------------------------
272  // |period0| pdot0| dm0 | period0 | pdot0 | dm1 | .........|periodN | pdotM | dmR |
273  // -------------------------------------------------------------------------------
274  // index 0 | index 1 | .........| index N*M*R |
275  // <-------------------------------cand 0----------------------------------------->
276  // This is repeated for each candidate, so the final array contains al the trial parameters for all
277  // the candidates
278  trial_param[index * 3 + ncand * trials_num * 3] = trial_period;
279  trial_param[index * 3 + ncand * trials_num * 3 + 1] = trial_pdot;
280  trial_param[index * 3 + ncand * trials_num * 3 + 2] = trial_dm;
281  } //end loop on candidates
282  CUDA_ERROR_CHECK(cudaDeviceSynchronize());
283  index ++;
284  } // end loop on dm perturbations
285  } // end loop on pdot perturbations
286  } // end loop on period perturbations
287  }
288  catch (std::runtime_error &e)
289  {
290  PANDA_LOG_ERROR << "Caught an exception of an unexpected type in perturb_candidates(): "
291  << e.what();
292  throw e;
293  }
294  catch (...)
295  {
296  PANDA_LOG_ERROR << "Catch unknown exception in perturb_candidates()";
297  throw panda::Error("Catch unknown exception in perturb_candidates()");
298  }
299 }
300 
301 
302 /*
303  *
304  * void compute_max_sn(size_t const ncandidates, int trials_num, int default_max_phases, float sigma, float
305  * *d_opt_profile, std::vector<float> &sn_max, std::vector<float> &pulse_width, std::vector<int> &index_max,
306  * std::vector<util::GpuStream>& exec_stream)
307  * @brief Evaluates the best S/N value for each pulsar candidate.
308  *
309  * @param ncandidates the total number of pulsar candidates
310  * @trials_num the number of trials done by the optimization procedure
311  * @default_max_phases the max number of phase bins
312  * @sigma the standard deviation value of the processed data
313  * @d_opt_profile the array (in GPU memory) with the optimized
314  * schrunched profiles (in freq and time) of all the candidates
315  * @sn_max A vectory with the best S/N values for each candidate
316  * @pulse_width A vectory with the pulse width (corresponding to the * best S/N value) for each candidate
317  * @index_max A vectory with the index of the 3-tupla of trial params (period, pdot and dm) that for
318  * each candidate correspnds the best S/N value
319  * @exec_stream A vector with the GPU streams where the kernels are
320  * loaded and executedd for the different candidates.
321  * @return On failure throws a runtime_error exception.
322 */
323 void compute_max_sn(size_t const ncandidates, int trials_num, int default_max_phases, float sigma, float
324  *d_opt_profile, std::vector<float> &sn_max, std::vector<float> &pulse_width, std::vector<int> &index_max,
325  std::vector<util::GpuStream>& exec_stream)
326 {
327  try
328  {
329  size_t opt_value_size = 2 * ncandidates *trials_num;
330  // pointer to the optimized values of all candidates
331  panda::nvidia::CudaDevicePointer<float>d_opt_value(opt_value_size);
332  CUDA_ERROR_CHECK(cudaMemset(d_opt_value(), 0, opt_value_size * sizeof(float)));
333  int nblock_y = 1;
334  int nthread_x = 32;
335  if (trials_num > nthread_x) {
336  nblock_y = trials_num/nthread_x + 1;
337  }
338  dim3 threadsPerBlock; //number of threads for each block
339  dim3 blocksPerGrid;
340  threadsPerBlock.x = nthread_x;
341  threadsPerBlock.y = 1;
342  blocksPerGrid.x = nblock_y;
343  //trials_num * 2 -> for each trial, we store the S/N of the profile and width value
344  int shared_memsize = nthread_x * 2 * sizeof(float);
345  // to get the correct sigma value, we need to scale the calculated
346  // sigma with the first prebin value. The first prebin value defines the input
347  // data array dimension on which the statistics is evaluated
348  for (size_t ncand = 0; ncand < ncandidates; ncand ++) {
349  int nstream = ncand % exec_stream.size();
350  compute_sn<<<blocksPerGrid, threadsPerBlock, shared_memsize, exec_stream[nstream].stream()
351  >>> (d_opt_profile,
352  d_opt_value(),
353  default_max_phases,
354  ncand,
355  trials_num,
356  sigma);
357  CUDA_ERROR_CHECK(cudaGetLastError());
358  // Warning!!! Last value should read sigma*sqrt((float) (nbins[ncand] * first_prebin))
359  // but here we do not have nbins[], so we make computation inside compute_sn
360  }
361 
362  //
363  // arrangment of data stored in d_opt_value (GPU mem) and h_opt_value (RAM)
364  //
365  // -------------------------------------------------------------------|
366  // sn_0 |sn_1 | sn_2| .......|sn_Ntrial| w_0|w_1| w_2|......|w_Ntrial |
367  // ____________________________________|______________________________|
368  // <---- N trial elements -->|<------ N trial elements ->|
369  //<------------------------------ cand 0 ----------------------------->
370  //
371 
372  // d_opt_value and h_opt_value have 2 * N_trial elements for each pulsar candidate
373  // d_opt_value is passed to the thrust routine to find the S/N max value and its corresponding
374  // array index.The positionof the max S/N corresponds to the index of the 3-tuple stored into trial_param array.
375  //
376  for (int ncand = 0; ncand < ncandidates; ncand ++) {
377  // start is the index of the data relative to the ncand candidate
378  // find max returns the position of the max S/N value relative to start
379  int start = ncand * trials_num * 2 ;
380  util::find_max(d_opt_value(), start, trials_num, sn_max[ncand], pulse_width[ncand], index_max[ncand]);
381  PANDA_LOG_DEBUG << "Candidate: " << ncand
382  << " sn_max: " << sn_max[ncand]
383  << " index: " << index_max[ncand];
384  }
385  }
386  catch (std::runtime_error &e)
387  {
388  PANDA_LOG_ERROR << "Caught an exception of an unexpected type in compute_max_sn(): "
389  << e.what();
390  throw e;
391  }
392  catch (...)
393  {
394  PANDA_LOG_ERROR << "Catch unknown exception in compute_max_sn()";
395  throw panda::Error("Catch unknown exception in compute_max_sn()");
396  }
397 
398 }
399 /*
400  * int first_valid_binidx(std::vector<util::CandidateRebin> &rebin)
401  *
402  * @brief utility function to get the first valid rebinning value, i.e. the
403  * first entry in the rebin vector with a number of candidate > 0
404  *
405  * @rebin the input rebin vector
406  * @return the element index
407  */
408 int first_valid_binidx(std::vector<util::CandidateRebin> &rebin)
409 {
410  int index = 0;
411  for (std::vector<util::CandidateRebin>::iterator it = rebin.begin(); it != rebin.end(); ++it) {
412  if ((*it).first > -1) {
413  //get the index of the iterator
414  //return std::distance(it, rebin.begin());
415  return index;
416  }
417  index++;
418  }
419  return -1;
420 }
421 
422 
423 } // utils
424 } // namespace cuda
425 } // namespace fldo
426 } // namespace cheetah
427 } // namespace ska
Some limits and constants for FLDO.
Definition: Brdz.h:35