25 #include "cheetah/fldo/cuda/detail/FldoUtils.h" 26 #include "cheetah/fldo/cuda/detail/Optimization.h" 59 void phase_shift_calc(
int ncand,
int nsubbands,
int nsubints, std::vector<float> &phase_shift,
double period,
60 float delta_dm,
float delta_nu,
float delta_nudot,
61 const double* time_sum,
const double* freq_sum,
int index)
65 for (
int iband = 0; iband < nsubbands; iband ++) {
66 double time_shift = delta_dm * (1./(freq_sum[0] * freq_sum[0]) - 1./(freq_sum[iband] * freq_sum[iband])) * 1./period;
67 for (
int isub = 0; isub <nsubints; isub ++) {
68 phshift = time_shift - (delta_nu * time_sum[isub] + time_sum[isub] * time_sum[isub] * delta_nudot * 0.5);
70 phshift += (-1.*floor(phshift));
72 phase_shift[iband + isub * nsubbands + ncand * nsubbands * nsubints] = (float)phshift;
113 void perturb_candidates(data::Scl
const& ori_data, data::Scl &opt_data,
114 float *d_folded,
float* d_perturbed,
float *d_outfprof,
115 float *d_opt_profile, std::vector<util::GpuStream>& exec_stream,
116 std::vector<double>
const& time_sum, std::vector <double>
const& freq_sum,
117 std::vector <float> &trial_param,
int nchannel,
118 int nsubints,
int nsubbands,
int default_max_phases,
double tobs,
int p_trial,
int p_rf,
119 int pdot_trial,
int pdot_rf,
int dm_trial,
int dm_rf)
122 int nperiod = 0, npdot = 0, ndm = 0;
125 float delta_nudot = 0.;
126 float period_range= 0.000202e-3;
127 float period_step = 0.00002e-3;
128 float pdot_range = 0.0202e-6;
129 float pdot_step = 0.02e-7;
130 float dm_range = 1.0;
131 float dm_step = 0.10;
132 float trial_period = 0., trial_pdot = 0., trial_dm = 0.;
133 dim3 threadsPerBlock;
138 nperiod = p_trial * 2 - 1;
139 npdot = pdot_trial * 2 - 1;
140 ndm = dm_trial * 2 - 1;
143 size_t const ncandidates = ori_data.size();
144 int trials_num = nperiod * npdot * ndm;
145 size_t ph_size = ncandidates * nsubbands * nsubints;
146 std::vector<float>phase_shift(ph_size);
150 trial_param.resize(3 * trials_num * ncandidates);
156 panda::nvidia::CudaDevicePointer<float> d_phase_shift(ph_size);
158 CUDA_ERROR_CHECK(cudaMemset((
void*)d_phase_shift(), 0, ph_size*
sizeof(
float)));
161 for (
int ip = 0; ip < nperiod; ip++) {
162 PANDA_LOG_DEBUG <<
" iteration " << ip <<
"\\" << nperiod;
163 for (
int ipdot = 0; ipdot < npdot; ipdot++) {
164 for (
int idm = 0; idm < ndm; idm++) {
165 for (
size_t ncand = 0; ncand < ncandidates ; ncand ++) {
166 double const folded_cand_period =
167 boost::units::quantity_cast<
double>
168 (
static_cast<boost::units::quantity<boost::units::si::time>
>((ori_data[ncand].period())));
169 double opt_cand_period =
170 boost::units::quantity_cast<
double>
171 (
static_cast<boost::units::quantity<boost::units::si::time>
>((opt_data[ncand].period())));
172 period_step = folded_cand_period/tobs/5000.;
178 period_range = p_trial * period_step;
179 double const folded_cand_pdot = (ori_data[ncand].pdot()).value();
180 double opt_cand_pdot = (opt_data[ncand].pdot()).value();
182 pdot_step = folded_cand_period/tobs/tobs/1000 * 2;
184 pdot_step /= pdot_rf;
188 pdot_range = pdot_trial * pdot_step;
190 float const folded_cand_dm = (ori_data[ncand].dm()).value();
191 float opt_cand_dm = (opt_data[ncand].dm()).value();
192 dm_step = folded_cand_period * 90;
194 dm_step = 80. * folded_cand_period * (folded_cand_period/1000.) ;
195 dm_step = ((dm_step > .5) ? .5: dm_step);
198 dm_step = ((dm_step < .001) ? .001: dm_step);
203 dm_range = dm_trial * dm_step;
208 PANDA_LOG_DEBUG <<
" ranges " << period_range << pdot_range << dm_range;
212 trial_period = (ip - p_trial + 1) * period_step + opt_cand_period;
213 trial_pdot = (ipdot - pdot_trial + 1) * pdot_step + opt_cand_pdot;
214 trial_dm = (idm - dm_trial + 1) * dm_step + opt_cand_dm;
215 delta_dm = (folded_cand_dm - trial_dm) * fldo::k_dm;
216 delta_nu = (1.0/trial_period) - 1./folded_cand_period;
217 delta_nudot = -trial_pdot /(trial_period * trial_period) +
218 folded_cand_pdot/(folded_cand_period*folded_cand_period);
219 phase_shift_calc(ncand, nsubbands, nsubints, phase_shift, opt_cand_period, delta_dm, delta_nu,
220 delta_nudot, time_sum.data(), freq_sum.data() ,index);
221 int shift = ncand * nsubbands * nsubints;
222 int nstream = ncand % exec_stream.size();
223 CUDA_ERROR_CHECK(cudaMemcpyAsync(d_phase_shift() + shift, &phase_shift[shift],
224 nsubbands * nsubints*
sizeof(
float), cudaMemcpyHostToDevice,
225 exec_stream[nstream].stream()));
228 threadsPerBlock.x = 32;
229 threadsPerBlock.y = 4;
231 blocksPerGrid.x = nsubbands/threadsPerBlock.y;
232 blocksPerGrid.y = nsubints;
233 size_t shared_memsize = default_max_phases *
sizeof(float) *threadsPerBlock.y;
235 perturbate_kernel<<< blocksPerGrid, threadsPerBlock, shared_memsize,
236 exec_stream[nstream].stream() >>> (d_folded,
242 CUDA_ERROR_CHECK(cudaGetLastError());
245 threadsPerBlock.x = default_max_phases;
246 threadsPerBlock.y = 1;
247 blocksPerGrid.x = nsubints;
249 profile_kernel<<< blocksPerGrid, threadsPerBlock, 0,
250 exec_stream[nstream].stream() >>> (d_perturbed,
254 CUDA_ERROR_CHECK(cudaGetLastError());
255 threadsPerBlock.x = default_max_phases;
256 threadsPerBlock.y = 1;
260 best_profile_kernel<<< blocksPerGrid, threadsPerBlock, 0,
261 exec_stream[nstream].stream()>>>(d_outfprof,
267 CUDA_ERROR_CHECK(cudaGetLastError());
278 trial_param[index * 3 + ncand * trials_num * 3] = trial_period;
279 trial_param[index * 3 + ncand * trials_num * 3 + 1] = trial_pdot;
280 trial_param[index * 3 + ncand * trials_num * 3 + 2] = trial_dm;
282 CUDA_ERROR_CHECK(cudaDeviceSynchronize());
288 catch (std::runtime_error &e)
290 PANDA_LOG_ERROR <<
"Caught an exception of an unexpected type in perturb_candidates(): " 296 PANDA_LOG_ERROR <<
"Catch unknown exception in perturb_candidates()";
297 throw panda::Error(
"Catch unknown exception in perturb_candidates()");
323 void compute_max_sn(
size_t const ncandidates,
int trials_num,
int default_max_phases,
float sigma,
float 324 *d_opt_profile, std::vector<float> &sn_max, std::vector<float> &pulse_width, std::vector<int> &index_max,
325 std::vector<util::GpuStream>& exec_stream)
329 size_t opt_value_size = 2 * ncandidates *trials_num;
331 panda::nvidia::CudaDevicePointer<float>d_opt_value(opt_value_size);
332 CUDA_ERROR_CHECK(cudaMemset(d_opt_value(), 0, opt_value_size *
sizeof(
float)));
335 if (trials_num > nthread_x) {
336 nblock_y = trials_num/nthread_x + 1;
338 dim3 threadsPerBlock;
340 threadsPerBlock.x = nthread_x;
341 threadsPerBlock.y = 1;
342 blocksPerGrid.x = nblock_y;
344 int shared_memsize = nthread_x * 2 *
sizeof(float);
348 for (
size_t ncand = 0; ncand < ncandidates; ncand ++) {
349 int nstream = ncand % exec_stream.size();
350 compute_sn<<<blocksPerGrid, threadsPerBlock, shared_memsize, exec_stream[nstream].stream()
357 CUDA_ERROR_CHECK(cudaGetLastError());
376 for (
int ncand = 0; ncand < ncandidates; ncand ++) {
379 int start = ncand * trials_num * 2 ;
380 util::find_max(d_opt_value(), start, trials_num, sn_max[ncand], pulse_width[ncand], index_max[ncand]);
381 PANDA_LOG_DEBUG <<
"Candidate: " << ncand
382 <<
" sn_max: " << sn_max[ncand]
383 <<
" index: " << index_max[ncand];
386 catch (std::runtime_error &e)
388 PANDA_LOG_ERROR <<
"Caught an exception of an unexpected type in compute_max_sn(): " 394 PANDA_LOG_ERROR <<
"Catch unknown exception in compute_max_sn()";
395 throw panda::Error(
"Catch unknown exception in compute_max_sn()");
408 int first_valid_binidx(std::vector<util::CandidateRebin> &rebin)
411 for (std::vector<util::CandidateRebin>::iterator it = rebin.begin(); it != rebin.end(); ++it) {
412 if ((*it).first > -1) {
Some limits and constants for FLDO.