24 #include "cheetah/fldo/cuda/Fldo.h" 25 #include "cheetah/fldo/cuda/Config.h" 26 #include "cheetah/fldo/cuda/detail/FldoCuda.h" 27 #include "cheetah/fldo/cuda/detail/Optimization.h" 28 #include "cheetah/cuda_utils/cuda_errorhandling.h" 39 template<
typename NumericalT>
40 FldoCuda<NumericalT>::FldoCuda(fldo::Config
const& config)
44 fldo::cuda::Config
const& cuda_config = _config.cuda_algo_config();
47 _mfold_phases = cuda_config.phases();
48 _nsubbands = cuda_config.nsubbands();
49 _nsubints = cuda_config.nsubints();
51 _enable_split = cuda_config.enable_split();
54 template<
typename NumericalT>
55 std::shared_ptr<data::Ocld> FldoCuda<NumericalT>::operator()(ResourceType& gpu,
56 std::vector<std::shared_ptr<TimeFrequencyType>>
const& input_data,
57 data::Scl
const& candidates_list)
61 std::vector<std::shared_ptr<TimeFrequencyType>> data;
62 data::Scl input_candidates = candidates_list;
65 if ((0 == input_data.size()) || (0 == input_data[0]->number_of_spectra() )) {
66 PANDA_LOG_ERROR <<
"Fldo: data buffer size == 0";
67 return data::Ocld::make_shared();
71 PANDA_LOG_DEBUG <<
"Fldo: Total candidates number == " << input_candidates.size();
72 PANDA_LOG <<
"Fldo: Total candidates number == " << input_candidates.size();
75 if (0 == input_candidates.size() ) {
76 PANDA_LOG_ERROR <<
"Fldo: candidate buffer size == 0";
77 return data::Ocld::make_shared();
90 uint64_t totalsize = 0;
91 uint64_t minsize = ULLONG_MAX;
93 for (
int i = 0; i < (int) input_data.size(); i++) {
94 totalsize += input_data[i]->number_of_spectra();
97 if (minsize > input_data[i]->data_size()){
98 minsize = input_data[i]->data_size();
100 if (maxsize < input_data[i]->data_size()) {
101 maxsize = input_data[i]->data_size();
107 _nchannels = input_data[0]->number_of_channels();
108 if(_nchannels < fldo::cuda::min_frequencies || _nchannels > fldo::cuda::max_frequencies) {
109 PANDA_LOG_ERROR <<
"number of channels outside range for this algorithm (" 110 << min_frequencies <<
", " << max_frequencies <<
"): " << _nchannels;
111 return data::Ocld::make_shared();
115 uint64_t nsamples = totalsize * _nchannels;
118 if ((nsamples % (_nchannels * _nsubints * fldo::cuda::max_prebin)) != 0) {
119 nsamples = (uint64_t)((nsamples/(((
float)_nsubints *_nchannels) * fldo::cuda::max_prebin)))
120 * (_nsubints*_nchannels * fldo::cuda::max_prebin);
122 PANDA_LOG_ERROR <<
"Not enough data to process at max rebin value" << fldo::cuda::max_prebin;
123 return data::Ocld::make_shared();
130 if ((maxsize == minsize) && (input_data.size() == _nsubints) && (_nsamp/(_nsubints) == minsize)) {
131 PANDA_LOG_DEBUG <<
"Fldo: input data already formatted (" << totalsize <<
" measures)";
136 PANDA_LOG_DEBUG <<
"Fldo: input data not correctly formatted (" << totalsize <<
" measures)";
138 const size_t measures = _nsamp / _nsubints / _nchannels;
139 PANDA_LOG_DEBUG <<
"measures: " << measures;
141 const size_t channels = _nchannels;
143 int input_insideind = 0;
144 size_t destination_start = 0;
145 for (
int i = 0; i < (int) _nsubints; i++) {
148 data.emplace_back(
new TimeFrequencyType(data::DimensionSize<data::Time>(measures), data::DimensionSize<data::Frequency>(channels)));
152 time_t miot = input_data[0]->start_time().to_time_t();
153 miot += (measures*i*(input_data[0]->sample_interval().value()));
154 data[i]->start_time( miot);
155 data[i]->sample_interval(input_data[input_ind]->sample_interval());
156 data[i]->set_channel_frequencies(input_data[input_ind]->channel_frequencies().begin(),
157 input_data[input_ind]->channel_frequencies().end());
165 size_t left = measures * channels;
167 size_t data_size = (input_data[input_ind]->end() - (input_data[input_ind]->begin() + input_insideind ));
168 if (left >= data_size) {
172 std::copy((input_data[input_ind]->begin() + input_insideind),
173 (input_data[input_ind]->begin() + input_insideind + data_size),
174 (data[i]->begin() + destination_start));
177 if (left == data_size) {
178 destination_start = 0;
180 destination_start += data_size;
188 std::copy ( (input_data[input_ind]->begin() + input_insideind),
189 (input_data[input_ind]->begin() + input_insideind + left),
190 (data[i]->begin() + destination_start));
191 destination_start = 0;
192 input_insideind += left;
201 uint64_t totalsize = 0;
202 for (
int i = 0; i < (int) data.size(); ++i) {
203 totalsize += data[i]->number_of_spectra();
205 PANDA_LOG <<
"Fldo: Total number of time samples " << totalsize
206 <<
" in " << data.size() <<
" chunks" 207 <<
" each with " << data[0]->number_of_channels()
208 <<
" frequency channels ";
210 return run_folding(gpu, data, input_candidates);
232 template<
typename NumericalT>
233 void FldoCuda<NumericalT>::rebin_candidates(std::vector<util::CandidateRebin> &rebin, TimeType
const tsamp, data::Scl& scl_data)
235 PANDA_LOG_DEBUG <<
"Call rebin_candidates";
241 std::sort (scl_data.begin(), scl_data.end(), [](CandidateType first, CandidateType second) ->
bool {
242 if (first.period() < second.period()) {
244 }
else if (first.period() > second.period()) {
247 if (first.dm() < second.dm()) {
249 }
else if (first.dm() > second.dm()) {
252 if (first.pdot() < second.pdot()) {
254 }
else if (first.pdot() > second.pdot()) {
262 for (data::VectorLike<std::vector<data::Scl::CandidateType>>::ConstIterator it = scl_data.begin() ;
263 it != scl_data.end(); ++it) {
264 PANDA_LOG_DEBUG <<
"Sorted candidate " 273 float rebin_period[fldo::cuda::max_nrebin];
283 int max_num_of_phases = _mfold_phases - 2;
286 if (_enable_split ==
false) {
287 max_num_of_phases = _mfold_phases - 1;
289 float max_period = max_num_of_phases * tsamp.value() * fldo::cuda::max_prebin * 0.5;
293 for (
int index = 0; index < (int) fldo::cuda::max_nrebin - 1; index ++) {
294 rebin_period[index] = max_num_of_phases * (1 << index) * tsamp.value() * 0.5;
295 PANDA_LOG_DEBUG <<
"period[ " << index <<
"]: " << rebin_period[index];
297 rebin_period[fldo::cuda::max_nrebin - 1] = max_period;
300 data::VectorLike<std::vector<data::Scl::CandidateType>>::ConstIterator it = scl_data.begin() ;
306 for (
int rebin_idx = 0; rebin_idx < (int) fldo::cuda::max_nrebin; rebin_idx ++) {
308 rebin[rebin_idx].rebin = 1 << rebin_idx;
313 while (it != scl_data.end()) {
315 CandidateType::TimeType
const cand_period ((*it).period());
319 float const cand_period_value = boost::units::quantity_cast<
float>(
static_cast<boost::units::quantity<boost::units::si::time>
>(cand_period));
320 if (cand_period_value >= rebin_period[rebin_idx]) {
323 if (rebin[rebin_idx].first > -1) {
324 rebin[rebin_idx].last ++;
326 rebin[rebin_idx].rebin = 1 << rebin_idx;
327 rebin[rebin_idx].first = cand_idx;
328 rebin[rebin_idx].last = cand_idx;
337 for (
int index = 0; index < (int) fldo::cuda::max_nrebin; index ++) {
338 PANDA_LOG_DEBUG <<
"rebin: " << rebin[index].rebin
339 <<
" first: " << rebin[index].first
340 <<
" last: " << rebin[index].last;
359 template<
typename NumericalT>
360 void FldoCuda<NumericalT>::load_constant_devmem(std::vector<util::CandidateRebin> &rebin, std::vector<double> delta_freq,
361 data::Scl
const &scl_data, std::vector <int> &cand_phases)
364 size_t const ncandidates = scl_data.size();
365 std::vector <float> dm(ncandidates);
366 std::vector <double> period(ncandidates);
367 std::vector <double> pdot(ncandidates);
368 std::vector <double> nu(ncandidates);
369 std::vector <double> nudot(ncandidates);
372 size_t const nchan_per_subband = _nchannels/_nsubbands;
376 for (data::VectorLike<std::vector<data::Scl::CandidateType>>::ConstIterator it = scl_data.begin();
377 it != scl_data.end(); ++it) {
381 double const cand_period_value = boost::units::quantity_cast<
double>(
static_cast<boost::units::quantity<boost::units::si::time>
>((*it).period()));
382 double const cand_pdot = ((*it).pdot()).value();
383 period[index] = cand_period_value;
384 pdot[index] = cand_pdot;
385 dm[index] = ((*it).dm()).value();
386 nudot[index] = -cand_pdot/(cand_period_value * cand_period_value);
387 nu[index] = 1./cand_period_value;
390 PANDA_LOG_DEBUG <<
"Arrays of candidates values completed";
394 for (
int i = 0; i < (int) fldo::cuda::max_nrebin; ++i) {
395 if ((rebin[i].rebin == 1 << i) && (rebin[i].first > -1)) {
398 double period_val = 0.;
399 for (
int ncand = rebin[i].first; ncand <= rebin[i].last; ncand ++) {
401 period_val = period[ncand];
402 cand_phases[ncand] = 2 * ((int) (period[ncand] / (_tsamp * rebin[i].rebin)));
403 PANDA_LOG_DEBUG <<
"load_constant_devmem: cand_phases[" 406 << cand_phases[ncand];
409 if ((cand_phases[ncand] > (
int)_mfold_phases - 1) && (_enable_split ==
false)) {
410 cand_phases[ncand] = (int)_mfold_phases - 1;
412 PANDA_LOG_DEBUG <<
" Candidate: " << ncand
413 <<
" original cand_phases: " << cand_phases[ncand]
414 <<
" period: " << period[ncand]
415 <<
" dm: " << dm[ncand]
416 <<
" pdot: " << pdot[ncand]
417 <<
" rebin: " << rebin[i].rebin;
418 nudot[ncand] = -pdot[ncand]/(period_val * period_val);
419 nu[ncand] = 1./period_val;
423 util::load_constant_data(delta_freq.data(), nu.data(), nudot.data(), dm.data(),
424 cand_phases.data(), _nchannels, nchan_per_subband, _nsubints, _tsamp, ncandidates);
426 catch (std::runtime_error &e)
428 PANDA_LOG_ERROR <<
"Caught an exception of an unexpected type in load_constant_mem(): " 434 throw panda::Error(
"Catch unknown exception in load_constant_mem()");
435 PANDA_LOG_ERROR <<
"Catch unknown exception in load_constant_mem()";
458 template<
typename NumericalT>
459 void FldoCuda<NumericalT>::allocate_global_devmem(
size_t ncandidates, std::vector<util::CandidateRebin> &rebin,
460 NumericalRep **d_in, NumericalRep ** h_in,
float **d_outfprof,
461 float ** d_outprof,
float ** d_folded,
float ** d_weight)
466 uint64_t nsamp_per_subint = _nsamp/_nsubints;
467 int nrow = nsamp_per_subint / _nchannels;
468 size_t pitch_size = 0;
471 nrow = nsamp_per_subint / _nchannels;
474 CUDA_ERROR_CHECK(cudaMallocPitch((
void **)&(rebin[0].d_out), &pitch_size, nrow *
sizeof(
float), _nchannels));
476 rebin[0].msize = pitch_size * _nchannels;
477 CUDA_ERROR_CHECK(cudaMemset((
void*)rebin[0].d_out, 0, rebin[0].msize));
479 rebin[0].pitch_dim = pitch_size/
sizeof(float);
483 for (std::vector<util::CandidateRebin>::iterator it = rebin.begin() + 1; it != rebin.end(); ++it) {
484 if ((*it).first > -1) {
485 nrow = nsamp_per_subint / (_nchannels * (*it).rebin);
486 CUDA_ERROR_CHECK(cudaMallocPitch((
void **)&(*it).d_out, &pitch_size, nrow *
sizeof(
float), _nchannels));
488 (*it).msize = pitch_size * _nchannels;
489 CUDA_ERROR_CHECK(cudaMemset((
void*)(*it).d_out, 0, (*it).msize));
491 (*it).pitch_dim = pitch_size/
sizeof(float);
492 PANDA_LOG_DEBUG <<
"Allocated " << (*it).msize
493 <<
" bytes for input rebinned data. " 496 <<
" original row size: " 498 <<
" final row size: " 504 uint64_t mem_size = 0;
506 mem_size = rebin[0].pitch_dim * _nchannels *
sizeof(char);
507 PANDA_LOG_DEBUG <<
"host memory size to map: " 510 << rebin[0].pitch_dim
520 CUDA_ERROR_CHECK(cudaHostAlloc((
void **) h_in, mem_size, cudaHostAllocMapped));
522 CUDA_ERROR_CHECK(cudaHostGetDevicePointer((
void **)d_in, (
void *)*h_in, 0));
523 PANDA_LOG_DEBUG <<
"host memory map success";
531 size_t profile_len = _mfold_phases * _nsubints * _nsubbands *
sizeof(float);
532 mem_size = (uint64_t)profile_len * ncandidates;
535 CUDA_ERROR_CHECK(cudaMalloc((
void **)d_folded, mem_size ));
536 CUDA_ERROR_CHECK(cudaMemset((
void*)*d_folded, 0, mem_size));
537 PANDA_LOG_DEBUG <<
"allocate_global_devmem: allocated " 539 <<
" bytes to store sub-integration folded data";
540 CUDA_ERROR_CHECK(cudaMalloc((
void **)d_weight, mem_size ));
541 CUDA_ERROR_CHECK(cudaMemset((
void*)*d_weight, 0, mem_size));
542 PANDA_LOG_DEBUG <<
"allocate_global_devmem: allocated device memory for weights!";
546 mem_size = ncandidates * _nsubints * _mfold_phases *
sizeof(float);
548 CUDA_ERROR_CHECK(cudaMalloc((
void **)d_outfprof, mem_size ));
549 CUDA_ERROR_CHECK(cudaMemset((
void*)*d_outfprof, 0, mem_size));
553 mem_size = ncandidates * _mfold_phases *
sizeof(float);
555 CUDA_ERROR_CHECK(cudaMalloc((
void **)d_outprof, mem_size ));
556 CUDA_ERROR_CHECK(cudaMemset((
void*)*d_outprof, 0, mem_size));
557 PANDA_LOG_DEBUG <<
"allocate_global_devmem: allocated " << mem_size
558 <<
" bytes to store out profiles data";
559 PANDA_LOG_DEBUG <<
"global memory allocation success";
562 catch (std::runtime_error &e)
564 PANDA_LOG_ERROR <<
"Caught an exception of an unexpected type in allocate_global_devmem(): " 570 throw panda::Error(
"Caught unknown exception in allocate_global_devmem()");
571 PANDA_LOG_ERROR <<
"Caught unknown exception in allocate_global_devmem()";
590 template<
typename NumericalT>
591 void FldoCuda<NumericalT>::free_dev_memory(NumericalRep *h_in,
float *d_folded,
float *d_weight,
float *d_outprof,
float * d_outfprof)
594 CUDA_ERROR_CHECK(cudaFreeHost(h_in));
597 CUDA_ERROR_CHECK(cudaFree(d_folded));
600 CUDA_ERROR_CHECK(cudaFree(d_weight));
603 CUDA_ERROR_CHECK(cudaFree(d_outprof));
606 CUDA_ERROR_CHECK(cudaFree(d_outfprof));
626 template<
typename NumericalT>
627 void FldoCuda<NumericalT>::folding(ResourceType& gpu, std::vector<util::CandidateRebin> &rebin, NumericalRep *d_in,
628 float *d_folded,
float *d_weight,
int *cand_phases,
int isubint,
float* data_mean,
629 float *data_sigma,
float& total_gpu_time, std::vector<util::GpuStream>& exec_stream)
631 float gpu_time = 0.0f;
632 int first_bin_idx = 0;
636 util::GpuEvent gpu_timer = util::GpuEvent();
638 CUDA_ERROR_CHECK(cudaDeviceSynchronize());
640 gpu_timer.gpu_event_start();
641 first_bin_idx = util::first_valid_binidx(rebin);
643 uint64_t nsamp_subslot = _nsamp/(_nsubints * _nchannels);
644 PANDA_LOG_DEBUG <<
"folding " << nsamp_subslot <<
" time samples" 645 <<
" in isubint " << isubint;
647 util::corner_turner_and_rebin(gpu.device_properties(), first_bin_idx, nsamp_subslot, _nchannels, rebin, d_in);
654 util::GpuStream sstream = util::GpuStream();
658 CUDA_ERROR_CHECK(cudaStreamWaitEvent(sstream.stream(), rebin[first_bin_idx].event, 0));
662 util::statistics_float(rebin[first_bin_idx].d_out, _nchannels * rebin[first_bin_idx].pitch_dim,
663 &mean0, &sigma0, sstream.stream());
664 PANDA_LOG_DEBUG <<
"mean0: " 672 float meanv = mean0 * rebin[first_bin_idx].pitch_dim * rebin[first_bin_idx].rebin/nsamp_subslot;
677 float frac = ((float)rebin[first_bin_idx].pitch_dim*rebin[first_bin_idx].rebin)/nsamp_subslot;
680 float variance_r = frac * sigma0 * sigma0 - mean0*mean0 *(frac -1) -(meanv - mean0)*(meanv-mean0);
682 PANDA_LOG_DEBUG <<
"mean0: " 690 *data_sigma += variance_r;
693 gpu_timer.gpu_event_stop();
694 gpu_time = gpu_timer.gpu_elapsed_time();
695 total_gpu_time += gpu_time;
696 PANDA_LOG_DEBUG <<
"GPU matrix statistics takes " 706 int valid_nbinning = first_bin_idx;
707 PANDA_LOG_DEBUG <<
" valid_nbinning: " 711 for (
int nbinning = valid_nbinning; nbinning < (int) fldo::cuda::max_nrebin ; nbinning ++) {
712 if (rebin[nbinning].first == -1) {
717 if (nbinning > valid_nbinning) {
718 CUDA_ERROR_CHECK(cudaStreamWaitEvent(rebin[nbinning].stream, rebin[valid_nbinning].event, 0));
719 util::rebin_input_data(valid_nbinning, nbinning, _nchannels, rebin);
720 valid_nbinning = nbinning;
723 valid_nbinning = first_bin_idx;
724 for (
int nbinning = valid_nbinning; nbinning < (int) fldo::cuda::max_nrebin ; nbinning ++) {
725 if (rebin[nbinning].first == -1) {
730 for (
int ncand = rebin[nbinning].first; ncand <= rebin[nbinning].last; ncand ++) {
732 int nstream = ncand % exec_stream.size();
734 CUDA_ERROR_CHECK(cudaStreamWaitEvent(exec_stream[nstream].stream(), rebin[nbinning].event, 0));
736 PANDA_LOG_DEBUG <<
"folding kernel for candidate " 738 <<
" sub-integration: " 743 << rebin[nbinning].rebin
745 << cand_phases[ncand];
748 uint64_t nsamp_subslot = _nsamp/(_nsubints * _nchannels * rebin[nbinning].rebin);
749 util::fold_input_data(gpu.device_properties(), d_folded, d_weight, cand_phases,
750 rebin[nbinning], ncand, isubint, _nchannels, _nsubbands, nsamp_subslot,
751 _mfold_phases, _tobs, _enable_split, exec_stream[nstream].stream());
755 CUDA_ERROR_CHECK(cudaGetLastError())
756 gpu_timer.gpu_event_stop();
757 gpu_time = gpu_timer.gpu_elapsed_time();
758 total_gpu_time += gpu_time;
760 PANDA_LOG_DEBUG << "folding_worker takes "
764 catch (
std::runtime_error &e)
766 PANDA_LOG_ERROR <<
"Caught an exception of an unexpected type in folding(): " << e.what();
771 PANDA_LOG_ERROR <<
"Caught an unknown exception in folding()";
772 throw panda::Error(
"Caught an unknown exception in folding()");
793 template<
typename NumericalT>
794 void FldoCuda<NumericalT>::profiles(
size_t ncandidates,
float mean,
float *d_folded,
float *d_weight,
float *d_outfprof,
795 float* d_outprof,
float& total_gpu_time, std::vector<util::GpuStream> &exec_stream)
800 util::GpuEvent timer = util::GpuEvent();
802 float gpu_time = 0.0f;
804 timer.gpu_event_start();
805 util::build_scrunched_profiles(ncandidates, _mfold_phases, _nsubbands, _nsubints, mean, d_folded,
806 d_weight, d_outfprof, d_outprof, exec_stream);
807 timer.gpu_event_stop();
809 gpu_time = timer.gpu_elapsed_time();
810 PANDA_LOG_DEBUG <<
"normalization + profile takes " << gpu_time <<
" ms";
812 total_gpu_time += gpu_time;
814 catch (std::runtime_error &e)
816 PANDA_LOG_ERROR <<
"Caught an exception of an unexpected type in profiles(): " << e.what();
821 PANDA_LOG_ERROR <<
"Caught an unknown exception in profiles()";
822 throw panda::Error(
"Caught an unknown exception in profiles()");
839 template<
typename NumericalT>
840 std::shared_ptr<data::Ocld> FldoCuda<NumericalT>::run_folding(ResourceType& gpu,
841 std::vector<std::shared_ptr<TimeFrequencyType>>
const& data,
842 data::Scl input_candidates)
845 std::shared_ptr<data::Ocld> output = data::Ocld::make_shared();
847 TimeType
const tsamp(data[0]->sample_interval());
851 _tsamp = boost::units::quantity_cast<
double>(
static_cast<boost::units::quantity<boost::units::si::time>
>(tsamp));
852 _tobs = (_nsamp/_nchannels) * _tsamp;
854 size_t const ncandidates = input_candidates.size();
857 std::vector <double> freq_sum(_nsubbands);
858 std::vector <double> time_sum(_nsubints);
859 std::vector <double> delta_freq(_nchannels);
860 std::vector <int> cand_phases(ncandidates);
867 std::vector<FrequencyType> chan_freqs = data[0]->channel_frequencies();
868 double freq_chan_1 = chan_freqs[0].value() * chan_freqs[0].value();
869 for (
size_t ich = 0; ich < chan_freqs.size() ; ich ++) {
870 double freq_chan_2 = chan_freqs[ich].value() * chan_freqs[ich].value();
871 delta_freq[ich] = (fldo::k_dm * (1./freq_chan_1 - 1./freq_chan_2));
874 size_t nchan_per_subband = _nchannels/_nsubbands;
875 for (
size_t isband = 0; isband < _nsubbands; isband ++) {
876 for (
size_t ich = 0; ich < nchan_per_subband; ich++) {
877 size_t chan_id = ich + nchan_per_subband * isband;
878 freq_sum[isband] += chan_freqs[chan_id].value();
880 freq_sum[isband]/= nchan_per_subband;
882 const size_t ts_sample = _nsamp / _nsubints / _nchannels;
883 for (
size_t isubint = 0; isubint < _nsubints; isubint ++) {
884 time_sum[isubint] = (-0.5 * _tobs + _tsamp * isubint * ts_sample +_tsamp * (ts_sample - 1) * 0.5);
886 NumericalRep *h_in = NULL;
887 NumericalRep *d_in = NULL;
888 float *d_folded = NULL;
889 float *d_weight = NULL;
890 float *d_outfprof = NULL;
891 float *d_outprof = NULL;
894 float total_gpu_time = 0.;
896 std::vector <util::CandidateRebin> rebin(fldo::cuda::max_nrebin);
899 rebin_candidates(rebin, tsamp, input_candidates);
901 cudaSetDevice(gpu.device_id());
903 load_constant_devmem(rebin, delta_freq, input_candidates, cand_phases);
905 allocate_global_devmem(ncandidates, rebin, &d_in, &h_in, &d_outfprof, &d_outprof, &d_folded, &d_weight);
910 std::vector<util::GpuStream> exec_stream(32);
913 int nsamp_per_subint = _nsamp/_nsubints;
914 PANDA_LOG_DEBUG <<
"numer of samples/subint: " << nsamp_per_subint;
917 for (
int isubint = 0 ; isubint < (int) _nsubints ; isubint ++) {
918 PANDA_LOG_DEBUG <<
"Folding sub-integration " << isubint;
920 std::copy(data[isubint]->begin(), data[isubint]->end() , h_in);
921 folding(gpu, rebin, d_in, d_folded, d_weight, cand_phases.data(), isubint, &mean,
922 &sigma, total_gpu_time, exec_stream);
926 size_t memsize = _mfold_phases * _nsubints * _nsubbands * ncandidates;
929 float *h_folded =
nullptr;
930 CUDA_ERROR_CHECK(cudaHostAlloc((
void **) &h_folded, memsize*
sizeof(
float), cudaHostAllocDefault));
931 memset(h_folded, 0, memsize *
sizeof(
float));
933 CUDA_ERROR_CHECK(cudaMemcpy(h_folded, d_folded, memsize *
sizeof(
float), cudaMemcpyDeviceToHost));
934 for (
size_t ncand = 0; ncand < ncandidates; ncand ++) {
935 std::fstream file_stream;
936 std::stringstream ss;
937 ss <<
"folded_" << ncand <<
".out";
939 file_stream.open(ss.str(), std::ios::out);
940 for (
int isubint = 0; isubint < (int)_nsubints; isubint ++) {
941 file_stream <<
"nsubint: " << isubint << std::endl;
942 for (
int isub = 0; isub < (int)_nsubbands; isub ++) {
943 file_stream <<
"nsubband: " << isub << std::endl;
944 for (
int nbin = 0; nbin < (int)_mfold_phases; nbin ++) {
945 if (nbin < cand_phases[ncand]) {
946 file_stream << nbin <<
" " << h_folded[nbin + isub * _mfold_phases +
947 isubint * _nsubbands * _mfold_phases +
948 ncand * _nsubints * _nsubbands *_mfold_phases] << std::endl;
958 profiles(ncandidates, mean, d_folded, d_weight, d_outfprof, d_outprof, total_gpu_time, exec_stream);
959 PANDA_LOG <<
"Total GPU time of execution (ms): " << total_gpu_time;
964 memsize = _mfold_phases * ncandidates;
965 float *h_outprof =
nullptr;
966 CUDA_ERROR_CHECK(cudaHostAlloc((
void **) &h_outprof, memsize*
sizeof(
float), cudaHostAllocDefault));
967 memset(h_outprof, 0, memsize *
sizeof(
float));
968 CUDA_ERROR_CHECK(cudaMemcpy(h_outprof, d_outprof, memsize *
sizeof(
float), cudaMemcpyDeviceToHost));
969 for (
size_t ncand = 0; ncand < ncandidates; ncand ++) {
970 std::fstream file_stream;
971 std::stringstream ss;
972 ss <<
"reduced_profile_" << ncand <<
".out";
974 file_stream.open(ss.str(), std::ios::out);
975 for (
int nbin = 0; nbin < (int)_mfold_phases; nbin ++) {
976 if (nbin < cand_phases[ncand]) {
977 file_stream << nbin <<
" " << h_outprof[nbin + ncand * _mfold_phases] << std::endl;
985 data::Scl opt_candidates = input_candidates;
986 run_optimization(input_candidates, opt_candidates, d_folded, d_weight, d_outfprof,
987 rebin, time_sum, freq_sum, total_gpu_time, exec_stream, 10, 1, 1, 1, 10, 1, sigma, 0);
988 run_optimization(input_candidates, opt_candidates, d_folded, d_weight, d_outfprof,
989 rebin, time_sum, freq_sum, total_gpu_time, exec_stream, 1, 1, 10, 1, 10, 3, sigma, 0);
990 run_optimization(input_candidates, opt_candidates, d_folded, d_weight, d_outfprof,
991 rebin, time_sum, freq_sum, total_gpu_time, exec_stream, 10, 3, 10, 3, 1, 1, sigma, 1);
993 free_dev_memory(h_in, d_folded, d_weight, d_outprof, d_outfprof);
995 catch (std::runtime_error &e)
997 free_dev_memory(h_in, d_folded, d_weight, d_outprof, d_outfprof);
998 PANDA_LOG_ERROR <<
"Caught an exception of an unexpected type in run_folding(): " 1004 free_dev_memory(h_in, d_folded, d_weight, d_outprof, d_outfprof);
1005 PANDA_LOG_ERROR <<
"Caught an unknown exception in run_folding()";
1006 throw panda::Error(
"Caught an unknown exception in run_folding()");
1042 template<
typename NumericalT>
1043 void FldoCuda<NumericalT>::run_optimization(data::Scl
const& ori_data, data::Scl & opt_data,
float *d_folded,
1044 float* d_perturbed,
float *d_outfprof, std::vector<util::CandidateRebin> &rebin,
1045 std::vector<double>
const & time_sum, std::vector <double>
const & freq_sum,
1046 float& total_gpu_time, std::vector<util::GpuStream>& exec_stream,
int p_trial,
int p_rf,
1047 int pdot_trial,
int pdot_rf,
int dm_trial,
int dm_rf,
float sigma,
int verbose)
1049 dim3 threadsPerBlock;
1051 util::GpuEvent gpu_timer = util::GpuEvent();
1053 int nperiod = 0, npdot = 0, ndm = 0;
1054 nperiod = p_trial * 2 - 1;
1055 npdot = pdot_trial * 2 - 1;
1056 ndm = dm_trial * 2 - 1;
1058 int trials_num = nperiod * npdot * ndm;
1060 size_t const ncandidates = ori_data.size();
1061 size_t ph_size = ncandidates * _nsubbands * _nsubints;
1062 std::vector<float>phase_shift(ph_size);
1065 gpu_timer.gpu_event_start();
1067 size_t opt_prof_size = trials_num * ncandidates * _mfold_phases;
1069 panda::nvidia::CudaDevicePointer<float>d_opt_profile(opt_prof_size);
1070 CUDA_ERROR_CHECK(cudaMemset(d_opt_profile(), 0, opt_prof_size *
sizeof(
float)));
1071 CUDA_ERROR_CHECK(cudaDeviceSynchronize());
1072 std::vector<float>trial_param(1);
1073 util::perturb_candidates(ori_data, opt_data, d_folded, d_perturbed, d_outfprof,
1074 d_opt_profile(), exec_stream, time_sum,
1075 freq_sum, trial_param, _nchannels, _nsubints, _nsubbands, _mfold_phases,
1076 _tobs, p_trial, p_rf, pdot_trial, pdot_rf, dm_trial, dm_rf);
1077 double _sigma = sqrt(sigma / _nsubints);
1078 int first_prebin_idx = first_valid_binidx(rebin);
1079 int first_prebin = rebin[first_prebin_idx].rebin;
1086 _sigma *= sqrt((
float)first_prebin / _nsamp);
1088 std::vector<float> sn_max(ncandidates);
1089 std::vector<float> pulse_width(ncandidates);
1090 std::vector<int>index_max(ncandidates);
1092 util::compute_max_sn(ncandidates, trials_num, _mfold_phases, _sigma, d_opt_profile(), sn_max, pulse_width, index_max, exec_stream);
1093 gpu_timer.gpu_event_stop();
1095 total_gpu_time += gpu_timer.gpu_elapsed_time();
1096 PANDA_LOG_DEBUG <<
"perturbation + S/N takes " << gpu_timer.gpu_elapsed_time() <<
" ms";
1097 for (
size_t ncand = 0; ncand < ncandidates; ncand ++) {
1099 int idx = index_max[ncand] * 3 + ncand * trials_num * 3;
1100 data::Scl::CandidateType::MsecTimeType period_val(trial_param[idx] * boost::units::si::seconds);
1101 data::Scl::CandidateType::SecPerSecType pdot_val = trial_param[idx + 1];
1102 data::Scl::CandidateType::Dm dm_val = trial_param[idx + 2] * data::parsecs_per_cube_cm;
1103 opt_data[ncand].period(period_val);
1104 opt_data[ncand].pdot(pdot_val);
1105 opt_data[ncand].dm(dm_val);
1108 PANDA_LOG_DEBUG <<
"Candidate: "<< ncand <<
" (" << opt_data[ncand].ident() <<
")" 1109 <<
" index: " << index_max[ncand]
1110 <<
" S/N: " << sn_max[ncand]
1111 <<
" width: " << pulse_width[ncand]
1112 <<
" period: " << trial_param[idx]
1113 <<
" pdot: " << trial_param[idx + 1]
1114 <<
" dm: " << trial_param[idx + 2];
1116 PANDA_LOG <<
"Candidate: "<< ncand <<
" (" << opt_data[ncand].ident() <<
")" 1117 <<
" index: " << index_max[ncand]
1118 <<
" S/N: " << sn_max[ncand]
1119 <<
" width: " << pulse_width[ncand]
1120 <<
" period: " << trial_param[idx]
1121 <<
" pdot: " << trial_param[idx + 1]
1122 <<
" dm: " << trial_param[idx + 2];
1126 catch (std::runtime_error &e)
1128 PANDA_LOG_ERROR <<
"Caught an exception of an unexpected type in run_optimization(): " 1134 throw panda::Error(
"Catch unknown exception in run_optimization()");
1135 PANDA_LOG_ERROR <<
"Catch unknown exception in run_optimization()";
1152 template<
typename NumericalT>
1153 void FldoCuda<NumericalT>::data_reshape(std::vector<std::shared_ptr<TimeFrequencyType>>
const& input_data,
1154 std::vector<std::shared_ptr<TimeFrequencyType>>
const& data)
1162 template class FldoCuda<uint8_t>;
1165 #endif //ENABLE_CUDA
Some limits and constants for FLDO.