// folding_worker: kernel that folds the samples of one subband (blockIdx.x),
// for one candidate and one sub-integration, into a phase histogram. Every
// sample is split between two phase bins in proportion to its fractional bin
// position, and a matching weight histogram is accumulated alongside.
//
// NOTE(review): this listing is incomplete. Braces, several local
// declarations, the tail of the phase expression, the computation of bin2 and
// any barriers are not visible here. The comments below describe only what
// the visible lines show.
//
// Parameters:
//   g_in               input samples, read as
//                      g_in[start_data + time_idx + chan_idx * pitch_dim]
//   folded             output: summed sample values, written at idx + start_phase
//   weight             output: summed bin weights, same indexing as folded
//   pitch_dim          element distance between consecutive channel rows of g_in
//   local_measures     number of time samples looped over; also multiplied by
//                      isubint to obtain the sample offset of this call
//   candidate          index into d_nbins, d_nudot, d_dm and d_nu
//   rebin              multiplies d_tsamp to give the time step between samples
//   shared_phases      floats reserved per 32-thread group in each shared histogram
//   warp_count         number of per-group histograms (presumably the number of
//                      warps per block - TODO confirm against the launch code)
//   tobs               the time origin is placed at -tobs / 2
//   max_phases         element spacing between consecutive output profiles
//   isubint            sub-integration index
//   threadblock_memory number of shared floats cleared at kernel start
//                      (presumably 2 * warp_count * shared_phases - TODO confirm)
85 __global__
void folding_worker(
float *g_in,
float *folded,
float *weight,
86 int pitch_dim,
int local_measures,
int candidate,
87 int rebin,
int shared_phases,
int warp_count,
88 double tobs,
int max_phases,
int isubint,
int threadblock_memory)
// Each block handles the subband selected by blockIdx.x.
98 int subband_idx = blockIdx.x;
99 int chan_idx = threadIdx.y;
// Offset of this block's profile in folded/weight. The arithmetic implies the
// layout [candidate][isubint][subband][phase], with max_phases elements per
// profile and gridDim.x subbands.
104 int start_phase = max_phases * subband_idx + isubint * max_phases * gridDim.x +
105 candidate * d_nsubints * max_phases *gridDim.x;
// Number of phase bins for this candidate.
108 int nbins = d_nbins[candidate];
// Offset in g_in of the first channel row belonging to this subband.
119 int start_data = subband_idx * d_nchan_per_band * pitch_dim;
121 float input_temp = 0;
122 float frac_floor = 0.;
124 float bin2_frac = 0.;
// Dynamically sized shared memory holding the value histograms followed by
// the weight histograms.
126 extern __shared__
float hist_profile[];
// Flat thread index inside the block.
127 int idx = threadIdx.x + blockDim.x * threadIdx.y;
// Each group of 32 consecutive flat thread indices (idx >> 5) gets its own
// slice of shared_phases floats. The weight histograms start after the
// warp_count value histograms.
133 float *warp_profile = hist_profile + (idx >> 5) * shared_phases;
134 float *hist_weight = hist_profile + warp_count * shared_phases;
135 float *warp_weight= hist_weight + (idx >> 5) * shared_phases;
// Clear the first threadblock_memory floats of shared memory, with all threads
// of the block striding over it together.
// NOTE(review): a block-wide barrier is required between this clearing loop
// and the atomic accumulation below. It is not visible in this excerpt -
// confirm that it exists in the omitted lines.
138 for (
int i = idx; i < threadblock_memory ; i += blockDim.x * blockDim.y) {
139 hist_profile[i] = 0.;
// First global channel index of this subband, and the sample offset of this
// sub-integration.
145 int nchan = d_nchan_per_band * subband_idx;
146 int nsamp = local_measures * isubint;
// Time step between consecutive samples (d_tsamp scaled by rebin).
148 double _tt = d_tsamp * rebin;
149 double nudot_contrib = 0.;
// Times are measured from -tobs / 2.
150 double t0 = -tobs * 0.5;
// Loop over time samples: threadIdx.x selects the first sample, stride blockDim.x.
151 for (time_idx = threadIdx.x ; time_idx < local_measures; time_idx += blockDim.x) {
152 start_idx = start_data + time_idx;
153 time_s = t0 + _tt *(nsamp + time_idx);
// Quadratic term 0.5 * nudot * t^2, computed once per time sample.
154 nudot_contrib = time_s * time_s * d_nudot[candidate] * 0.5;
// Loop over the channels of this subband: threadIdx.y selects the first
// channel, stride blockDim.y.
155 for (chan_idx = threadIdx.y ; chan_idx < d_nchan_per_band; chan_idx += blockDim.y) {
156 channel = chan_idx + nchan;
158 input_temp = g_in[start_idx + chan_idx * pitch_dim];
// Phase from the sample time shifted by d_dm * d_delta_freq[channel], times
// d_nu. NOTE(review): the right-hand operand of the trailing '+' is on a line
// missing from this excerpt (presumably nudot_contrib - confirm).
160 phase = (time_s + d_dm[candidate] * d_delta_freq[channel]) *d_nu[candidate] +
// Fractional part of the phase scaled to bin units. modff then separates the
// integer bin index (frac_floor) from the fraction inside that bin (bin2_frac).
162 frac = (float) ((phase - floor(phase)) * nbins);
163 bin2_frac = modff(frac, &frac_floor);
// NOTE(review): bin2 is used below but its computation is not visible here.
186 bin1 = (int)frac_floor;
187 val2 = bin2_frac * input_temp;
// Split the sample between the two bins: bin1 receives (1 - bin2_frac) of the
// value and weight, bin2 receives bin2_frac. Atomic adds are used on the
// per-group shared histograms.
210 atomicAdd((
float *)&warp_profile[bin1], input_temp - val2);
211 atomicAdd((
float *)&warp_profile[bin2], val2);
212 atomicAdd((
float *)&warp_weight[bin1], (
float)(1. - bin2_frac));
213 atomicAdd((
float *)&warp_weight[bin2], (
float)bin2_frac);
// Combine the per-group histograms: for each visited bin, add the warp_count
// partial value sums into sum and the partial weight sums into sum1.
// NOTE(review): the loop starts at bin = idx and steps by nbins. The
// declarations of sum/sum1 and any guard on idx are not visible here.
// folding_worker_nosplit uses 'bin < nbins + 1' for this loop where this
// kernel uses '<=' - confirm which bound is intended.
232 for (
int bin = idx; bin <= nbins + 1; bin += nbins) {
233 for (
int iwarp = 0; iwarp < warp_count; iwarp ++) {
234 sum += hist_profile[bin + iwarp * shared_phases];
235 sum1 += hist_weight[bin + iwarp * shared_phases];
// Store the reduced value and weight for this thread's bin in the output profile.
238 folded[idx + start_phase] = sum;
239 weight[idx + start_phase] = sum1;
// folding_worker_nosplit: kernel that folds the samples of one subband
// (blockIdx.x), for one candidate and one sub-integration, into a phase
// histogram. Each sample is added entirely to the single phase bin obtained by
// truncating its fractional phase times nbins, with a weight of 1.
//
// NOTE(review): this listing is incomplete. Braces, several local
// declarations, the tail of the phase expression and any barriers are not
// visible here. The comments below describe only what the visible lines show.
//
// Parameters:
//   g_in               input samples, read as
//                      g_in[start_data + time_idx + chan_idx * pitch_dim]
//   folded             output: summed sample values, written at idx + start_phase
//   weight             output: summed bin weights, same indexing as folded
//   pitch_dim          element distance between consecutive channel rows of g_in
//   local_measures     number of time samples looped over; also multiplied by
//                      isubint to obtain the sample offset of this call
//   candidate          index into d_nbins, d_nudot, d_dm and d_nu
//   rebin              multiplies d_tsamp to give the time step between samples
//   shared_phases      floats reserved per 32-thread group in each shared histogram
//   warp_count         number of per-group histograms (presumably the number of
//                      warps per block - TODO confirm against the launch code)
//   tobs               the time origin is placed at -tobs / 2
//   max_phases         element spacing between consecutive output profiles
//   isubint            sub-integration index
//   threadblock_memory number of shared floats cleared at kernel start
//                      (presumably 2 * warp_count * shared_phases - TODO confirm)
275 __global__
void folding_worker_nosplit(
float *g_in,
float *folded,
float *weight,
276 int pitch_dim,
int local_measures,
int candidate,
277 int rebin,
int shared_phases,
int warp_count,
278 double tobs,
int max_phases,
int isubint,
int threadblock_memory)
// Each block handles the subband selected by blockIdx.x.
288 int subband_idx = blockIdx.x;
289 int chan_idx = threadIdx.y;
// Offset of this block's profile in folded/weight. The arithmetic implies the
// layout [candidate][isubint][subband][phase], with max_phases elements per
// profile and gridDim.x subbands.
294 int start_phase = max_phases * subband_idx + isubint * max_phases * gridDim.x +
295 candidate * d_nsubints * max_phases *gridDim.x;
// Number of phase bins for this candidate.
298 int nbins = d_nbins[candidate];
// Offset in g_in of the first channel row belonging to this subband.
308 int start_data = subband_idx * d_nchan_per_band * pitch_dim;
310 float input_temp = 0;
// Dynamically sized shared memory holding the value histograms followed by
// the weight histograms.
311 extern __shared__
float hist_profile[];
// Flat thread index inside the block.
312 int idx = threadIdx.x + blockDim.x * threadIdx.y;
// Each group of 32 consecutive flat thread indices (idx >> 5) gets its own
// slice of shared_phases floats. The weight histograms start after the
// warp_count value histograms.
318 float *warp_profile = hist_profile + (idx >> 5) * shared_phases;
319 float *hist_weight = hist_profile + warp_count * shared_phases;
320 float *warp_weight= hist_weight + (idx >> 5) * shared_phases;
// Clear the first threadblock_memory floats of shared memory, with all threads
// of the block striding over it together.
// NOTE(review): a block-wide barrier is required between this clearing loop
// and the atomic accumulation below. It is not visible in this excerpt -
// confirm that it exists in the omitted lines.
323 for (
int i = idx; i < threadblock_memory ; i += blockDim.x * blockDim.y) {
324 hist_profile[i] = 0.;
// First global channel index of this subband, and the sample offset of this
// sub-integration.
330 int nchan = d_nchan_per_band * subband_idx;
331 int nsamp = local_measures * isubint;
// NOTE(review): nudot_contrib is float here but double in folding_worker -
// confirm that the reduced precision is intended.
333 float nudot_contrib = 0.;
// Times are measured from -tobs / 2.
334 double t0 = -tobs * 0.5;
// Loop over time samples: threadIdx.x selects the first sample, stride blockDim.x.
335 for (time_idx = threadIdx.x ; time_idx < local_measures; time_idx += blockDim.x) {
336 start_idx = start_data + time_idx;
// Sample time: origin plus (d_tsamp * rebin) per sample index.
337 time_s = t0 + d_tsamp * rebin *(nsamp + time_idx);
// Quadratic term 0.5 * nudot * t^2, computed once per time sample.
338 nudot_contrib = time_s * time_s * d_nudot[candidate] * 0.5;
// Loop over the channels of this subband: threadIdx.y selects the first
// channel, stride blockDim.y.
339 for (chan_idx = threadIdx.y ; chan_idx < d_nchan_per_band; chan_idx += blockDim.y) {
340 channel = chan_idx + nchan;
341 input_temp = g_in[start_idx + chan_idx * pitch_dim];
// Phase from the sample time shifted by d_dm * d_delta_freq[channel], times
// d_nu. NOTE(review): the right-hand operand of the trailing '+' is on a line
// missing from this excerpt (presumably nudot_contrib - confirm).
343 phase = (time_s +d_dm[candidate] * d_delta_freq[channel]) *d_nu[candidate] +
// Bin index: fractional part of the phase scaled by nbins, truncated to int.
345 bin1 = (int)((phase - floor(phase)) * nbins);
// Add the whole sample, with weight 1, to its bin in this thread group's
// shared histograms using atomic adds.
381 atomicAdd((
float *)&warp_weight[bin1], 1.);
382 atomicAdd((
float *)&warp_profile[bin1], input_temp);
// Combine the per-group histograms: for each visited bin, add the warp_count
// partial value sums into sum and the partial weight sums into sum1.
// NOTE(review): the loop starts at bin = idx and steps by nbins. The
// declarations of sum/sum1 and any guard on idx are not visible here.
// folding_worker uses 'bin <= nbins + 1' for this loop where this kernel
// uses '<' - confirm which bound is intended.
399 for (
int bin = idx; bin < nbins + 1; bin += nbins) {
400 for (
int iwarp = 0; iwarp < warp_count; iwarp ++) {
401 sum += hist_profile[bin + iwarp * shared_phases];
402 sum1 += hist_weight[bin + iwarp * shared_phases];
// Store the reduced value and weight for this thread's bin in the output profile.
405 folded[idx + start_phase] = sum;
406 weight[idx + start_phase] = sum1;
Some limits and constants for FLDO.