Cheetah - SKA - PSS - Prototype Time Domain Search Pipeline
FoldInputDataKernel.cu
/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2016 The SKA organisation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// includes, system

#include <float.h>

namespace ska {
namespace cheetah {
namespace fldo {
namespace cuda {
//
// void folding_worker(float *g_in, float *folded, float *weight,
//                     int pitch_dim, int local_measures, int candidate,
//                     int rebin, int shared_phases, int warp_count,
//                     double tobs, int max_phases, int isubint,
//                     int threadblock_memory)
//
// g_in                input converted data
// folded              array to store folded data
// weight              array to store weight data
// pitch_dim           the data array row length (padded with 0)
// local_measures      number of samples per channel and per subint (<= pitch_dim)
// candidate           the candidate number
// rebin               the time-rebinning factor applied to the sampling time
// shared_phases       length in elements of each per-warp profile in shared memory
// warp_count          the number of warps programmed on each SM
// tobs                the observation time (to set as constant in future ??)
// max_phases          the maximum number of phase bins for the current candidate list
// isubint             the current subintegration number
// threadblock_memory  number of shared memory elements.
//
//
// Arrangement of the data into the folded and weight arrays for the first
// candidate. The same organisation is replicated for every other candidate.
// [0, 127]: phase bins range
// ---------------------------------------------------------------------------------------
// |0|1|...|127 |0|1|...|127|...|0|1|...|127|0|1|...|127|0|1|...|127|...|0|1|...|127|....
// ---------------------------------------------------------------------------------------
// |<-band 0 ->|<-band 1 ->|...|<-band N ->|<-band 0 ->|<-band 1 ->|...|<-band N ->|
// <------------- subint 0 --------------->|<----------- subint 1 ----------------> ... <-- subint N ->
// <------------ candidate 0 -------->
//
//
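// Worked example (numbers assumed purely for illustration): with
// max_phases = 128, gridDim.x = 64 sub-bands and d_nsubints = 16, the profile
// of sub-band 2 in subint 1 of candidate 0 starts at
//   start_phase = 128 * 2 + 1 * 128 * 64 + 0 * 16 * 128 * 64 = 8448
// in both the folded and the weight arrays.
//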
__global__ void folding_worker(float *g_in, float *folded, float *weight,
                               int pitch_dim, int local_measures, int candidate,
                               int rebin, int shared_phases, int warp_count,
                               double tobs, int max_phases, int isubint, int threadblock_memory)
{

    /* threadIdx.x: the current sample inside the thread block
     * threadIdx.y: the current channel inside the thread block
     * blockDim.x:  the stride in samples
     * blockIdx.x:  subband index
     * blockDim.y:  the stride in channels
     * gridDim.x:   number of subbands
     */
    int subband_idx = blockIdx.x;   // the sub-band index (0, 1, ..., gridDim.x - 1)
    int chan_idx = threadIdx.y;     // index of the channel in the current subband
                                    // (0, 1, ..., d_nchan_per_band - 1)

    int time_idx = 0;               // index into the subint measures
    // start of the phase array for the current operations
    int start_phase = max_phases * subband_idx + isubint * max_phases * gridDim.x +
                      candidate * d_nsubints * max_phases * gridDim.x;
    int channel = 0;                // index of the local channel
    // To be computed on the CPU and put in a global tabular form (DONE on 24/12/2013)
    int nbins = d_nbins[candidate]; // total number of phases of the candidate
    // physical values.
    // The extra delay for this channel based on the DM
    // equation (see the Lorimer & Kramer book)
    // time of the local sample
    double time_s = 0.;             // <--- WARNING!!!!!!!!!!!!
    // float has only a 7-digit mantissa: for a full 600 s datafile this is not enough!
    double phase = 0.;              // phase of a single measure

    int bin1 = 0;                   // profile index of the first phase bin
    int bin2 = 0;                   // profile index of the second phase bin
    int start_data = subband_idx * d_nchan_per_band * pitch_dim;
    int start_idx = 0;              // index of the first time sample of each subband
    float input_temp = 0;           // the input sample value
    float frac_floor = 0.;
    float frac = 0;
    float bin2_frac = 0.;           // the second phase bin weight
    float val2 = 0.;                // the second phase bin intensity value
    extern __shared__ float hist_profile[]; // dynamically allocated shared memory
    int idx = threadIdx.x + blockDim.x * threadIdx.y; // thread global id
    // (idx >> 5) gives the warp number
    // 1 warp = 32 threads
    //
    // Att: if for future GPUs 1 warp != 32, the next lines have to be changed!!!
    // The warp size can be obtained from the cudaDeviceProp structure.
    float *warp_profile = hist_profile + (idx >> 5) * shared_phases;
    float *hist_weight = hist_profile + warp_count * shared_phases;
    float *warp_weight = hist_weight + (idx >> 5) * shared_phases;
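
    // Layout of the dynamically allocated shared memory implied by the
    // pointer arithmetic above: warp_count per-warp profiles followed by
    // warp_count per-warp weight histograms, each shared_phases floats long.
    //
    // |<- shared_phases ->|                    |<- shared_phases ->|
    // [warp 0 profile|...|warp N profile|warp 0 weight|...|warp N weight]
    //  ^ hist_profile                    ^ hist_weight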

    // zero the shared memory
    for (int i = idx; i < threadblock_memory; i += blockDim.x * blockDim.y) {
        hist_profile[i] = 0.;
        hist_weight[i] = 0.;
    }
    __syncthreads();

    // loop on the time measures (local_measures at a time, nsamp/fold.nsubints)
    int nchan = d_nchan_per_band * subband_idx; // number of the first channel of the subband
    int nsamp = local_measures * isubint;       // number of the first time sample of the
                                                // sub-integration
    double _tt = d_tsamp * rebin;
    double nudot_contrib = 0.;
    double t0 = -tobs * 0.5;
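    // Note: t0 = -tobs/2 presumably references each sample time to the
    // mid-point of the observation, halving the largest |time_s| and hence the
    // rounding error in the time_s * time_s term below.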
    for (time_idx = threadIdx.x; time_idx < local_measures; time_idx += blockDim.x) {
        start_idx = start_data + time_idx;      // sample position in the data array
        time_s = t0 + _tt * (nsamp + time_idx); // sample position in time
        nudot_contrib = time_s * time_s * d_nudot[candidate] * 0.5; // phase shift due to acceleration
        for (chan_idx = threadIdx.y; chan_idx < d_nchan_per_band; chan_idx += blockDim.y) {
            channel = chan_idx + nchan;         // index of the local channel
            //input_temp = __ldg(&ptr[chan_idx * pitch_dim]);
            input_temp = g_in[start_idx + chan_idx * pitch_dim];
            // we compute the phase
            phase = (time_s + d_dm[candidate] * d_delta_freq[channel]) * d_nu[candidate] +
                    nudot_contrib;
            frac = (float)((phase - floor(phase)) * nbins);
            bin2_frac = modff(frac, &frac_floor);

            //bin1 = ((int)frac_floor) % nbins;
            //bin1 = ((int)frac_floor);

            // ATT:
            // bin1 can be less than or equal to nbins (nbins is the max number
            // of phase bins for the specific candidate), so the mod
            // operation is not necessary.
            // To avoid the mod operation also in the calculation of bin2
            // (to limit the number of internal registers used), the data may
            // also be stored in element (nbins + 1) of the profile array.
            // In the computation of the final profile we will add the
            // value of element nbins to that stored in element 0,
            // and the value of element (nbins + 1) to that in element 1.
            //
            //bin2 = ((int)ceilf(frac)) % nbins;
            //bin2 = ((int)ceilf(frac));
            //
            // so at the end we sum the last phase bin
            // (nbins) into the first one (0)
            // OSS: here frac > 0, so the integer part is equal to floor(frac)
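            // Worked example (illustrative numbers): with nbins = 128 and a
            // fractional phase of 0.3226, frac = 0.3226 * 128 = 41.29, so
            // bin1 = 41, bin2 = 42 and bin2_frac = 0.29: 71% of the sample
            // intensity is accumulated in bin 41 and 29% in bin 42.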
            bin1 = (int)frac_floor;
            val2 = bin2_frac * input_temp;
            bin2 = (bin1 + 1);
            // The input index is computed as follows:
            // time_idx is the index of the measures (along time)
            // of the current subint of the current channel;
            // chan_idx * local_measures is the offset of the start of
            // the measures of the current channel from the beginning of
            // the channel block.
            // The following changes only in the outer loop:
            // n_chan_band * blockIdx.x * local_measures is the offset of
            // the start of the measures of the current channel block
            // input_temp = g_in[time_idx + chan_idx * local_measures +
            //                   n_chan_band * blockIdx.x * local_measures];

            // output
            // folded is structured as a (multidim) array with max_phases
            // phases per band per subint per candidate;
            // weight has exactly the same layout.
            // start_phase = max_phases * subband_idx + isubint * max_phases * gridDim.x +
            //               candidate * d_nsubints * max_phases * gridDim.x

            // read and update the data using atomic functions because there can be
            // collisions between threads accessing the same address
            atomicAdd((float *)&warp_profile[bin1], input_temp - val2);
            atomicAdd((float *)&warp_profile[bin2], val2);
            atomicAdd((float *)&warp_weight[bin1], (float)(1. - bin2_frac));
            atomicAdd((float *)&warp_weight[bin2], (float)bin2_frac);

        }
    }
    __syncthreads();
    float sum = 0;
    float sum1 = 0;
    //
    // ATT: since we do not use the mod operator to limit the number of
    // phases to the maximum value (nbins), we also have to take into
    // account the profile elements at index = nbins and index = (nbins + 1).
    // The next procedure sums these elements into elements 0 and 1
    // respectively.
    // !! However this means that we can't process data belonging to
    // !! candidates with more than 126 phase bins, assuming that our max
    // !! number of phase bins is 128.
    //
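    // Worked example: with nbins = 100, the thread with idx = 0 visits bins 0
    // and 100 and the thread with idx = 1 visits bins 1 and 101 (bin = idx,
    // idx + nbins while bin <= nbins + 1), folding the two overflow bins back
    // onto bins 0 and 1; every other thread with idx < nbins visits only its
    // own bin.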
    if (idx < nbins) {
        for (int bin = idx; bin <= nbins + 1; bin += nbins) {
            for (int iwarp = 0; iwarp < warp_count; iwarp++) {
                sum += hist_profile[bin + iwarp * shared_phases];
                sum1 += hist_weight[bin + iwarp * shared_phases];
            }
        }
        folded[idx + start_phase] = sum;
        weight[idx + start_phase] = sum1;
    }
}
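
// A minimal host-side launch sketch (an assumption for illustration only: it
// is not part of this file's actual call path, and the grid/block sizes, data
// dimensions and the helper name launch_folding_worker_example are all
// hypothetical). It shows how the shared_phases / warp_count /
// threadblock_memory arguments relate to the dynamic shared memory size,
// consistently with the zeroing loop above, where threadblock_memory counts
// the elements of one half (profiles or weights).
#if 0
void launch_folding_worker_example(float *d_in, float *d_folded, float *d_weight)
{
    dim3 grid(64);                   // one block per sub-band (gridDim.x = 64)
    dim3 block(32, 8);               // 256 threads = 8 warps
    int warp_count = (block.x * block.y) / 32;
    int shared_phases = 130;         // max_phases (128) + 2 wrap-around bins
    int threadblock_memory = warp_count * shared_phases;   // elements per half
    size_t shmem = 2 * threadblock_memory * sizeof(float); // profiles + weights
    folding_worker<<<grid, block, shmem>>>(d_in, d_folded, d_weight,
                                           /*pitch_dim*/ 1024, /*local_measures*/ 1024,
                                           /*candidate*/ 0, /*rebin*/ 1,
                                           shared_phases, warp_count,
                                           /*tobs*/ 600.0, /*max_phases*/ 128,
                                           /*isubint*/ 0, threadblock_memory);
}
#endif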


//
// void folding_worker_nosplit(float *g_in, float *folded, float *weight,
//                             int pitch_dim, int local_measures, int candidate,
//                             int rebin, int shared_phases, int warp_count,
//                             double tobs, int max_phases, int isubint,
//                             int threadblock_memory)
//
// g_in                input converted data
// folded              array to store folded data
// weight              array to store weight data
// pitch_dim           the data array row length (padded with 0)
// local_measures      number of samples per channel and per subint (<= pitch_dim)
// candidate           the candidate number
// rebin               the time-rebinning factor applied to the sampling time
// shared_phases       length in elements of each per-warp profile in shared memory
// warp_count          the number of warps programmed on each SM
// tobs                the observation time (to set as constant in future ??)
// max_phases          the maximum number of phase bins for the current candidate list
// isubint             the current subintegration number
// threadblock_memory  number of shared memory elements.
//
//
// Arrangement of the data into the folded and weight arrays for the first
// candidate. The same organisation is replicated for every other candidate.
// [0, 127]: phase bins range
// ---------------------------------------------------------------------------------------
// |0|1|...|127 |0|1|...|127|...|0|1|...|127|0|1|...|127|0|1|...|127|...|0|1|...|127|....
// ---------------------------------------------------------------------------------------
// |<-band 0 ->|<-band 1 ->|...|<-band N ->|<-band 0 ->|<-band 1 ->|...|<-band N ->|
// <------------- subint 0 --------------->|<----------- subint 1 ----------------> ... <-- subint N ->
// <------------ candidate 0 -------->
//
//
__global__ void folding_worker_nosplit(float *g_in, float *folded, float *weight,
                                       int pitch_dim, int local_measures, int candidate,
                                       int rebin, int shared_phases, int warp_count,
                                       double tobs, int max_phases, int isubint, int threadblock_memory)
{

    /* threadIdx.x: the current sample inside the thread block
     * threadIdx.y: the current channel inside the thread block
     * blockDim.x:  the stride in samples
     * blockIdx.x:  subband index
     * blockDim.y:  the stride in channels
     * gridDim.x:   number of subbands
     */
    int subband_idx = blockIdx.x;   // the sub-band index (0, 1, ..., gridDim.x - 1)
    int chan_idx = threadIdx.y;     // index of the channel in the current subband
                                    // (0, 1, ..., d_nchan_per_band - 1)

    int time_idx = 0;               // index into the subint measures
    // start of the phase array for the current operations
    int start_phase = max_phases * subband_idx + isubint * max_phases * gridDim.x +
                      candidate * d_nsubints * max_phases * gridDim.x;
    int channel = 0;                // index of the local channel
    // To be computed on the CPU and put in a global tabular form (DONE on 24/12/2013)
    int nbins = d_nbins[candidate]; // total number of phases of the candidate
    // physical values.
    // The extra delay for this channel based on the DM
    // equation (see the Lorimer & Kramer book)
    // time of the local sample
    double time_s = 0.;             // <--- WARNING!!!!!!!!!!!!
    // float has only a 7-digit mantissa: for a full 600 s datafile this is not enough!
    double phase = 0.;              // phase of a single measure

    int bin1 = 0;                   // profile index of the phase bin
    int start_data = subband_idx * d_nchan_per_band * pitch_dim;
    int start_idx = 0;              // index of the first time sample of each subband
    float input_temp = 0;           // the input sample value
    extern __shared__ float hist_profile[]; // dynamically allocated shared memory
    int idx = threadIdx.x + blockDim.x * threadIdx.y; // thread global id
    // (idx >> 5) gives the warp number
    // 1 warp = 32 threads
    //
    // Att: if for future GPUs 1 warp != 32, the next lines have to be changed!!!
    // The warp size can be obtained from the cudaDeviceProp structure.
    float *warp_profile = hist_profile + (idx >> 5) * shared_phases;
    float *hist_weight = hist_profile + warp_count * shared_phases;
    float *warp_weight = hist_weight + (idx >> 5) * shared_phases;

    // zero the shared memory
    for (int i = idx; i < threadblock_memory; i += blockDim.x * blockDim.y) {
        hist_profile[i] = 0.;
        hist_weight[i] = 0.;
    }
    __syncthreads();

    // loop on the time measures (local_measures at a time, nsamp/fold.nsubints)
    int nchan = d_nchan_per_band * subband_idx; // number of the first channel of the subband
    int nsamp = local_measures * isubint;       // number of the first time sample of the
                                                // sub-integration
    double nudot_contrib = 0.; // double, as in folding_worker: see the precision warning above
    double t0 = -tobs * 0.5;
    for (time_idx = threadIdx.x; time_idx < local_measures; time_idx += blockDim.x) {
        start_idx = start_data + time_idx; // sample position in the data array
        time_s = t0 + d_tsamp * rebin * (nsamp + time_idx); // sample position in time
        nudot_contrib = time_s * time_s * d_nudot[candidate] * 0.5; // phase shift due to acceleration
        for (chan_idx = threadIdx.y; chan_idx < d_nchan_per_band; chan_idx += blockDim.y) {
            channel = chan_idx + nchan;    // index of the local channel
            input_temp = g_in[start_idx + chan_idx * pitch_dim];
            // we compute the phase
            phase = (time_s + d_dm[candidate] * d_delta_freq[channel]) * d_nu[candidate] +
                    nudot_contrib;
            bin1 = (int)((phase - floor(phase)) * nbins);
            // ATT:
            // bin1 can be less than or equal to nbins (nbins is the max number
            // of phase bins for the specific candidate), so the mod
            // operation is not necessary.
            // To avoid the mod operation (to limit the number of internal
            // registers used), the data may also be stored in element nbins of
            // the profile array.
            // In the computation of the final profile we will add the
            // value of element nbins to that stored in element 0,
            // so at the end we sum the last phase bin
            // (nbins) into the first one (0).
            // OSS: the fractional phase is > 0, so the truncation above is
            // equal to the floor.
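            // Worked example (illustrative numbers): with nbins = 128 and a
            // fractional phase of 0.3226, bin1 = (int)(0.3226 * 128) =
            // (int)41.29 = 41: the whole sample is accumulated in bin 41,
            // with no split across bins 41 and 42 as in folding_worker above.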
            // The input index is computed as follows:
            // time_idx is the index of the measures (along time)
            // of the current subint of the current channel;
            // chan_idx * local_measures is the offset of the start of
            // the measures of the current channel from the beginning of
            // the channel block.
            // The following changes only in the outer loop:
            // n_chan_band * blockIdx.x * local_measures is the offset of
            // the start of the measures of the current channel block
            // input_temp = g_in[time_idx + chan_idx * local_measures +
            //                   n_chan_band * blockIdx.x * local_measures];

            // output
            // folded is structured as a (multidim) array with max_phases
            // phases per band per subint per candidate;
            // weight has exactly the same layout.
            // start_phase = max_phases * subband_idx + isubint * max_phases * gridDim.x +
            //               candidate * d_nsubints * max_phases * gridDim.x

            // read and update the data using atomic functions because there can be
            // collisions between threads accessing the same address
            atomicAdd((float *)&warp_weight[bin1], 1.f);
            atomicAdd((float *)&warp_profile[bin1], input_temp);
        }
    }
    __syncthreads();
    float sum = 0;
    float sum1 = 0;
    //
    // ATT: since we do not use the mod operator to limit the number of
    // phases to the maximum value (nbins), we also have to take into
    // account the profile element at index = nbins.
    // The next procedure sums this element into element 0.
    // !! However this means that we can't process data belonging to
    // !! candidates with more than 127 phase bins, assuming that our max
    // !! number of phase bins is 128.
    //
    if (idx < nbins) {
        for (int bin = idx; bin < nbins + 1; bin += nbins) {
            for (int iwarp = 0; iwarp < warp_count; iwarp++) {
                sum += hist_profile[bin + iwarp * shared_phases];
                sum1 += hist_weight[bin + iwarp * shared_phases];
            }
        }
        folded[idx + start_phase] = sum;
        weight[idx + start_phase] = sum1;
    }
}
} // cuda
} // fldo
} // cheetah
} // ska