Cheetah - SKA - PSS - Prototype Time Domain Search Pipeline
Kernels.cu
1 /*
2  * The MIT License (MIT)
3  *
4  * Copyright (c) 2016 The SKA organisation
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
46 // includes, system
47 
48 #include <float.h>
49 #include <stdio.h>
50 
51 namespace ska {
52 namespace cheetah {
53 namespace fldo {
54 namespace cuda {
55 //
56 // void input_converter(unsigned char *g_in, float *g_out, int local_measures, int nchan)
57 //
58 // g_in the input data read from file
59 // g_out the converted output data as float
60 // local_measure the number of sample per channel and for sub-integration
61 // nchan the global number of channel
62 //
63 __global__ void input_converter(unsigned char *g_in, float *g_out, int pitch_dim,
64  int local_measures, int nchan)
65 {
66  int i = threadIdx.x + blockIdx.x * blockDim.x;
67  int j = threadIdx.y + blockIdx.y * blockDim.y;
68 
69  if ((i < local_measures) && (j < nchan)) {
70  g_out[i + pitch_dim * j] = (float)g_in[i + pitch_dim *j] - 128;
71  }
72 }
73 
74 //
75 // __global__ void normalize_kernel(float *folded, float *weight, int ncand,
76 // float mean, int nsubband)
77 //
78 // folded global array with folded data
79 // weight global array with weights
80 // ncand the candidate id
81 // mean the input data mean
82 // nsubband the number of freq. bands
83 //
84 __global__ void normalize_kernel(float *folded, float *weight, int ncand,
85  float mean, int nsubbands)
86 {
87  int n; //loop counter
88  int idx0 = threadIdx.x + blockIdx.x*blockDim.x * nsubbands + ncand *
89  gridDim.x * blockDim.x * nsubbands;
90  int sub_idx = 0; //subband index (0, 1, .., 63)
91  int idx = 0;
92  float profile = 0.;
93  float counts = 0.;
94 
95  /*
96  * threadIdx.x the bin number
97  * blockDim.x max number of bins
98  * blockIdx.x sub-int number (0, 1, ..., nsubints - 1)
99  * gridDim.x number of sub-integrations (nsubints)
100  */
101  //mean = -0.353104; // Debug: mean value of test_data as commputed by Mike
102 
103  // idx0: the index of the bin of the subband 0 of the current subint
104  // loop on the band
105  for (n = 0; n < nsubbands; n ++) {
106  sub_idx = n * blockDim.x; // the subband index
107  idx = idx0 + sub_idx; // the global bin index for the current candidate
108  counts = weight[idx];
109  profile = 0.;
110  if (counts > FLT_EPSILON) {
111  // here we subctract the mean value of data
112  profile = folded[idx];
113  profile = profile/counts - mean;
114  }
115  folded[idx] = profile;
116 
117  }
118 }
119 
120 //
121 // __global__ void profile_kernel(float *in, float *out, int ncand, int nloop)
122 //
123 // Builds intermediate and final profile
124 // This kernel is called two times: in the first call is created the
125 // profile in freq (an array with Ncand*subints*max_phases elements
126 // In the second call is build the final profile (an array with
127 // max_phases * Ncand elements)
128 //
129 // in the input array
130 // first call : d_folded
131 // second call: d_outprof
132 // out the output array
133 // first call : d_outprof
134 // second call: d_outprof
135 // ncand the candidate id
136 // nloop the number of repetition:
137 // first call = nsubbands
138 // second call = nsubints
139 __global__ void profile_kernel(float *in, float *out, int ncand, int nloop)
140 {
141  int idx0 = threadIdx.x + blockIdx.x * blockDim.x * nloop +
142  ncand * blockDim.x * gridDim.x * nloop;
143  int sub_idx = 0; // shift depending on loop counter
144  int idx; // index inside data array
145  float profile = 0.;
146  float ftscrunch = 0;
147 
148  //
149  // threadIdx.x bin
150  // blockDim.x max number of bins
151  // blockIdx.x first call : sub-int number (0, 1, ..., nsubints - 1)
152  // second call: 0
153  // gridDim.x first call : number of sub-integrations (nsubints)
154  // second call: 1
155  //
156  // loop on the subband/subints
157  for (int n = 0; n < nloop; n ++) {
158  sub_idx = n* blockDim.x;
159  idx = idx0 + sub_idx;
160  profile = in[idx];
161  ftscrunch += profile;
162  }
163  out[threadIdx.x + blockIdx.x * blockDim.x + ncand * blockDim.x*gridDim.x] =
164  ftscrunch/nloop;
165 }
166 
167 //
168 //__global__ void best_profile_kernel(float *in, float *out, int ncand,
169 // int nloop, int index, int ntrial)
170 //
171 // Build the signal profile for all the trials.
172 //
173 // in the profile in freq.
174 // out an array with the signal profile for each 3-tuple of trial values
175 // ncand the candidate number
176 // index the trial index identifying the 3-tuple
177 // ntrial the max number of trials
178 __global__ void best_profile_kernel(float *in, float *out, int ncand,
179  int nloop, int index, int ntrial)
180 {
181  int idx0 = threadIdx.x + ncand * blockDim.x * nloop; // starting index
182  int sub_idx = 0; // shift depending on loop counter
183  int idx; // index inside data array
184  float profile = 0.;
185  float ftscrunch = 0;
186  //
187  // threadIdx.x bin
188  // blockDim.x max number of bins
189  //
190  // loop on the subband/subints
191  for (int n = 0; n < nloop; n ++) {
192  sub_idx = n* blockDim.x;
193  idx = idx0 + sub_idx;
194  profile = in[idx];
195  ftscrunch += profile;
196  }
197  out[threadIdx.x + index * blockDim.x + ncand * blockDim.x * ntrial] = ftscrunch/nloop;
198 }
199 
200 //
201 //
202 //__global__ void perturbate_kernel(float *g_in, float *g_out,
203 // double delta_nu, double delta_dm,
204 // double delta_nudot, int candidate,
205 // int max_phases, int index)
206 // g_in the input data to be corrected
207 // g_out the output data pertubed
208 // ncand the candidate number
209 // delta_nu perturbation in freq.
210 // delta_dm perturbation in dm (perturbed = original*delta_dm)
211 // delta_pdot perturbation in pdot (perturbed = original*delta_pdot)
212 // candidate the candidate number
213 // max_phases maximum number of phases
214 // index the global index used to address the set of three parameters used
215 // in the optimization phase. This value is only used for
216 // debug purpose. To remove later!!
217 //
218 //
219 __global__ void perturbate_kernel(float *g_in, float *g_out,
220  float*phase_shift, int candidate,
221  int max_phases, int index)
222 {
223  //
224  // threadIdx.x phase bin
225  // blockIdx.x the sub-band number
226  // gridDim.x the number of subbands
227  // blockIdx.y the sub-integ number
228  // gridDim.y the number of sub-integ
229  //
230  int subband_idx = threadIdx.y + blockIdx.x * blockDim.y; // the sub-band index (0, 1, ...64)
231 
232  int nsubbands = gridDim.x * blockDim.y;
233  // start phase: the shift to the next profile of max_phases bins.
234  int start_phase = max_phases * subband_idx + blockIdx.y * max_phases * gridDim.x * blockDim.y +
235  + candidate * max_phases *gridDim.x * blockDim.y * gridDim.y;
236  int ibin = 0;
237  float ph_shift = phase_shift[subband_idx + blockIdx.y * nsubbands + candidate * nsubbands * gridDim.y];
238  int nbins = d_nbins[candidate];
239  int bin_off = (int)(ph_shift * nbins + 0.5); // the offset in bins of the perturbed profile
240  extern __shared__ float indata[];
241 
242  for (int idx = threadIdx.x; idx < max_phases; idx += blockDim.x) {
243  //null the output array
244  g_out[idx + start_phase] = 0;
245  indata[threadIdx.x] = 0;
246  }
247  __syncthreads();
248  for (int idx = threadIdx.x; idx < nbins; idx += blockDim.x) {
249  ibin = (idx + bin_off) % nbins;
250  indata[idx + max_phases * threadIdx.y] = g_in[start_phase + ibin];
251  g_out[idx + start_phase] = indata[idx + max_phases * threadIdx.y];
252  }
253 }
254 
255 //
256 //
257 //__global__ void compute_sn(float* profile, float* best, int max_phases, int cand,
258 // int nelem, float sigma)
259 //
260 //
261 // profile the profile array
262 // best the array to store best result during pertubation
263 // max_phases the max number of phases
264 // cand the candidate index
265 // nelem the total number of pertubation applied
266 // sigma the profile standard deviation less a nbins correction (see comment below)
267 //
268 //
269 // From the "Handbbok of Pulsar astronomy" (pag 167): the most commonly-used measure
270 // of profile significance is the signal to noise ratio, S/N ususally
271 // defined as:
272 //
273 // S/N = 1/(_sigma* (sqrt(Weff))) * sum (p_i - p_mean) for i = 0,.., nbins
274 //
275 // Weff the equivalent width (in bins) of a top-hat pulse with the
276 // same area and peak height as the observad profile
277 // p_i the amplitude of the i-th bin of the profile
278 // p_mean off pulse mean calculated for on the raw time series
279 // _sigma = sigma * (sqrt(nbins))
280 // sigma is the standard deviation calculated for the raw time series
281 // ( sqr( (<X^2> - <X>^2) / (num(X)-1) Where X are input
282 // values both in time and in frequency )
283 // nsamples the total number of samples in the time series
284 //
285 // The calculation of S/N is repetead for each (dm, nu, nudot) combination
286 // and for each combination is taken the max value.
287 // The number of combinations (trials) depends on the value of range and
288 // step choosen for each of the 3 dimensions.
289 //
290 // The kernel runs a thread for each trial.
291 // If the total number of trials is > 1024 (the max number of threads
292 // programmable for each SM), the threads (= trial) has to be divided in
293 // block.
294 
295 // OSS: the last block may contain a number of threads < 1024. So it
296 // is necessary to compare the global thread id number (= trial index)
297 // with the max number of trials passed to the kernel as argument.
298 
299 __global__ void compute_sn(float* profile, float* best, int max_phases, int ncand, int nelem, float sigma)
300 {
301  //
302  // threadIdx.x the trial number inside a thread block
303  // blockDimx.x the number of trials/per block
304  // blockIdx.x the block id:
305  // gridDim.x the number trials block
306  //
307  int width; // impulse width in bin
308  int nelem_per_block = blockDim.x; // the number of pertubations per thread block
309  float curval = 0; // the profile value inside the box
310  float sn = 0; // the signal-to-noise value
311  float b_width = 0; // the best pulse width in physical unit
312  float b_sn = - FLT_MAX; // the best signal-to-noise value
313  float maxval = 0.;
314  float _sigma; // standard deviation of the profile
315  int tid = threadIdx.x; // local thread id
316  int gid = threadIdx.x + blockIdx.x * blockDim.x; // global thread id = trial index
317  int start = gid * max_phases + ncand * nelem * max_phases;
318  extern float __shared__ result[];
319 
320  int nbins = d_nbins[ncand]; // the real number of phases for the current candidate
321  _sigma = sigma * sqrt((float)nbins);
322  //_sigma = sigma;
323  // Warning!!! Last arg should be sigma*sqrt((float) (nbins[ncand] * first_prebin))
324  // but in calling routine we do not have nbins[], so we make computation here
325  result[tid] = 0;
326 
327  if (gid < nelem) {
328  for (width = 1; width < nbins >> 1; width ++) {
329  curval = 0;
330  for (int ibin = 0; ibin < width; ibin++){ // initial guess
331  curval += profile[start + ibin % nbins];
332  }
333 
334  maxval = curval;
335  // To get maximum value we shift the 'active' window on the phase
336  // array. To save computation on each step we start from the value
337  // of preceding try and then we get the shift of window by dropping
338  // its first value and adding the first bin after its end. The
339  // assumed topology is toroidal: last phase bin is contigous to the first
340  for (int ibin = 0; ibin < nbins; ibin++){
341  int addbin = ibin + width;
342  curval -= profile[ start + ibin % nbins];
343  curval += profile[ start + addbin % nbins];
344  if (maxval < curval) {
345  maxval = curval;
346  }
347  }
348  sn = maxval /sqrt((float)width)/_sigma;
349  if (sn > b_sn) {
350  b_sn = sn;
351  b_width = width;
352  }
353  }
354  // store the best sn and width values in shared memory
355  //
356  result[tid] = b_sn;
357  result[tid + nelem_per_block] = b_width/(d_nu[ncand] * nbins);
358  __syncthreads();
359  // copy to global memory all the nelem values for sn and width
360  best[gid + ncand * nelem * 2] = result[tid];
361  best[gid + nelem + ncand * nelem * 2] = result[tid + nelem_per_block];
362  }
363  return ;
364 }
365 
366 //
367 // __global__ void transposeNoBankConflicts(float *odata, unsigned char *idata,
368 // int width, int height, int nsamp)
369 //
370 // Executes the transpose of input data and convert it from unsigned to float.
371 //
372 // odata the output data matrix
373 // idata the input data matrix
374 // width the matrix width (= pitch dimension to get aligned memory data)
375 // height the matrix heigth(= number of freq. channels)
376 // nsamp the number of samples per sub-integration and per channel
377 //
378 __global__ void transposeNoBankConflicts(float *odata, unsigned char *idata,
379  int width, int height, int nsamp)
380 {
381  extern __shared__ char tile[]; //shared memory size allocated dinamically
382  int tile_dimx = blockDim.x; // number of threads in x direction
383  int tile_dimy = blockDim.y; // number of threads in y direction
384 
385  // null the shared memory
386  tile[threadIdx.x + blockDim.x * threadIdx.y] = 0;
387  __syncthreads();
388 
389  int xIndex = blockIdx.x * tile_dimx + threadIdx.x;
390  int yIndex = blockIdx.y * tile_dimy + threadIdx.y;
391  int index_in = 0;
392  if ((xIndex < width) && (yIndex < nsamp)) {
393  index_in = xIndex + (yIndex)*width;
394  // convert from unsigned to signed
395  tile[threadIdx.y + blockDim.y * threadIdx.x] = idata[index_in] - 128;
396  }
397  __syncthreads();
398 
399  xIndex = blockIdx.y * tile_dimy + threadIdx.x;
400  yIndex = blockIdx.x * tile_dimx + threadIdx.y;
401  int index_out = xIndex + (yIndex)*height;
402  // convert to float
403  odata[index_out] = (float)tile[threadIdx.x + blockDim.x * threadIdx.y];
404 }
405 
406 //
407 // __global__ void transposePreBin(float *odata, unsigned char *idata,
408 // int width, int height,
409 // int nsamp, int prebin)
410 //
411 // Executes the transpose of input data and convert it from unsigned to
412 // float. If the prebinning value > 1, it executes also the data binning.
413 //
414 // odata the output data matrix
415 // idata the input data matrix
416 // in_height the input matrix number of rows (heigth) equal to the
417 // number of freq. channels
418 // out_width the outout matrix number of cols (width) equal to the
419 // pitch dimension to get aligned memory data
420 // nsamp the number of samples per sub-integration and per channel
421 // prebin the number of samples averaged as prebinning (1-8 power of 2)
422 //
423 __global__ void transposePreBin(float *odata, unsigned char *idata, int in_height,
424  int out_width, int nsamp, int prebin)
425 {
426  extern __shared__ char tile[]; //shared memory size allocated dinamically
427  int tile_dimx = blockDim.x; // number of threads in x direction
428  int tile_dimy = blockDim.y; // number of threads in y direction
429  //int prebin_index = threadIdx.x % prebin; // index inside prebinning group
430  // OSS: prebin is a power of 2 so modulo operation can be replaced
431  // with the next one
432  int prebin_index = threadIdx.x & (prebin - 1); // index inside prebinning group
433  float ftemp = 0.; // temporary float well
434 
435  // null the shared memory
436  tile[threadIdx.x + blockDim.x * threadIdx.y] = 0;
437  __syncthreads();
438 
439  int xIndex = blockIdx.x * tile_dimx + threadIdx.x;
440  int yIndex = blockIdx.y * tile_dimy + threadIdx.y;
441  int index_in = 0;
442  if ((xIndex < in_height) && (yIndex < nsamp)) {
443  index_in = xIndex + (yIndex)*in_height;
444  // convert from unsigned to signed
445  tile[threadIdx.y + blockDim.y * threadIdx.x] = idata[index_in] - 128;
446  }
447  __syncthreads();
448 
449  // we only proceed if first thread of prebinning group
450 
451  if (prebin_index == 0) {
452  xIndex = blockIdx.y * tile_dimy + threadIdx.x;
453  yIndex = blockIdx.x * tile_dimx + threadIdx.y;
454  //int index_out = xIndex/prebin + (yIndex)*height/prebin;
455  int index_out = xIndex/prebin + (yIndex)*out_width;
456  // convert to float and sum up
457  for (int i = 0 ; i < prebin ; i++) {
458  ftemp += (float)tile[threadIdx.x + i + blockDim.x * (threadIdx.y)];
459  }
460  odata[index_out] = ftemp/prebin;
461  }
462  return;
463 }
464 } //cuda
465 } //fldo
466 } //cheetah
467 } //ska
Some limits and constants for FLDO.
Definition: Brdz.h:35