Cheetah - SKA - PSS - Prototype Time Domain Search Pipeline
RebinInputDataKernel.cu
/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2016 The SKA organisation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <float.h>

namespace ska {
namespace cheetah {
namespace fldo {
namespace cuda {
//__global__ void binInputKernel(float *odata, float *idata, int in_height, int in_width,
//                               int out_width, int prebin)
//
//
// odata      the output data matrix
// idata      the input data matrix
// in_height  the input matrix number of rows (height = num of freq. channels)
// in_width   the input matrix number of columns (width), equal to the input pitch_dim
// out_width  the output matrix number of columns (width), equal to the output pitch_dim
// prebin     the rebinning factor (2, 4, 8 or 16)
//
//
// This kernel executes the binning of the input matrix.
// When this kernel is executed, the input matrix has already been transposed. In
// this case:
//   x direction -> time samples
//   y direction -> freq. channels
// The input matrix and output matrix have the same height (4096 channels)
// and different widths. The two widths are not in the 2:1 proportion
// because the dimensions in the row direction of the input and output matrices
// are aligned to get coalesced i/o operations.
__global__ void binInputKernel(float *odata, float *idata, int in_height, int in_width,
                               int out_width, int prebin)
{
    extern __shared__ float Tile[];   // shared memory size allocated dynamically
    int Tile_dimx = blockDim.x;       // number of threads in x direction
    int Tile_dimy = blockDim.y;       // number of threads in y direction
    //int prebin_index = threadIdx.x % prebin; // index inside prebinning group
    // OSS: prebin is a power of 2, so the modulo operation can be replaced with
    // the next one
    int prebin_index = threadIdx.x & (prebin - 1); // index inside prebinning group
    float ftemp = 0.;                 // temporary accumulator

    // null the shared memory
    Tile[threadIdx.x + blockDim.x * threadIdx.y] = 0.;
    __syncthreads();

    int xIndex = blockIdx.x * Tile_dimx + threadIdx.x;
    int yIndex = blockIdx.y * Tile_dimy + threadIdx.y;
    int index_in = 0;
    // load data into shared memory
    if ((xIndex < in_width) && (yIndex < in_height)) {
        index_in = xIndex + (yIndex) * in_width;
        Tile[threadIdx.x + blockDim.x * threadIdx.y] = idata[index_in];
    }
    __syncthreads();

    // we only proceed if this is the first thread of the prebinning group
    int index_out = 0;
    if (prebin_index == 0) {
        index_out = xIndex/prebin + (yIndex)*out_width;
        // sum up the prebin samples of this group
        for (int i = 0 ; i < prebin ; i++) {
            ftemp += Tile[threadIdx.x + i + blockDim.x * threadIdx.y];
        }
        odata[index_out] = ftemp/prebin;
    }
    return;
}

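// A minimal host-side launch sketch for binInputKernel (illustration only, not
// part of the original file; the helper name, block shape and stream are
// assumptions). The kernel stores one float per thread in Tile[], so the dynamic
// shared memory size must be blockDim.x * blockDim.y * sizeof(float), and each
// prebin-wide group is summed from the shared memory of a single block, so
// blockDim.x must be a multiple of prebin. The output write is not bounds-checked,
// so the matrix dimensions are assumed to be multiples of the block dimensions.
void launch_binInputKernel(float* d_out, float* d_in, int in_height, int in_width,
                           int out_width, int prebin, cudaStream_t stream)
{
    dim3 block(32, 8);                                // 32 x 8 = 256 threads per block
    dim3 grid((in_width  + block.x - 1) / block.x,
              (in_height + block.y - 1) / block.y);
    size_t shmem = block.x * block.y * sizeof(float); // one float per thread
    binInputKernel<<<grid, block, shmem, stream>>>(d_out, d_in, in_height,
                                                   in_width, out_width, prebin);
}
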
//
// __global__ void transposePreBin_shfl(float *odata, unsigned char *idata, int in_height,
//                                      int out_width, int nsamp, int prebin)
//
// odata      the output data matrix
// idata      the input data matrix
// in_height  the input matrix number of rows (i.e. matrix height), equal to the
//            number of freq. channels
// out_width  the output matrix number of cols (i.e. matrix width), equal to the
//            pitch dimension to get aligned memory data
// nsamp      the number of samples per sub-integration and per channel
//            (<= input height = rebin[0].pitch_dim)
// prebin     the prebin factor
//
// Executes the transpose of the input data and converts it from unsigned char to float.
// The rebinning of the input matrix is done using the CUDA __shfl_up()
// function.
//
//
__global__ void transposePreBin_shfl(float *odata, unsigned char *idata, int in_height,
                                     int out_width, int nsamp, int prebin)
{
    extern __shared__ char tile[];    // shared memory size allocated dynamically
    int tile_dimx = blockDim.x;       // number of threads in x direction
    int tile_dimy = blockDim.y;       // number of threads in y direction
    int xIndex = blockIdx.x * tile_dimx + threadIdx.x;
    int yIndex = blockIdx.y * tile_dimy + threadIdx.y;
    int id = (gridDim.x * blockDim.x) * yIndex + xIndex; // the global thread id
    int lane_id = id % 32;
    float value = 0.;

    // null the shared memory
    tile[threadIdx.x + blockDim.x * threadIdx.y] = 0;
    __syncthreads();
    int index_in = xIndex + (yIndex)*in_height;
    // load data into shared memory
    if ((xIndex < in_height) && (yIndex < nsamp)) {
        tile[threadIdx.y + blockDim.y * threadIdx.x] = idata[index_in] - 128;
    }
    __syncthreads();
    // rebin data using shfl_up
    value = (float)tile[threadIdx.x + blockDim.x * (threadIdx.y)];
    for (int i = 1; i <= prebin/2; i *= 2) {
#if __CUDACC_VER_MAJOR__ >= 9
        float n = __shfl_up_sync(0xFFFFFFFF, value, i, 32);
#else
        float n = __shfl_up(value, i, 32);
#endif
        if (lane_id >= i) {
            value += n;
        }
    }
    // store the rebinned values into the output array
    if (((lane_id + 1) & (prebin - 1)) == 0) {
        xIndex = blockIdx.y * tile_dimy + threadIdx.x;
        yIndex = blockIdx.x * tile_dimx + threadIdx.y;
        int index_out = xIndex/prebin + (yIndex)*out_width;
        odata[index_out] = value/prebin;
    }
    return;
}

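// A minimal host-side launch sketch for transposePreBin_shfl (illustration only,
// not part of the original file; the helper name, block shape and stream are
// assumptions). The grid covers in_height (freq. channels) in x and nsamp (time
// samples) in y, the dynamic shared memory holds one char per thread as required
// by the tile[] indexing, and blockDim.x is kept at a multiple of 32 so that the
// lane_id computed from the global thread index matches the hardware warp lanes.
void launch_transposePreBin_shfl(float* d_out, unsigned char* d_in, int in_height,
                                 int out_width, int nsamp, int prebin,
                                 cudaStream_t stream)
{
    dim3 block(32, 8);                               // warp-aligned in x
    dim3 grid((in_height + block.x - 1) / block.x,
              (nsamp     + block.y - 1) / block.y);
    size_t shmem = block.x * block.y * sizeof(char); // one byte per thread
    transposePreBin_shfl<<<grid, block, shmem, stream>>>(d_out, d_in, in_height,
                                                         out_width, nsamp, prebin);
}
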
//__global__ void binInputKernel_shfl(float *odata, float *idata, int in_width,
//                                    int out_width, int prebin)
//
// odata      the output data matrix
// idata      the input data matrix
// in_width   the input matrix number of columns (width), equal to the input pitch_dim
// out_width  the output matrix number of columns (width), equal to the output pitch_dim
// prebin     the rebinning factor (2, 4, 8 or 16)
//
//
// This kernel executes the binning of the input matrix.
// When this kernel is executed, the input matrix has already been transposed. In
// this case:
//   x direction -> time samples
//   y direction -> freq. channels
// The input matrix and output matrix have the same height (4096 channels)
// and different widths. The two widths are not in the 2:1 proportion
// because the row dimension of the input and output matrices is aligned to get
// coalesced i/o operations.
//
// The rebinning of the input matrix is done using the CUDA __shfl_up()
// function.
//
__global__ void binInputKernel_shfl(float *odata, float *idata, int in_width,
                                    int out_width, int prebin)
{
    float value = 0.;    // input value
    int xIndex = (blockIdx.x * blockDim.x) + threadIdx.x;
    int yIndex = (blockIdx.y * blockDim.y) + threadIdx.y;
    int id = (gridDim.x * blockDim.x) * yIndex + xIndex; // the global thread id
    int lane_id = id % 32;

    // index of the element inside the input data array
    int index_in = xIndex + (yIndex) * in_width;
    value = idata[index_in];

    // sum prebin values
    for (int i = 1; i <= prebin/2; i *= 2) {
#if __CUDACC_VER_MAJOR__ >= 9
        float n = __shfl_up_sync(0xFFFFFFFF, value, i, 32);
#else
        float n = __shfl_up(value, i, 32);
#endif
        if (lane_id >= i) {
            value += n;
        }
    }
    // index of the element inside the output data array
    int index_out = xIndex/prebin + yIndex * out_width;
    if (((lane_id + 1) % prebin) == 0) {
        odata[index_out] = value/prebin;
    }
    return;
}
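
// Illustration (not part of the original file): the warp-level rebinning pattern
// shared by transposePreBin_shfl and binInputKernel_shfl, isolated in a toy
// single-warp kernel. After the log2(prebin) rounds of __shfl_up(_sync), each
// lane holds the inclusive sum of the preceding samples in the warp, so every
// lane whose (lane_id + 1) is a multiple of prebin holds exactly the sum of its
// prebin-sample group and can write the group mean. The kernel name and the
// single-warp launch are assumptions made for the example; prebin must be a
// power of 2 (2, 4, 8 or 16), as in the kernels above.
__global__ void warpRebinDemo(float *odata, const float *idata, int prebin)
{
    int lane_id = threadIdx.x & 31;        // lane index within the warp
    float value = idata[threadIdx.x];      // one input sample per lane

    // partial inclusive prefix sum, stopped after log2(prebin) steps
    for (int i = 1; i <= prebin/2; i *= 2) {
#if __CUDACC_VER_MAJOR__ >= 9
        float n = __shfl_up_sync(0xFFFFFFFF, value, i, 32);
#else
        float n = __shfl_up(value, i, 32);
#endif
        if (lane_id >= i) {
            value += n;
        }
    }

    // the last lane of each prebin-wide group writes the mean of that group
    if (((lane_id + 1) & (prebin - 1)) == 0) {
        odata[threadIdx.x / prebin] = value / prebin;
    }
}
// Example launch (one warp, 32 input samples): warpRebinDemo<<<1, 32>>>(d_out, d_in, 4);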
} //cuda
} //fldo
} //cheetah
} //ska