/**
 * Average `prebin` horizontally adjacent samples of a row-major float array.
 *
 * Every thread stages one input element in dynamic shared memory (0 where the
 * block hangs over the edge of the input).  After a block-wide barrier, the
 * first thread of each group of `prebin` consecutive threadIdx.x values sums
 * the group from shared memory and stores the mean.
 *
 * Launch requirements implied by the indexing below:
 *   - dynamic shared memory of blockDim.x * blockDim.y * sizeof(float) bytes;
 *   - `prebin` is a power of two (group leaders are selected with a bit mask);
 *   - blockDim.x is a multiple of `prebin`, otherwise a group would read past
 *     the end of its tile row.
 *
 * @param odata      output array, indexed as x + y * out_width
 * @param idata      input array, indexed as x + y * in_width
 * @param in_height  number of input rows
 * @param in_width   number of input columns
 * @param out_width  number of output columns (row stride of odata)
 * @param prebin     number of adjacent columns averaged into one output value
 */
__global__ void binInputKernel(float *odata, float *idata, int in_height, int in_width,
                               int out_width, int prebin)
{
    extern __shared__ float Tile[];

    const int tileSlot = threadIdx.x + blockDim.x * threadIdx.y;
    const int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    const bool inside = (xIndex < in_width) && (yIndex < in_height);

    // Each thread writes only its own slot, exactly once: either the input
    // sample or a zero pad.  No separate zero-fill pass is needed.
    Tile[tileSlot] = inside ? idata[xIndex + yIndex * in_width] : 0.0f;

    // The group leader reads slots written by its neighbours, so every store
    // must be visible before any thread starts summing.  The barrier sits
    // outside all branches so the whole block reaches it.
    __syncthreads();

    const int prebin_index = threadIdx.x & (prebin - 1);
    const int xOut = xIndex / prebin;

    // xOut < out_width keeps a trailing partial group from spilling into the
    // first element of the next output row.
    if (inside && (prebin_index == 0) && (xOut < out_width)) {
        float sum = 0.0f;
        for (int i = 0; i < prebin; i++) {
            sum += Tile[tileSlot + i];
        }
        odata[xOut + yIndex * out_width] = sum / prebin;
    }
}
/**
 * Transpose a block of 8-bit samples through shared memory, convert them to
 * signed floats (raw value - 128) and average `prebin` neighbouring values
 * with warp shuffles.
 *
 * Input element (x, y) is read from idata[x + y * in_height] for
 * x < in_height and y < nsamp.  The result for the transposed position is
 * written to odata[(x' / prebin) + y' * out_width], where x' runs along the
 * input y direction and y' along the input x direction.
 *
 * Launch requirements implied by the indexing below:
 *   - dynamic shared memory of blockDim.x * blockDim.y bytes;
 *   - blockDim.x == blockDim.y, since the output coordinates swap the two
 *     block dimensions (NOTE(review): confirm against the launch site);
 *   - blockDim.x * blockDim.y is a multiple of 32, because the shuffles use
 *     the full-warp mask 0xFFFFFFFF;
 *   - `prebin` is a power of two no larger than 32 and divides blockDim.x.
 *
 * @param odata      output array of binned floats, row stride out_width
 * @param idata      input array of unsigned 8-bit samples
 * @param in_height  extent of the input's fastest-varying axis
 * @param out_width  row stride (number of bins per row) of odata
 * @param nsamp      extent of the input's slowest-varying axis
 * @param prebin     number of neighbouring values averaged into one output
 */
__global__ void transposePreBin_shfl(float *odata, unsigned char *idata, int in_height,
                                     int out_width, int nsamp, int prebin)
{
    // Explicitly signed: samples are stored as (raw - 128), which is negative
    // for half the range, and plain `char` may be unsigned on some targets.
    extern __shared__ signed char tile[];

    const int tile_dimx = blockDim.x;
    const int tile_dimy = blockDim.y;
    const int xIndex = blockIdx.x * tile_dimx + threadIdx.x;
    const int yIndex = blockIdx.y * tile_dimy + threadIdx.y;

    // Lane within the warp, derived from the linear thread index inside the
    // block: this is the lane numbering the shuffle intrinsics operate on.
    const int lane_id = (threadIdx.x + blockDim.x * threadIdx.y) & 31;

    // Transposed store.  (tx, ty) -> ty + blockDim.y * tx visits every tile
    // slot exactly once, so writing 0 for out-of-range elements here pads the
    // tile without a separate zero-fill pass.
    const bool inside = (xIndex < in_height) && (yIndex < nsamp);
    tile[threadIdx.y + blockDim.y * threadIdx.x] =
        inside ? (signed char)(idata[xIndex + yIndex * in_height] - 128) : (signed char)0;

    // The row-major read below picks up a slot written by another thread.
    __syncthreads();

    float value = (float)tile[threadIdx.x + blockDim.x * threadIdx.y];

    // Running sum across lanes; after log2(prebin) steps the last lane of
    // each aligned group of `prebin` lanes holds that group's total.
    for (int i = 1; i <= prebin / 2; i *= 2) {
#if __CUDACC_VER_MAJOR__ >= 9
        float n = __shfl_up_sync(0xFFFFFFFF, value, i, 32);
#else
        float n = __shfl_up(value, i, 32);
#endif
        // Lanes below `i` get their own value back from the shuffle; adding
        // it would double-count.
        if (lane_id >= i) {
            value += n;
        }
    }

    if (((lane_id + 1) & (prebin - 1)) == 0) {
        // Output coordinates with the block's x and y roles swapped.
        const int xOut = (blockIdx.y * tile_dimy + threadIdx.x) / prebin;
        const int yOut = blockIdx.x * tile_dimx + threadIdx.y;

        // Threads that belong to grid padding must not write: yOut past
        // in_height is outside the output, and xOut past out_width would land
        // in the following row.
        if ((yOut < in_height) && (xOut < out_width)) {
            odata[xOut + yOut * out_width] = value / prebin;
        }
    }
}
/**
 * Average `prebin` horizontally adjacent samples of a row-major float array
 * using warp shuffles instead of shared memory.
 *
 * Each thread loads one input element; a running sum across lanes leaves the
 * total of every aligned group of `prebin` lanes in the group's last lane,
 * which stores the mean to odata[(x / prebin) + y * out_width].
 *
 * Launch requirements implied by the indexing below:
 *   - blockDim.x * blockDim.y is a multiple of 32, because the shuffles use
 *     the full-warp mask 0xFFFFFFFF;
 *   - `prebin` is a power of two no larger than 32 and divides blockDim.x, so
 *     lane groups line up with column groups;
 *   - the kernel receives no row count, so gridDim.y * blockDim.y must not
 *     exceed the number of rows in idata/odata.
 *
 * @param odata      output array, row stride out_width
 * @param idata      input array, row stride in_width
 * @param in_width   number of input columns
 * @param out_width  number of output columns (row stride of odata)
 * @param prebin     number of adjacent columns averaged into one output value
 */
__global__ void binInputKernel_shfl(float *odata, float *idata, int in_width,
                                    int out_width, int prebin)
{
    const int xIndex = (blockIdx.x * blockDim.x) + threadIdx.x;
    const int yIndex = (blockIdx.y * blockDim.y) + threadIdx.y;

    // Lane within the warp, derived from the linear thread index inside the
    // block: this is the lane numbering the shuffle intrinsics operate on.
    const int lane_id = (threadIdx.x + blockDim.x * threadIdx.y) & 31;

    // Columns past the end of the row contribute 0 instead of reading the
    // start of the next row.  There is deliberately no early return: every
    // lane named in the shuffle mask has to execute the shuffles.
    float value = (xIndex < in_width) ? idata[xIndex + yIndex * in_width] : 0.0f;

    for (int i = 1; i <= prebin / 2; i *= 2) {
#if __CUDACC_VER_MAJOR__ >= 9
        float n = __shfl_up_sync(0xFFFFFFFF, value, i, 32);
#else
        float n = __shfl_up(value, i, 32);
#endif
        // Lanes below `i` get their own value back from the shuffle; adding
        // it would double-count.
        if (lane_id >= i) {
            value += n;
        }
    }

    const int xOut = xIndex / prebin;
    // xOut < out_width keeps padded or partial groups from writing into the
    // first element of the following output row.
    if ((((lane_id + 1) % prebin) == 0) && (xOut < out_width)) {
        odata[xOut + yIndex * out_width] = value / prebin;
    }
}
Some limits and constants for FLDO.