// Kernel: convert raw 8-bit unsigned samples to zero-centered floats.
// NOTE(review): this chunk is an incomplete extraction -- the leading
// numerals (63, 64, ...) are original line numbers fused into the text,
// and the body's opening/closing braces fall on lines not visible here.
// Code left byte-identical; only comments added.
63 __global__
void input_converter(
unsigned char *g_in,
float *g_out,
int pitch_dim,
64 int local_measures,
int nchan)
// 2D launch: i runs over time samples / measures (x axis),
// j runs over frequency channels (y axis).
66 int i = threadIdx.x + blockIdx.x * blockDim.x;
67 int j = threadIdx.y + blockIdx.y * blockDim.y;
// Bounds guard for the grid tail (grid rarely divides data evenly).
69 if ((i < local_measures) && (j < nchan)) {
// Subtract the 128 offset to recenter the unsigned byte around zero,
// then widen to float. Rows are pitch_dim elements apart -- presumably
// a pitched allocation; confirm against the host-side alloc.
70 g_out[i + pitch_dim * j] = (float)g_in[i + pitch_dim *j] - 128;
// Kernel: normalize a folded profile by its per-bin hit counts and
// subtract the running mean, in place.
// NOTE(review): incomplete extraction -- original lines 90-104 (and the
// closing braces) are missing; the declarations of n, sub_idx, idx,
// counts and profile presumably live there. Code left byte-identical.
84 __global__
void normalize_kernel(
float *folded,
float *weight,
int ncand,
85 float mean,
int nsubbands)
// Base index for this thread: nsubbands contiguous chunks of
// blockDim.x bins per block, offset by the candidate slab.
88 int idx0 = threadIdx.x + blockIdx.x*blockDim.x * nsubbands + ncand *
89 gridDim.x * blockDim.x * nsubbands;
// Walk the nsubbands chunks owned by this thread's block.
105 for (n = 0; n < nsubbands; n ++) {
106 sub_idx = n * blockDim.x;
107 idx = idx0 + sub_idx;
108 counts = weight[idx];
// Skip bins with (near-)zero accumulated counts to avoid dividing
// by zero; such bins are left untouched in `folded`.
110 if (counts > FLT_EPSILON) {
112 profile = folded[idx];
113 profile = profile/counts - mean;
115 folded[idx] = profile;
// Kernel: collapse nloop consecutive chunks of the input into one
// averaged profile per (block, candidate) slab.
// NOTE(review): incomplete extraction -- declarations of sub_idx, idx,
// ftscrunch and profile, the load of `profile` (presumably
// profile = in[idx]), and the right-hand side of the final store are on
// omitted original lines (143-156, 160, 162, 164+). Code left byte-identical.
139 __global__
void profile_kernel(
float *in,
float *out,
int ncand,
int nloop)
// Base index: each block owns nloop chunks of blockDim.x bins,
// offset into the candidate's slab.
141 int idx0 = threadIdx.x + blockIdx.x * blockDim.x * nloop +
142 ncand * blockDim.x * gridDim.x * nloop;
157 for (
int n = 0; n < nloop; n ++) {
158 sub_idx = n* blockDim.x;
159 idx = idx0 + sub_idx;
// Accumulate across the nloop chunks.
161 ftscrunch += profile;
// Store the collapsed bin; RHS truncated here -- by analogy with
// best_profile_kernel it is presumably ftscrunch/nloop. TODO confirm
// against the full source.
163 out[threadIdx.x + blockIdx.x * blockDim.x + ncand * blockDim.x*gridDim.x] =
// Kernel: single-block variant of profile_kernel -- averages nloop
// chunks and stores the result into the slot for trial `index` of
// candidate `ncand` (out holds ntrial profiles per candidate).
// NOTE(review): incomplete extraction -- declarations of sub_idx, idx,
// ftscrunch, profile and the load of `profile` are on omitted original
// lines (182-190, 194, 196). Code left byte-identical.
178 __global__
void best_profile_kernel(
float *in,
float *out,
int ncand,
179 int nloop,
int index,
int ntrial)
// Base index: no blockIdx term, so this kernel is presumably launched
// with a single block per candidate -- TODO confirm at the call site.
181 int idx0 = threadIdx.x + ncand * blockDim.x * nloop;
191 for (
int n = 0; n < nloop; n ++) {
192 sub_idx = n* blockDim.x;
193 idx = idx0 + sub_idx;
// Accumulate across the nloop chunks.
195 ftscrunch += profile;
// Average and store into trial slot `index` of this candidate.
197 out[threadIdx.x + index * blockDim.x + ncand * blockDim.x * ntrial] = ftscrunch/nloop;
// Kernel: apply a per-subband phase shift to a folded profile by
// circularly rotating its bins through dynamic shared memory.
// NOTE(review): incomplete extraction -- original lines 222-229, 236,
// 241, 243, 246-247 and everything after 251 are missing; the
// declaration of `ibin` and any __syncthreads() between the zeroing
// loop and the rotation loop presumably live there. Code left
// byte-identical.
219 __global__
void perturbate_kernel(
float *g_in,
float *g_out,
220 float*phase_shift,
int candidate,
221 int max_phases,
int index)
// Subband index across (blockIdx.x, threadIdx.y); blockIdx.y appears
// to enumerate trials -- TODO confirm against the launch config.
230 int subband_idx = threadIdx.y + blockIdx.x * blockDim.y;
232 int nsubbands = gridDim.x * blockDim.y;
// Start of this subband's max_phases-bin window within the
// (trial, candidate) slab.
234 int start_phase = max_phases * subband_idx + blockIdx.y * max_phases * gridDim.x * blockDim.y +
235 + candidate * max_phases *gridDim.x * blockDim.y * gridDim.y;
// Fractional phase shift for this (subband, trial, candidate).
237 float ph_shift = phase_shift[subband_idx + blockIdx.y * nsubbands + candidate * nsubbands * gridDim.y];
// d_nbins is defined elsewhere in the file (presumably a __constant__
// or __device__ array of per-candidate bin counts -- TODO confirm).
238 int nbins = d_nbins[candidate];
// Convert fractional shift to a rounded whole-bin offset.
239 int bin_off = (int)(ph_shift * nbins + 0.5);
240 extern __shared__
float indata[];
// Zero the output window (and a shared slot) for this subband.
242 for (
int idx = threadIdx.x; idx < max_phases; idx += blockDim.x) {
244 g_out[idx + start_phase] = 0;
245 indata[threadIdx.x] = 0;
// Rotate: read bin (idx + bin_off) mod nbins, stage it in shared
// memory (one max_phases row per threadIdx.y), write it back at idx.
248 for (
int idx = threadIdx.x; idx < nbins; idx += blockDim.x) {
249 ibin = (idx + bin_off) % nbins;
250 indata[idx + max_phases * threadIdx.y] = g_in[start_phase + ibin];
251 g_out[idx + start_phase] = indata[idx + max_phases * threadIdx.y];
// Kernel: sliding-boxcar signal-to-noise search. For each trial pulse
// width, slide a boxcar sum around the circular profile, track the
// maximum, and keep the best S/N = max / sqrt(width) / sigma_scaled.
// NOTE(review): incomplete extraction -- original lines 300-307,
// 309-311, 313-314, 319, 322-327, 329, 332-339, 345-347, 349-356,
// 358-359 are missing; declarations of _sigma, width, curval, maxval,
// sn, b_sn, b_width, the best-so-far bookkeeping inside the maxval
// branch, and the store of result[tid] presumably live there. Code
// left byte-identical.
299 __global__
void compute_sn(
float* profile,
float* best,
int max_phases,
int ncand,
int nelem,
float sigma)
308 int nelem_per_block = blockDim.x;
// Best S/N seen so far, seeded at the smallest representable value.
312 float b_sn = - FLT_MAX;
315 int tid = threadIdx.x;
316 int gid = threadIdx.x + blockIdx.x * blockDim.x;
// Start of this trial's profile within the candidate slab.
317 int start = gid * max_phases + ncand * nelem * max_phases;
// Dynamic shared scratch; indexing below implies at least
// 2 * blockDim.x floats -- confirm the launch's smem size.
318 extern float __shared__ result[];
// Per-candidate bin count from the device-side table (defined
// elsewhere in the file).
320 int nbins = d_nbins[ncand];
// Scale the noise level by sqrt(nbins).
321 _sigma = sigma * sqrt((
float)nbins);
// Try all boxcar widths up to half the profile.
328 for (width = 1; width < nbins >> 1; width ++) {
// Prime the boxcar with the first `width` bins.
330 for (
int ibin = 0; ibin < width; ibin++){
331 curval += profile[start + ibin % nbins];
// Slide the boxcar once around the circular profile: drop the
// trailing bin, add the leading one, track the max sum.
340 for (
int ibin = 0; ibin < nbins; ibin++){
341 int addbin = ibin + width;
342 curval -= profile[ start + ibin % nbins];
343 curval += profile[ start + addbin % nbins];
344 if (maxval < curval) {
// Matched-filter S/N for this width.
348 sn = maxval /sqrt((
float)width)/_sigma;
// Second half of `result` holds the best width converted to time
// via d_nu (per-candidate frequency table defined elsewhere).
357 result[tid + nelem_per_block] = b_width/(d_nu[ncand] * nbins);
// Publish best S/N and best width for this trial; result[tid] is
// presumably written on an omitted line. `best` holds 2*nelem values
// per candidate: [S/N ... | width ...].
360 best[gid + ncand * nelem * 2] = result[tid];
361 best[gid + nelem + ncand * nelem * 2] = result[tid + nelem_per_block];
// Kernel: transpose a width x nsamp unsigned-char matrix into a float
// matrix, recentering each byte by -128, via a shared-memory tile.
// NOTE(review): incomplete extraction -- original lines 380, 384-385,
// 387-388, 391, 394, 396-398, 402 and the closing braces are missing;
// any __syncthreads() between the tile write (orig 395) and the tile
// read (orig 403) presumably sits in the omitted 396-398 span -- TODO
// confirm, a barrier is required there. Code left byte-identical.
378 __global__
void transposeNoBankConflicts(
float *odata,
unsigned char *idata,
379 int width,
int height,
int nsamp)
// Dynamic shared tile of blockDim.x * blockDim.y bytes.
381 extern __shared__
char tile[];
382 int tile_dimx = blockDim.x;
383 int tile_dimy = blockDim.y;
// Pre-clear this thread's slot so out-of-range tiles read as zero.
386 tile[threadIdx.x + blockDim.x * threadIdx.y] = 0;
389 int xIndex = blockIdx.x * tile_dimx + threadIdx.x;
390 int yIndex = blockIdx.y * tile_dimy + threadIdx.y;
// Guarded, coalesced load; store transposed within the tile and
// recenter the unsigned byte around zero.
392 if ((xIndex < width) && (yIndex < nsamp)) {
393 index_in = xIndex + (yIndex)*width;
395 tile[threadIdx.y + blockDim.y * threadIdx.x] = idata[index_in] - 128;
// Recompute indices for the transposed output position.
399 xIndex = blockIdx.y * tile_dimy + threadIdx.x;
400 yIndex = blockIdx.x * tile_dimx + threadIdx.y;
401 int index_out = xIndex + (yIndex)*height;
// Write the transposed element out as float.
403 odata[index_out] = (float)tile[threadIdx.x + blockDim.x * threadIdx.y];
// Kernel: transpose an in_height x nsamp unsigned-char matrix while
// averaging `prebin` adjacent elements into each float output value
// (pre-binning), via a shared-memory tile.
// NOTE(review): incomplete extraction -- original lines 425, 429-431,
// 433-435, 437-438, 441, 444, 446-450, 454, 456, 459 and the closing
// braces are missing; declarations of index_in and ftemp, and any
// __syncthreads() between the tile write (orig 445) and the tile reads
// (orig 458), presumably live there -- TODO confirm the barrier. Code
// left byte-identical.
423 __global__
void transposePreBin(
float *odata,
unsigned char *idata,
int in_height,
424 int out_width,
int nsamp,
int prebin)
// Dynamic shared tile of blockDim.x * blockDim.y bytes.
426 extern __shared__
char tile[];
427 int tile_dimx = blockDim.x;
428 int tile_dimy = blockDim.y;
// Position within the prebin group; the bitmask only works when
// prebin is a power of two -- TODO confirm callers guarantee this.
432 int prebin_index = threadIdx.x & (prebin - 1);
// Pre-clear this thread's slot so out-of-range tiles read as zero.
436 tile[threadIdx.x + blockDim.x * threadIdx.y] = 0;
439 int xIndex = blockIdx.x * tile_dimx + threadIdx.x;
440 int yIndex = blockIdx.y * tile_dimy + threadIdx.y;
// Guarded, coalesced load; store transposed within the tile and
// recenter the unsigned byte around zero.
442 if ((xIndex < in_height) && (yIndex < nsamp)) {
443 index_in = xIndex + (yIndex)*in_height;
445 tile[threadIdx.y + blockDim.y * threadIdx.x] = idata[index_in] - 128;
// Only the first thread of each prebin group writes an output value.
451 if (prebin_index == 0) {
452 xIndex = blockIdx.y * tile_dimy + threadIdx.x;
453 yIndex = blockIdx.x * tile_dimx + threadIdx.y;
// Output column is the binned coordinate xIndex/prebin.
455 int index_out = xIndex/prebin + (yIndex)*out_width;
// Sum the prebin adjacent tile elements...
457 for (
int i = 0 ; i < prebin ; i++) {
458 ftemp += (float)tile[threadIdx.x + i + blockDim.x * (threadIdx.y)];
// ...and store their mean.
460 odata[index_out] = ftemp/prebin;
// Some limits and constants for FLDO.