// corner_turner_and_rebin: host-side launcher that sizes a 2D grid, checks the
// dynamic shared-memory request against the device limit, launches one of the
// transpose kernels on the stream belonging to rebin[first_bin_idx], and then
// records that entry's event on the same stream.
//
// NOTE(review): this listing has gaps -- the embedded original line numbers
// jump (e.g. 60 -> 77, 93 -> 97, 121 -> 125), so the opening brace, the try
// keyword, the definitions of tile_dimx / tile_dimy, most kernel arguments and
// the conditions separating the kernel launches are not visible here. Comments
// below describe only what the visible lines show.
//
// Parameters (as used in the visible lines):
//   gpu_properties - only sharedMemPerBlock is read.
//   first_bin_idx  - index into rebin selecting the entry whose rebin,
//                    pitch_dim, d_out, stream and event members are used.
//   nsamp_subslot  - appears only as a label in a debug log message here;
//                    presumably also passed to the kernels -- TODO confirm.
//   nchannels      - drives the x dimension of the launch grid.
//   rebin          - vector of util::CandidateRebin; entry 0 is read for the
//                    grid y dimension, entry first_bin_idx for everything else.
//   d_in           - not referenced in the visible lines; presumably the kernel
//                    input buffer -- TODO confirm.
//
// Throws std::runtime_error when the shared-memory request exceeds the
// per-block limit; a later handler rethrows as panda::Error.
24 #include "cheetah/fldo/cuda/detail/FldoUtils.h" 51 void corner_turner_and_rebin(cudaDeviceProp gpu_properties,
int first_bin_idx, uint64_t nsamp_subslot,
52 size_t nchannels, std::vector<util::CandidateRebin> &rebin,
unsigned char *d_in)
// Device limit used for the shared-memory check below.
59 size_t shared_mem_per_block =gpu_properties.sharedMemPerBlock;
// Defaults to 1; overwritten from rebin[first_bin_idx].rebin further down.
60 int first_bin_val = 1;
// Ceiling division: enough tile_dimx x tile_dimy blocks to cover nchannels in x
// and rebin[0].pitch_dim in y.
// NOTE(review): the y extent is taken from rebin[0].pitch_dim while the kernels
// receive rebin[first_bin_idx].pitch_dim -- confirm index 0 is intended when
// first_bin_idx != 0.
77 int nblock_x = (nchannels + (tile_dimx - 1))/tile_dimx;
78 int nblock_y = (rebin[0].pitch_dim + (tile_dimy - 1))/tile_dimy;
79 dim3 grid(nblock_x, nblock_y);
80 dim3 threads(tile_dimx, tile_dimy);
// Dynamic shared memory requested per block: one char per thread of the tile.
83 size_t shared_memory_size = tile_dimx * tile_dimy *
sizeof(char);
// Refuse to launch if the request exceeds the device per-block limit.
87 if (shared_memory_size > shared_mem_per_block) {
88 std::stringstream error_msg;
89 error_msg <<
"Shared memory requested size is too big (requested: " 92 << shared_mem_per_block;
93 throw std::runtime_error(error_msg.str());
// Rebin factor of the selected entry; it is logged and tested (> 1) below.
97 first_bin_val = rebin[first_bin_idx].rebin;
99 PANDA_LOG_DEBUG <<
"corner_turner: first bin index: " 103 <<
" nsamp_subslot: " 111 #ifdef TRANSPOSE_SHFL 112 if (first_bin_val > 1) {
// Built only when TRANSPOSE_SHFL is defined and taken when first_bin_val > 1:
// launch transposePreBin_shfl with the grid/block computed above.
113 PANDA_LOG_DEBUG <<
"Call to transpose_shfl kernel: threads: (" 114 << tile_dimx <<
", " << tile_dimy
115 <<
") blocks: (" << nblock_x <<
", " << nblock_y
116 <<
") rebin: " << first_bin_val;
117 transposePreBin_shfl<<<grid, threads, shared_memory_size, rebin[first_bin_idx].stream>>>
118 (rebin[first_bin_idx].d_out,
121 rebin[first_bin_idx].pitch_dim,
// Launch of transposeNoBankConflicts with the same configuration.
// NOTE(review): the condition guarding this branch is not visible in this
// listing; presumably the no-rebinning case -- TODO confirm.
125 PANDA_LOG_DEBUG <<
"Call to transpose kernel: threads: (" 126 << tile_dimx <<
", " << tile_dimy
127 <<
") blocks: (" << nblock_x <<
", " << nblock_y
128 <<
") rebin: " << first_bin_val;
129 transposeNoBankConflicts<<<grid,threads, shared_memory_size, rebin[first_bin_idx].stream>>>
130 (rebin[first_bin_idx].d_out,
133 rebin[first_bin_idx].pitch_dim,
// Launch of transposePreBin with the same configuration.
// NOTE(review): the guarding condition is not visible here either -- confirm
// against the full source.
137 PANDA_LOG_DEBUG <<
"Call to transposePreBin kernel: threads: (" 138 << tile_dimx <<
", " << tile_dimy
139 <<
") blocks: (" << nblock_x <<
", " << nblock_y
140 <<
") rebin: " << first_bin_val;
141 transposePreBin<<<grid,threads, shared_memory_size, rebin[first_bin_idx].stream>>>
142 (rebin[first_bin_idx].d_out,
145 rebin[first_bin_idx].pitch_dim,
// Check for an error from the launch, then record the entry's event on the
// stream the kernel was queued on. No synchronization call appears in the
// visible lines.
149 CUDA_ERROR_CHECK(cudaGetLastError());
150 CUDA_ERROR_CHECK(cudaEventRecord(rebin[first_bin_idx].event, rebin[first_bin_idx].stream));
// Error handling: std::runtime_error is caught and logged; a second handler
// (its catch clause is not visible in this listing) logs and throws
// panda::Error.
// NOTE(review): both log messages say "unexpected type", including the one in
// the std::runtime_error handler, and both name corner_turn() rather than
// corner_turner_and_rebin -- confirm the wording is intended.
152 catch (std::runtime_error &e) {
153 PANDA_LOG_ERROR <<
"Caught an exception of an unexpected type in corner_turn(): " 158 PANDA_LOG_ERROR <<
"Caught an exception of an unexpected type in corner_turn()";
159 throw panda::Error(
"Caught an exception of an unexpected type in corner_turn()");
Some limits and constants for FLDO.