#include "cheetah/fldo/cuda/detail/FldoUtils.h"
#include <sstream>
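
/*
 * Configure and launch the data-folding kernel for one sub-integration.
 * Derives the launch geometry (threads per block, blocks per grid) from the
 * device properties and the candidate rebin layout, sizes the per-warp
 * shared-memory accumulators, and then launches folding_worker or
 * folding_worker_nosplit on exec_stream.
 */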
void fold_input_data(cudaDeviceProp gpu_properties,
                     float *d_folded,
                     float *d_weight,
                     int *nbins,
                     CandidateRebin const &rebin,
                     int ncand,
                     int isubint,
                     int nchannels,
                     int nsubbands,
                     uint64_t nsamp_subslot,
                     int default_max_phase,
                     double tobs,
                     bool enable_split,
                     cudaStream_t exec_stream)
{
    size_t warp_size = gpu_properties.warpSize;
    size_t max_threads_per_block = gpu_properties.maxThreadsPerBlock;
    size_t shared_mem_per_block = gpu_properties.sharedMemPerBlock;
    PANDA_LOG_DEBUG << "shared_mem_per_block: " << shared_mem_per_block;
    dim3 threadsPerBlock;
    dim3 blocksPerGrid;
    threadsPerBlock.x = 32;
    if (threadsPerBlock.x > (size_t)rebin.pitch_dim) {
        threadsPerBlock.x = rebin.pitch_dim;
    }
    threadsPerBlock.y = 8;
    if (nchannels/nsubbands < 8) {
        threadsPerBlock.y = nchannels/nsubbands;
    }
    blocksPerGrid.x = nsubbands;

    // Clamp the block so the total thread count stays within the device limit.
    size_t max_threads_x = max_threads_per_block / threadsPerBlock.y;
    if (threadsPerBlock.x > max_threads_x) {
        threadsPerBlock.x = max_threads_x;
    }
    // If pitch_dim does not divide evenly into the block width, add one more
    // thread column when the block still fits; otherwise fail.
    if ((rebin.pitch_dim % threadsPerBlock.x) != 0) {
        if (threadsPerBlock.x * threadsPerBlock.y < max_threads_per_block) {
            threadsPerBlock.x += 1;
        }
        else {
            std::stringstream error_msg;
            error_msg << "folding kernel: Invalid kernel parameters: threads for each SM: "
                      << threadsPerBlock.x * threadsPerBlock.y
                      << " (max allowed: " << max_threads_per_block << ")";
            PANDA_LOG_ERROR << error_msg.str();
            throw panda::Error(error_msg.str());
        }
    }

    int warp_count = threadsPerBlock.x * threadsPerBlock.y / warp_size;
    PANDA_LOG_DEBUG << "warp_count: " << warp_count;
    if (((threadsPerBlock.x * threadsPerBlock.y) % warp_size) != 0) {
        warp_count += 1; // a partial warp still needs its own accumulators
    }
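
    // Cap the number of phase bins each warp accumulates in shared memory;
    // candidates folded with few bins do not need the full default_max_phase.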
    int max_phase = default_max_phase;
    if (nbins[ncand] < 15) {
        max_phase = 16; // branch values inferred from the 15/31/63 thresholds
    }
    else if (nbins[ncand] < 31) {
        max_phase = 32;
    }
    else if (nbins[ncand] < 63) {
        max_phase = 64;
    }
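
    // Shared memory holds two floats per phase bin per warp: one for the
    // folded value and one for its weight (cf. the d_folded / d_weight
    // outputs), hence the factor of 2.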
    size_t shared_memory_size = max_phase * warp_count * sizeof(float) * 2;
    if (shared_memory_size > shared_mem_per_block) {
        std::stringstream err_msg;
        err_msg << "Shared memory requested size is too big (requested: "
                << shared_memory_size
                << ", available: " << shared_mem_per_block << ")";
        PANDA_LOG_ERROR << err_msg.str();
        throw panda::Error(err_msg.str());
    }
    int threadblock_memory = max_phase * warp_count;
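
    // Launch one of the two kernel variants; enable_split selects
    // folding_worker over folding_worker_nosplit. The full argument lists are
    // elided in this excerpt.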
    if (enable_split) {
        folding_worker<<<blocksPerGrid, threadsPerBlock, shared_memory_size, exec_stream>>>
            (/* ... */ tobs, default_max_phase, isubint /* , ... */);
    }
    else {
        folding_worker_nosplit<<<blocksPerGrid, threadsPerBlock, shared_memory_size, exec_stream>>>
            (/* ... */);
    }
    CUDA_ERROR_CHECK(cudaGetLastError());
}
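
/*
 * Illustrative sketch (not cheetah code): how a folding kernel can use the
 * per-warp shared-memory layout sized above (max_phase * warp_count floats
 * for values plus the same again for weights). All names here (toy_fold,
 * d_in, d_val, d_wt, period_samples) are hypothetical; the real
 * folding_worker kernels take many more parameters.
 */
__global__ void toy_fold(const float *d_in, float *d_val, float *d_wt,
                         int nsamp, int max_phase, double period_samples)
{
    extern __shared__ float fold_buf[]; // 2 * max_phase * warp_count floats

    int tid = threadIdx.y * blockDim.x + threadIdx.x;
    int nthreads = blockDim.x * blockDim.y;
    int warp_id = tid / warpSize;
    int lane = tid % warpSize;
    float *sums = fold_buf + 2 * max_phase * warp_id; // this warp's values
    float *wts = sums + max_phase;                    // this warp's weights

    // Zero this warp's accumulators.
    for (int i = lane; i < max_phase; i += warpSize) {
        sums[i] = 0.0f;
        wts[i] = 0.0f;
    }
    __syncthreads();

    // Fold: map each sample to a phase bin and accumulate into the warp's
    // shared buffers; shared-memory atomics resolve intra-warp collisions.
    for (int s = blockIdx.x * nthreads + tid; s < nsamp; s += gridDim.x * nthreads) {
        double phase = fmod((double)s, period_samples) / period_samples;
        int bin = min((int)(phase * max_phase), max_phase - 1);
        atomicAdd(&sums[bin], d_in[s]);
        atomicAdd(&wts[bin], 1.0f);
    }
    __syncthreads();

    // Merge the per-warp buffers into the global profile, rounding the warp
    // count up for a partial warp just as the host code does.
    int warp_count = (nthreads + warpSize - 1) / warpSize;
    for (int i = tid; i < max_phase; i += nthreads) {
        float v = 0.0f, w = 0.0f;
        for (int wrp = 0; wrp < warp_count; ++wrp) {
            v += fold_buf[2 * max_phase * wrp + i];
            w += fold_buf[2 * max_phase * wrp + i + max_phase];
        }
        atomicAdd(&d_val[i], v);
        atomicAdd(&d_wt[i], w);
    }
}
// Example launch, mirroring the sizing above:
//   toy_fold<<<blocks, threads, max_phase * warp_count * sizeof(float) * 2, stream>>>(...);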