Cheetah - SKA - PSS - Prototype Time Domain Search Pipeline
FoldInputData.cu
/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2016 The SKA organisation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "cheetah/fldo/cuda/detail/FldoUtils.h"
//#include "cheetah/cuda_utils/cuda_errorhandling.h"

namespace ska {
namespace cheetah {
namespace fldo {
namespace cuda {
namespace util {

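/*
 * Configure and launch the GPU folding kernel for one candidate and one
 * sub-integration on the given CUDA stream. The block/grid geometry is
 * derived from the device properties and from the candidate's rebinned
 * input layout.
 */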
void fold_input_data(cudaDeviceProp gpu_properties, float *d_folded, float *d_weight, int *nbins,
                     CandidateRebin const &rebin, int ncand, int isubint, int nchannels, int nsubbands,
                     uint64_t nsamp_subslot, int default_max_phase, double tobs, bool enable_split,
                     cudaStream_t exec_stream)
{
    dim3 threadsPerBlock; // number of kernel threads per block
    dim3 blocksPerGrid;   // number of kernel blocks in the grid

    // Get some properties of the device. This information is used when
    // configuring the GPU kernel launch.
    size_t warp_size = gpu_properties.warpSize;
    size_t max_threads_per_block = gpu_properties.maxThreadsPerBlock;
    size_t shared_mem_per_block = gpu_properties.sharedMemPerBlock;
    PANDA_LOG_DEBUG << "shared_mem_per_block: "
                    << shared_mem_per_block
                    << " warp_size: "
                    << warp_size;
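
    // Typical values (device dependent, stated here only as an assumption):
    // warp_size is 32 and max_threads_per_block is 1024 on most NVIDIA GPUs,
    // with 48 KiB of shared memory available per block by default.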

    // Threads in x represent the steps in time, so threadsPerBlock.x < nsamp_subslot.
    threadsPerBlock.x = 32;
    if (threadsPerBlock.x > (size_t)rebin.pitch_dim) {
        threadsPerBlock.x = rebin.pitch_dim; // should not happen!!
    }
    // Threads in y represent the steps in frequency inside each band, so
    // threadsPerBlock.y <= number of channels per band.
    threadsPerBlock.y = 8; // good value (32 x 8 = 256 threads/block)
    if (nchannels/nsubbands < 8) {
        threadsPerBlock.y = nchannels/nsubbands;
    }
    blocksPerGrid.x = nsubbands;
    blocksPerGrid.y = 1;
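    // Grid geometry: one thread block per sub-band in x and a single block
    // in y, so each block covers the channels of one sub-band.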
    // calculate the max number of threads in x
    size_t max_threads_x = max_threads_per_block/threadsPerBlock.y;

    if (threadsPerBlock.x > max_threads_x) {
        threadsPerBlock.x = max_threads_x;
    }
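    // For example, with max_threads_per_block = 1024 (a common limit, assumed
    // here) and threadsPerBlock.y = 8, max_threads_x = 128, well above the
    // initial threadsPerBlock.x of 32, so no clamping occurs.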
    // Take into account that the number of samples in each integration
    // is not necessarily a multiple of the thread dimension in x.
    if ((rebin.pitch_dim % threadsPerBlock.x) != 0) {
        if (threadsPerBlock.x * threadsPerBlock.y < max_threads_per_block) {
            threadsPerBlock.x += 1;
        } else {
            // too many threads for the block: can't run kernel
            std::stringstream error_msg;
            error_msg << "folding kernel: Invalid kernel parameters: threads per block: "
                      << threadsPerBlock.x * threadsPerBlock.y
                      << " (max: "
                      << max_threads_per_block
                      << ")";
            PANDA_LOG_ERROR << error_msg.str();
            throw panda::Error(error_msg.str());
        }
    }
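    // Example with assumed values: rebin.pitch_dim = 100 and
    // threadsPerBlock.x = 32 leave a remainder (100 % 32 = 4), so x is
    // bumped to 33 to give the kernel an extra thread column for the tail.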
    int warp_count = threadsPerBlock.x * threadsPerBlock.y/warp_size;
    if (((threadsPerBlock.x * threadsPerBlock.y) % warp_size) != 0) {
        warp_count += 1; // round up to a whole number of warps
    }
    PANDA_LOG_DEBUG << "warp_count : " << warp_count;
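    // Continuing the example above: threadsPerBlock.x = 33 and y = 8 give
    // 264 threads, i.e. 8 full warps plus a partial one, so warp_count = 9.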
    int max_phase = default_max_phase; // initialize to the default number of phase bins
    // adjust max_phase to the number of phase bins of the current candidate
    if (nbins[ncand] < 15) {
        max_phase = 16;
    } else if (nbins[ncand] < 31) {
        max_phase = 32;
    } else if (nbins[ncand] < 63) {
        max_phase = 64;
    }
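    // Example: a candidate with nbins[ncand] = 20 falls in the second range
    // and gets max_phase = 32; candidates with 63 or more bins keep the default.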
    // Configure the shared memory size.
    // Note: the factor of 2 is because we store both the partial folded data
    // and the phase weights in shared memory.
    size_t shared_memory_size = max_phase * warp_count * sizeof(float) * 2;
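    // E.g. max_phase = 64 and warp_count = 9 give 64 * 9 * 4 * 2 = 4608 bytes,
    // comfortably below a typical 48 KiB per-block limit (assumed above).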
    if (shared_memory_size > shared_mem_per_block) {
        std::stringstream err_msg;
        err_msg << "Requested shared memory size is too big (requested: "
                << shared_memory_size
                << " available: "
                << shared_mem_per_block
                << ")";
        PANDA_LOG_ERROR << err_msg.str();
        throw panda::Error(err_msg.str());
    }
    int threadblock_memory = max_phase * warp_count;
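    // threadblock_memory is the element count of one shared-memory plane;
    // the kernel keeps two such planes (partial folds and weights), matching
    // the factor of 2 in shared_memory_size above.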
    // The folding kernel is executed on a separate stream for each group of
    // candidates. This kernel starts only once the rebinning of the input
    // data (with the rebin factor of the current candidate) has completed.
    if (enable_split) {
        folding_worker<<< blocksPerGrid, threadsPerBlock, shared_memory_size, exec_stream >>>
            (rebin.d_out,
             d_folded,
             d_weight,
             rebin.pitch_dim,
             (int)nsamp_subslot,
             ncand,
             rebin.rebin,
             max_phase,
             warp_count,
             tobs,
             default_max_phase,
             isubint,
             threadblock_memory);
    } else {
        folding_worker_nosplit<<< blocksPerGrid, threadsPerBlock, shared_memory_size, exec_stream >>>
            (rebin.d_out,
             d_folded,
             d_weight,
             rebin.pitch_dim,
             (int)nsamp_subslot,
             ncand,
             rebin.rebin,
             max_phase,
             warp_count,
             tobs,
             default_max_phase,
             isubint,
             threadblock_memory);
    }
    CUDA_ERROR_CHECK(cudaGetLastError());
}

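// Example call, a minimal sketch with hypothetical values, assuming a
// populated CandidateRebin and device buffers allocated elsewhere:
//
//   cudaDeviceProp props;
//   cudaGetDeviceProperties(&props, 0);
//   fold_input_data(props, d_folded, d_weight, nbins, rebin,
//                   /*ncand*/ 0, /*isubint*/ 0, /*nchannels*/ 4096,
//                   /*nsubbands*/ 64, /*nsamp_subslot*/ 131072,
//                   /*default_max_phase*/ 128, /*tobs*/ 600.0,
//                   /*enable_split*/ true, stream);
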
} // namespace util
} // namespace cuda
} // namespace fldo
} // namespace cheetah
} // namespace ska