Cheetah - SKA - PSS - Prototype Time Domain Search Pipeline
CornerTurner.cu
1 /*
2  * The MIT License (MIT)
3  *
4  * Copyright (c) 2016 The SKA organisation
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "cheetah/fldo/cuda/detail/FldoUtils.h"
25 
26 
27 namespace ska {
28 namespace cheetah {
29 namespace fldo {
30 namespace cuda {
31 namespace util {
32 
33 /*
34  *
35  * void corner_turner_and_rebin(cudaDeviceProp gpu_properties, int first_bin_idx, uint64_t nsamp_subslot,
36  * size_t nchannels, std::vector<util::CandidateRebin> &rebin,
37  * unsigned char *d_in)
38  *
39  * @brief Executes the corner-turning of the input data and eventually
40  * sum-up adjacent time sample (rebinning).
41  *
42  * @param gpu_properties the device properties
43  * @param first_bin_idx the first rebinning index
44  * @param nsamp_subslot the number of time samples/sub-integration
45  * @param ncahnnels the number of freq. channels
46  * @param rebin the vector with the rebinning structures
47  * @param d_in the input data
48  *
49  * @return On failure throws a runtime_error exception.
50  */
51 void corner_turner_and_rebin(cudaDeviceProp gpu_properties, int first_bin_idx, uint64_t nsamp_subslot,
52  size_t nchannels, std::vector<util::CandidateRebin> &rebin, unsigned char *d_in)
53 {
54  //dim3 threadsPerBlock; // number of kernel threads per block
55  //dim3 blocksPerGrid; // number of kernel blocks for grid
56 
57  // get some properties of the device. These info are used during the
58  // configuration of GPU kernel launch.
59  size_t shared_mem_per_block =gpu_properties.sharedMemPerBlock;
60  int first_bin_val = 1; // first valid rebinning value
61  // set kernel launch configuration
62  int tile_dimx = 32; //number of threads in x
63  int tile_dimy = 32; //number of threads in y
64 
65  try
66  {
67  // NB:
68  // The transpose operation has to be done on the original input
69  // matrix. It means that even if there is no candidate with rebin = 1,
70  // we have to take into account the original matrix dimension.
71  // The starting input data matrix has:
72  // nrows = _nchannels
73  // ncols = rebin[0].pitch_dim (>= nsamp_subslot)
74  // rebin[0].pitch_dim is the matrix row length returned by the call to
75  // cudaMallocPitch()
76  //
77  int nblock_x = (nchannels + (tile_dimx - 1))/tile_dimx;
78  int nblock_y = (rebin[0].pitch_dim + (tile_dimy - 1))/tile_dimy;
79  dim3 grid(nblock_x, nblock_y);
80  dim3 threads(tile_dimx, tile_dimy);
81 
82  //get the size of the shared memory
83  size_t shared_memory_size = tile_dimx * tile_dimy * sizeof(char);
84 
85  //check if the reserved shared memory size is less than the available
86  //one
87  if (shared_memory_size > shared_mem_per_block) {
88  std::stringstream error_msg;
89  error_msg << "Shared memory requested size is too big (requested: "
90  << shared_memory_size
91  << " available: "
92  << shared_mem_per_block;
93  throw std::runtime_error(error_msg.str());
94  }
95 
96  //get the rebinning value corresponding to theinput rebin_index
97  first_bin_val = rebin[first_bin_idx].rebin;
98 
99  PANDA_LOG_DEBUG << "corner_turner: first bin index: "
100  << first_bin_idx
101  << " val: "
102  << first_bin_val
103  << " nsamp_subslot: "
104  << nsamp_subslot;
105  // we handle the prebinning. (we hope it doesn't mess up!)
106  // trasposePreBin kernel executes the matrix data transpose. If the
107  // first binning value is > 1, it also rebins the input data.
108  // The kernel is executed in a separate stream from the default one.
109  // Next operations on data have to wait for the end of this
110  // kernel.
111 #ifdef TRANSPOSE_SHFL
112  if (first_bin_val > 1) {
113  PANDA_LOG_DEBUG << "Call to transpose_shfl kernel: threads: ("
114  << tile_dimx << ", " << tile_dimy
115  << ") blocks: (" << nblock_x << ", " << nblock_y
116  << ") rebin: " << first_bin_val;
117  transposePreBin_shfl<<<grid, threads, shared_memory_size, rebin[first_bin_idx].stream>>>
118  (rebin[first_bin_idx].d_out,
119  d_in,
120  nchannels,
121  rebin[first_bin_idx].pitch_dim,
122  (int)nsamp_subslot,
123  first_bin_val);
124  } else {
125  PANDA_LOG_DEBUG << "Call to transpose kernel: threads: ("
126  << tile_dimx << ", " << tile_dimy
127  << ") blocks: (" << nblock_x << ", " << nblock_y
128  << ") rebin: " << first_bin_val;
129  transposeNoBankConflicts<<<grid,threads, shared_memory_size, rebin[first_bin_idx].stream>>>
130  (rebin[first_bin_idx].d_out,
131  d_in,
132  nchannels,
133  rebin[first_bin_idx].pitch_dim,
134  (int)nsamp_subslot);
135  }
136 #else
137  PANDA_LOG_DEBUG << "Call to transposePreBin kernel: threads: ("
138  << tile_dimx << ", " << tile_dimy
139  << ") blocks: (" << nblock_x << ", " << nblock_y
140  << ") rebin: " << first_bin_val;
141  transposePreBin<<<grid,threads, shared_memory_size, rebin[first_bin_idx].stream>>>
142  (rebin[first_bin_idx].d_out,
143  d_in,
144  nchannels,
145  rebin[first_bin_idx].pitch_dim,
146  (int)nsamp_subslot,
147  first_bin_val);
148 #endif
149  CUDA_ERROR_CHECK(cudaGetLastError());
150  CUDA_ERROR_CHECK(cudaEventRecord(rebin[first_bin_idx].event, rebin[first_bin_idx].stream));
151  }
152  catch (std::runtime_error &e) {
153  PANDA_LOG_ERROR << "Caught an exception of an unexpected type in corner_turn(): "
154  << e.what();
155  throw e;
156  }
157  catch (...) {
158  PANDA_LOG_ERROR << "Caught an exception of an unexpected type in corner_turn()";
159  throw panda::Error("Caught an exception of an unexpected type in corner_turn()");
160  }
161 
162 }
163 } // utils
164 } // namespace cuda
165 } // namespace fldo
166 } // namespace cheetah
167 } // namespace ska
Some limits and constants for FLDO.
Definition: Brdz.h:35