// Cheetah - SKA - PSS - Prototype Time Domain Search Pipeline
// CandidatesProfiles.cu
1 
2 /*
3  * The MIT License (MIT)
4  *
5  * Copyright (c) 2016 The SKA organisation
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in all
15  * copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  * SOFTWARE.
24  */
25 #include "cheetah/fldo/cuda/detail/FldoUtils.h"
26 
27 
28 namespace ska {
29 namespace cheetah {
30 namespace fldo {
31 namespace cuda {
32 namespace util {
33 
/*
 * void build_scrunched_profiles(size_t ncandidates, size_t max_phases,
 * size_t nsubbands, size_t nsubints, float mean, float *d_folded, float *d_weight,
 * float *d_outfprof, float* d_outprof, std::vector<util::GpuStream>&exec_stream)
 *
 * @brief Normalizes the profiles produced by the folding algorithm and
 * produces the reduced profiles for each pulsar candidate.
 *
 * @param ncandidates the number of pulsar candidates
 * @param max_phases the default max number of phases (128 or 256)
 * @param nsubbands the number of frequency sub-bands
 * @param nsubints the number of sub-integrations in time
 * @param mean the input data mean value
 * @param d_folded device memory with folded data for all candidates
 * @param d_weight device memory with weights of phases
 * @param d_outfprof device memory to store profiles scrunched in freq
 * @param d_outprof device memory to store profile scrunched in freq and time
 * @param exec_stream an array of gpu streams to run kernels concurrently;
 *        must be non-empty (candidates are round-robined over the streams)
 *
 * @return On failure throws a runtime_error exception.
 */
void build_scrunched_profiles(size_t ncandidates, size_t max_phases,
 size_t nsubbands, size_t nsubints, float mean, float *d_folded, float *d_weight,
 float *d_outfprof, float* d_outprof, std::vector<util::GpuStream>&exec_stream)
{
 // The launch configuration is identical for every candidate, so set it up
 // once, outside the per-candidate loop.
 // normalize_kernel and the frequency-scrunch profile_kernel use one thread
 // per bin (phase) and one block per sub-integration.
 dim3 threadsPerBlock(max_phases, 1); //number of threads for each block
 dim3 blocksPerGrid(nsubints, 1); //number of blocks for each grid
 // the final time-scrunch reduction runs in a single block
 dim3 reduceBlocksPerGrid(1, 1);
 // calculate the gpu shared memory size for the default max number of phases
 int shared_memsize = max_phases * sizeof(float);
 // NOTE: log after the configuration is assigned, so the real launch
 // geometry is reported (previously this printed default-constructed values).
 PANDA_LOG_DEBUG << "Calling kernel to build profiles: threads= ("
 << threadsPerBlock.x
 << ", "
 << threadsPerBlock.y
 << ") blocks= ("
 << blocksPerGrid.x
 << ", "
 << blocksPerGrid.y
 << ")";
 PANDA_LOG_DEBUG << "raw time series mean: " << mean;
 //loop on candidates: each candidate handled by a different CUDA stream
 for (size_t ncand = 0; ncand < ncandidates; ++ncand) {
 //associate a gpu stream to each candidate kernel (round-robin)
 int nstream = ncand % exec_stream.size();
 // launch normalization kernel: rescales the folded data in place using
 // the per-phase weights; the mean is spread over the sub-integrations
 normalize_kernel<<<blocksPerGrid, threadsPerBlock, 0, exec_stream[nstream].stream() >>>
 (d_folded,
 d_weight,
 ncand,
 mean/nsubints,
 nsubbands);
 //check for kernel launch errors
 CUDA_ERROR_CHECK(cudaGetLastError());
 // launch kernel for profile: scrunch in frequency, producing one profile
 // per sub-integration
 profile_kernel<<< blocksPerGrid, threadsPerBlock, shared_memsize, exec_stream[nstream].stream() >>>
 (d_folded,
 d_outfprof,
 ncand,
 nsubbands);
 //check for kernel launch errors
 CUDA_ERROR_CHECK(cudaGetLastError());
 // reduce kernel for profile: scrunch the per-subint profiles in time
 // down to a single profile (single-block launch)
 profile_kernel<<< reduceBlocksPerGrid, threadsPerBlock, shared_memsize, exec_stream[nstream].stream() >>>
 (d_outfprof,
 d_outprof,
 ncand,
 nsubints);
 //check for kernel launch errors
 CUDA_ERROR_CHECK(cudaGetLastError());
 }
}
116 
117 } // util
118 } // namespace cuda
119 } // namespace fldo
120 } // namespace cheetah
121 } // namespace ska
// Some limits and constants for FLDO (see Brdz.h:35).