Cheetah - SKA - PSS - Prototype Time Domain Search Pipeline
FftWorker.cpp
1 /*
2  * The MIT License (MIT)
3  *
4  * Copyright (c) 2016 The SKA organisation
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "cheetah/fft/altera/detail/FftWorker.h"
25 #include "cheetah/fft/altera/Fft.h"
26 #include "panda/Error.h"
27 #include "panda/Log.h"
28 #include "panda/Copy.h"
29 #ifdef ENABLE_OPENCL
30 #include "panda/arch/altera/DevicePointer.h"
31 #include "panda/arch/altera/DeviceCopy.h"
32 #endif // ENABLE_OPENCL
33 #include <cmath>
34 
35 
36 namespace ska {
37 namespace cheetah {
38 namespace fft {
39 namespace altera {
40 
41 #ifdef ENABLE_OPENCL
42 template <typename T, typename InputAlloc, typename OutputAlloc>
43 void FftWorker::operator()(data::TimeSeries<cheetah::Fpga, T, InputAlloc> const& input
44  , data::FrequencySeries<cheetah::Fpga, FftWorker::Complex<T>
45  , OutputAlloc>& output)
46 {
47  auto const& twiddles=_cxft.eight_million_point().twiddles();
48  if(input.size() > twiddles.size()){
49  panda::Error e("Invalid FFT size! Kernel compiled for the size: ");
50  e << twiddles.size();
51  throw e;
52  }
53  size_t half_fft_size=twiddles.size()/2, sqrt_half_fft=int(sqrt(half_fft_size));
54 
55  panda::altera::DevicePointer<Complex<T>> dev_in_ev(_device, half_fft_size, *_first_queue);
56  panda::altera::DevicePointer<Complex<T>> dev_in_od(_device, half_fft_size, *_first_queue);
57  panda::altera::DevicePointer<Complex<T>> dev_temp(_device
58  , half_fft_size,*_data_queue);
59  panda::altera::DevicePointer<Complex<T>> dev_sig_ev(_device
60  , half_fft_size, *_data_queue);
61  panda::altera::DevicePointer<Complex<T>> dev_sig_od(_device
62  , half_fft_size, *_data_queue);
63  panda::altera::DevicePointer<Complex<T>> dev_twd(_device
64  , twiddles.size(), *_last_queue);
65  panda::copy(twiddles.cbegin(), twiddles.cend(), dev_twd.begin());
66 
67  /*
68  * @details: first stage separates even & odd samples of data,
69  * explicit finishing of the _first_queue makes even & odd input buffers available to the FFT kernels
70  */
71  (*_first_kernel)(*_first_queue, 1, 1, static_cast<cl_mem>(input.begin())
72  , (&*dev_in_ev), (&*dev_in_od), twiddles.size());
73  (*_first_queue).finish();
74 
75  /*
76  * @details: FFT kernels for 4M samples called for four times (2 loops)
77  * even FFT, odd FFT consisting of row operation+column operations
78  * even samples j=0; odd samples j=1; i=0 row FFT; i=1 column FFT
79  * explicit finishing of the _transpose_queue makes even & odd output buffers available to the last stage
80  * multiwire archietecture makes this FFT faster
81  */
82  cl_int mangle_int=0, twidle_int=1, inverse_int=0;
83  cl_uint rows_arg=sqrt_half_fft, columns_arg=sqrt_half_fft;
84  cl_int log_rows_arg=log2(sqrt_half_fft), log_columns_arg=log2(sqrt_half_fft);
85  int columns = (1 << log_columns_arg);
86  int rows = (1 << log_rows_arg);
87  float delta_const = -2.0f * (float)M_PI / (columns * rows);
88  for (int j = 0; j < 2; ++j){
89  for (int i = 0; i < 2; ++i){
90  twidle_int = !i;
91  (*_fetch_kernel)(*_fetch_queue, 1, 1
92  , i == 0 ? (j == 0 ?(&*dev_in_ev):(&*dev_in_od)) : (&*dev_temp)
93  , mangle_int, twidle_int
94  , log_rows_arg, log_columns_arg, rows_arg, columns_arg);
95  (*_fetch_mwt_kernel)(*_fetch_mwt_queue,1, 1, log_rows_arg, log_columns_arg, twidle_int);
96  (*_fft_kernel)(*_fft_queue,1, 1, inverse_int
97  , log_rows_arg, log_columns_arg, rows_arg, columns_arg);
98  (*_transpose_mwt_kernel)(*_transpose_mwt_queue, 1, 1, log_rows_arg, log_columns_arg);
99  (*_transpose_kernel)(*_transpose_queue, 1, 1
100  , i == 0 ? (&*dev_temp) : (j == 0 ? (&*dev_sig_ev) :(&*dev_sig_od))
101  , mangle_int, twidle_int, inverse_int
102  , log_rows_arg, log_columns_arg, rows_arg, columns_arg, delta_const);
103  }
104  }
105  (*_transpose_mwt_queue).finish();
106  (*_transpose_queue).finish();
107 
108  /*
109  * @details: last stage implements twiddle multiplications for 8M data,
110  * it doesn't compute one side of the spectrum for real to complex transform, saving resources
111  */
112  (*_last_kernel)(*&(_device.default_queue()), 1, 1
113  , (&*dev_sig_ev), (&*dev_sig_od), (&*dev_twd)
114  , static_cast<cl_mem>(output.begin()), half_fft_size);
115 
116 }
117 #else // ENABLE_OPENCL
118 template <typename T, typename InputAlloc, typename OutputAlloc>
119 void FftWorker::operator()(data::TimeSeries<cheetah::Fpga , T, InputAlloc> const&,
120  data::FrequencySeries<cheetah::Fpga, FftWorker::Complex<T>, OutputAlloc>&)
121 {
122 }
123 #endif // ENABLE_OPENCL
124 
125 } // namespace altera
126 } // namespace fft
127 } // namespace cheetah
128 } // namespace ska
Some limits and constants for FLDO.
Definition: Brdz.h:35