#include <cuda_runtime.h>

/**
 * @brief Corner turn (2D transpose) of a flat array with per-element type
 *        conversion.
 *
 * Each thread handles at most one element. For flat thread index t:
 *   c = t % first_dimension   (fast index of the input)
 *   s = t / first_dimension   (slow index of the input)
 * the element at d_input[s * first_dimension + c] is converted to
 * OutputNumericalT and stored at d_output[c * second_dimension + s].
 * The read address equals t, so neighbouring threads read neighbouring input
 * elements; the writes are strided by second_dimension.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are used).
 * gridDim.x * blockDim.x must be at least first_dimension * second_dimension
 * for every element to be processed; surplus threads do nothing.
 *
 * All index arithmetic is done in std::size_t so that arrays with more than
 * 2^31 elements are addressed correctly (a 32-bit index would wrap and the
 * affected elements would be silently skipped).
 *
 * @tparam InputNumericalT   element type of the input array
 * @tparam OutputNumericalT  element type of the output array
 * @param d_input           input array, first_dimension * second_dimension elements
 * @param d_output          output array, same number of elements
 *                          NOTE(review): presumably a buffer distinct from
 *                          d_input - nothing here guards against aliasing.
 * @param first_dimension   extent of the fastest-varying input index
 * @param second_dimension  extent of the slowest-varying input index
 */
template <typename InputNumericalT, typename OutputNumericalT>
__global__ void simple_corner_turn_kernel(const InputNumericalT *d_input,
                                          OutputNumericalT *d_output,
                                          std::size_t first_dimension,
                                          std::size_t second_dimension)
{
    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise evaluated
    // in 32-bit unsigned arithmetic and can wrap for very large launches.
    const std::size_t t =
        static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // The guard also protects the division below when a dimension is zero.
    if (t < first_dimension * second_dimension)
    {
        const std::size_t c = t % first_dimension;
        const std::size_t s = t / first_dimension;
        d_output[c * second_dimension + s] =
            static_cast<OutputNumericalT>(d_input[s * first_dimension + c]);
    }
}
Some limits and constants for FLDO.