| // This file is part of Eigen, a lightweight C++ template library |
| // for linear algebra. |
| // |
| // Copyright (C) 2015 Manjunath Kudlur <keveman@gmail.com> |
| // |
| // This Source Code Form is subject to the terms of the Mozilla |
| // Public License v. 2.0. If a copy of the MPL was not distributed |
| // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| |
| #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H |
| #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H |
| |
| #if defined(EIGEN_USE_GPU) |
| |
| namespace Eigen { |
| namespace internal { |
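| // The TensorExecutor specializations declared below intercept expressions of |
| // the form "output = input.reduce(...)" and "eval_to(buffer, reduce(...))" on |
| // a GpuDevice. When GetReductionType (defined later in this file) recognizes |
| // a single-axis float sum/max reduction on an sm_30+ device, the work is |
| // dispatched to the specialized row/column reduction kernels below; all other |
| // cases fall back to the generic EigenMetaKernel path. |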
| |
| template <typename OutExpr, typename InExpr, typename Op, typename Indices, |
| bool Tileable> |
| class TensorExecutor< |
| const TensorAssignOp< |
| OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, |
| GpuDevice, false, Tileable> { |
| public: |
| typedef const TensorAssignOp< |
| OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const> |
| Expression; |
| static void run(const Expression& expr, const GpuDevice& device); |
| }; |
| |
| template <typename OutExpr, typename InExpr, typename Op, typename Indices, |
| bool Tileable> |
| class TensorExecutor< |
| const TensorAssignOp< |
| OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, |
| GpuDevice, true, Tileable> { |
| public: |
| typedef const TensorAssignOp< |
| OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const> |
| Expression; |
| static void run(const Expression& expr, const GpuDevice& device); |
| }; |
| |
| template <typename InExpr, typename Op, typename Indices, bool Tileable> |
| class TensorExecutor<const TensorEvalToOp<const TensorReductionOp< |
| Op, const Indices, const InExpr> >, |
| GpuDevice, false, Tileable> { |
| public: |
| typedef const TensorEvalToOp< |
| const TensorReductionOp<Op, const Indices, const InExpr> > Expression; |
| static void run(const Expression& expr, const GpuDevice& device); |
| }; |
| |
| template <typename InExpr, typename Op, typename Indices, bool Tileable> |
| class TensorExecutor<const TensorEvalToOp<const TensorReductionOp< |
| Op, const Indices, const InExpr> >, |
| GpuDevice, true, Tileable> { |
| public: |
| typedef const TensorEvalToOp< |
| const TensorReductionOp<Op, const Indices, const InExpr> > Expression; |
| static void run(const Expression& expr, const GpuDevice& device); |
| }; |
| |
| } // end namespace internal |
| } // end namespace Eigen |
| |
| #if defined(__CUDACC__) |
| |
| namespace Eigen { |
| |
| namespace internal { |
| |
| namespace { |
| |
| // Initialize output[0..size-1] with val |
| template <typename Output> |
| __global__ void InitVector(const float val, int size, Output output) { |
| int idx = blockIdx.x * blockDim.x + threadIdx.x; |
| for (int i = idx; i < size; i += gridDim.x * blockDim.x) { |
| output.coeffRef(i) = val; |
| } |
| } |
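| // For example, the reduction launchers below use this to fill an output |
| // vector with the reducer's identity before atomically reducing into it: |
| //   LAUNCH_CUDA_KERNEL(InitVector<Output>, 32, 1024, 0, device, |
| //                      reducer.bottom_value(), cols, output); |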
| |
| // ----------------------------------------------------------------------------- |
| // Column Reduction kernels |
| // ----------------------------------------------------------------------------- |
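| // Each virtual "input point" i covers NUM_PER_THREAD consecutive rows of one |
| // column; threads grid-stride over these points and atomically fold their |
| // partial results into output[column]. As an illustrative example |
| // (hypothetical sizes): with rows = 1000, cols = 64 and NUM_PER_THREAD = 16 |
| // there are divup(1000, 16) * 64 = 63 * 64 = 4032 points, and point i = 130 |
| // covers column 2 (= 130 % 64) and rows 32..47. |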
| template <int GRID_DIM, int BLOCK_DIM, int NUM_PER_THREAD, typename Input, |
| typename Output, typename Reducer> |
| __global__ void ColumnReduceKernel(Reducer reducer, const Input input, int rows, |
| int cols, Output output) { |
| assert(blockDim.x == BLOCK_DIM); |
| assert(blockDim.y == 1); |
| assert(blockDim.z == 1); |
| |
| assert(gridDim.x == GRID_DIM); |
| assert(gridDim.y == 1); |
| assert(gridDim.z == 1); |
| |
| typedef typename Input::Index Index; |
| |
| const Index num_input_points = divup(rows, NUM_PER_THREAD) * cols; |
| const int bx = blockIdx.x; |
| const int tx = threadIdx.x; |
| |
| for (Index i = bx * BLOCK_DIM + tx; i < num_input_points; |
| i += BLOCK_DIM * GRID_DIM) { |
| const Index input_col = i % cols; |
| const Index input_row_begin = |
| ((i / cols) % divup(rows, NUM_PER_THREAD)) * NUM_PER_THREAD; |
| float reduced_val = reducer.bottom_value(); |
| for (int j = 0; j < NUM_PER_THREAD; ++j) { |
| float val = ((input_col < cols) && (input_row_begin + j < rows)) |
| ? input.coeff((input_row_begin + j) * cols + input_col) |
| : reducer.bottom_value(); |
| reduced_val = reducer(reduced_val, val); |
| } |
| #if __CUDA_ARCH__ >= 300 |
| reducer.atomic_reduce(&output.coeffRef(input_col), reduced_val); |
| #endif |
| } |
| } |
| |
| template <typename Input, typename Output, typename Reducer> |
| void ColumnReduceCuda(Reducer reducer, const GpuDevice& device, |
| const Input input, int rows, int cols, Output output) { |
| const int block_size = 256; |
| const int grid_size = 128; |
| const int num_per_thread = 16; |
| LAUNCH_CUDA_KERNEL(InitVector<Output>, 32, 1024, 0, device, reducer.bottom_value(), |
| cols, output); |
| LAUNCH_CUDA_KERNEL( |
| (ColumnReduceKernel<grid_size, block_size, num_per_thread, Input, Output, Reducer>), grid_size, |
| block_size, 0, device, reducer, input, rows, cols, output); |
| } |
| |
| // ----------------------------------------------------------------------------- |
| // Row Reduction kernel |
| // ----------------------------------------------------------------------------- |
| |
| // Template parameter to RowReduceKernel derived from the kernel's block and |
| // grid dimensions. |
| enum RowReduceDims { |
| ONE_WARP_PER_ROW, // gridDim.x == 1 && blockDim.x == warpSize |
| ONE_BLOCK_PER_ROW, // gridDim.x == 1 && blockDim.x > warpSize |
| MANY_BLOCKS_PER_ROW // gridDim.x > 1 |
| }; |
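| // With the launch configuration chosen by RowReduceCuda below (ElemsPerRow = |
| // 8, warp_size = 32, at most 4 warps per block), this works out to roughly: |
| //   cols <= 256         -> ONE_WARP_PER_ROW |
| //   256 < cols <= 1024  -> ONE_BLOCK_PER_ROW |
| //   cols > 1024         -> MANY_BLOCKS_PER_ROW |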
| |
| // Reduces along a matrix's rows to produce a column vector. |
| template <RowReduceDims Dims, bool MultipleRowsPerThread, typename Input, |
| typename Output, typename Reducer> |
| __global__ void RowReduceKernel(Reducer reducer, const Input input, int rows, |
| int cols, Output output) { |
| typedef typename Input::Index Index; |
| |
| // Keep this in sync with the constant in RowReduceCuda. |
| constexpr int ElemsPerRow = 8; |
| |
| // All threads in a warp must have the same threadIdx.y so they operate on the |
| // same row. |
| assert(blockDim.x % warpSize == 0); |
| // This is a 2D kernel. |
| assert(blockDim.z == 1); |
| assert(gridDim.z == 1); |
| |
| // Dims should be entirely determined by our block and grid dimensions. |
| if (blockDim.x == warpSize) { |
| assert(Dims == ONE_WARP_PER_ROW); |
| assert(gridDim.x == 1); |
| } else if (gridDim.x == 1) { |
| assert(Dims == ONE_BLOCK_PER_ROW); |
| } else { |
| assert(Dims == MANY_BLOCKS_PER_ROW); |
| } |
| |
| // The MultipleRowsPerThread template parameter should be entirely determined |
| // by our block/grid dims and the number of rows. |
| assert(MultipleRowsPerThread == (gridDim.y * blockDim.y < rows)); |
| |
| const Index first_row = blockIdx.y * blockDim.y + threadIdx.y; |
| const Index row_step = blockDim.y * gridDim.y; |
| #pragma nounroll |
| for (Index row = first_row; row < rows; row += row_step) { |
| // We have to be careful how we tell the compiler that we only make one trip |
| // through the loop if MultipleRowsPerThread is false: If Index is unsigned, |
| // the compiler won't assume that e.g. first_row + 1 does not overflow. But |
| // clang is smart enough to understand this. |
| if (!MultipleRowsPerThread && row != first_row) |
| break; |
| |
| float val = reducer.bottom_value(); |
| Index col_start = ElemsPerRow * (blockIdx.x * blockDim.x + threadIdx.x); |
| if (col_start + ElemsPerRow - 1 < cols) { |
| // Manually unrolling this loop and doing a tree reduction instead of a |
| // straight-line reduction doesn't have a noticeable performance impact on |
| // Kepler. |
| #pragma unroll 8 |
| for (Index col = col_start; col < col_start + ElemsPerRow; ++col) { |
| val = reducer(input.coeff(row * cols + col), val); |
| } |
| } |
| else { |
| #pragma nounroll |
| for (Index col = col_start; col < cols; ++col) { |
| val = reducer(input.coeff(row * cols + col), val); |
| } |
| } |
| |
| val = reducer(__shfl_down(val, 16), val); |
| val = reducer(__shfl_down(val, 8), val); |
| val = reducer(__shfl_down(val, 4), val); |
| val = reducer(__shfl_down(val, 2), val); |
| val = reducer(__shfl_down(val, 1), val); |
| |
| const int lane_id = threadIdx.x & (warpSize - 1);  // lane index within the warp |
| |
| // At this point, thread 0 in each warp has the reduced value for all threads |
| // in the warp. Time for us to write that value into output. |
| // |
| // If ONE_WARP_PER_ROW, only one warp wants to write to output[row], so we |
| // just write directly into output. If ONE_BLOCK_PER_ROW, we initialize |
| // output and then atomically reduce into it. And if MANY_BLOCKS_PER_ROW, we |
| // just do the atomic reduction -- our output was already initialized before |
| // this kernel started. |
| // |
| // (We could use shared memory to do the ONE_BLOCK_PER_ROW reduction, but the |
| // proper atomic reduction seems to be faster, at least on Kepler.) |
| if (Dims == ONE_WARP_PER_ROW) { |
| if (lane_id == 0) output.coeffRef(row) = val; |
| } else { |
| if (Dims == ONE_BLOCK_PER_ROW) { |
| output.coeffRef(row) = reducer.bottom_value(); |
| __syncthreads(); |
| } |
| if (lane_id == 0) { |
| #if __CUDA_ARCH__ >= 300 |
| reducer.atomic_reduce(&output.coeffRef(row), val); |
| #else |
| assert(false && "This kernel requires sm_30 or higher."); |
| #endif |
| } |
| } |
| } |
| } |
| |
| template <RowReduceDims Dims, typename Input, typename Output, typename Reducer> |
| void RowReduceCudaLaunchHelper(dim3 grid_size, dim3 block_size, Reducer reducer, |
| const GpuDevice& device, const Input input, |
| int rows, int cols, Output output) { |
| if (rows <= grid_size.y * block_size.y) { |
| LAUNCH_CUDA_KERNEL( |
| (RowReduceKernel<Dims, /* MultipleRowsPerThread = */ false, Input, |
| Output, Reducer>), |
| grid_size, block_size, 0, device, reducer, input, rows, cols, output); |
| } else { |
| LAUNCH_CUDA_KERNEL( |
| (RowReduceKernel<Dims, /* MultipleRowsPerThread = */ true, Input, |
| Output, Reducer>), |
| grid_size, block_size, 0, device, reducer, input, rows, cols, output); |
| } |
| } |
| |
| template <typename Input, typename Output, typename Reducer> |
| void RowReduceCuda(Reducer reducer, const GpuDevice& device, const Input input, |
| int rows, int cols, Output output) { |
| // Each thread in the kernel processes at most ElemsPerRow elements of each |
| // row it touches. This value must be kept in sync with the constant inside |
| // RowReduceKernel. |
| constexpr int ElemsPerRow = 8; |
| |
| // Maximum width and height of our grid of thread blocks. (sm_30 has a |
| // smaller limit on the x dimension, but this kernel requires sm_30+.) |
| constexpr uint32 MaxGridXDim = (static_cast<uint32>(1) << 31) - 1; |
| constexpr int32 MaxGridYDim = (1 << 16) - 1; |
| |
| // The maximum number of warps we'll put in a thread block. |
| constexpr int MaxWarpsPerBlock = 4; |
| |
| constexpr int warp_size = 32; // gcudacc doesn't define warpSize on the host. |
| |
| // We choose block_size.x so one block contains a whole row, if possible. If |
| // each row is processed by a single block, then we can avoid launching an |
| // InitVector kernel and instead initialize our output within the reduction |
| // kernel. If each row is processed by a single *warp*, we can further avoid |
| // doing an inter-warp reduction. |
| const int block_x_warps = |
| min(MaxWarpsPerBlock, divup(cols, ElemsPerRow * warp_size)); |
| const int block_y = MaxWarpsPerBlock / block_x_warps; |
| assert(block_y > 0); |
| const dim3 block_size = dim3(block_x_warps * warp_size, block_y, 1); |
| |
| // TODO(jlebar): Consider swapping the meaning of our block indices, such that |
| // the block at (x, y) actually processes the data at (y, x). |
| // |
| // Right now very large inputs cause us to process multiple rows per thread, |
| // but one thread never processes more than ElemsPerRow elements in a |
| // particular row. Given the option, we would rather reverse this and let a |
| // thread touch more elements in just one row, because we have to do an |
| // intra- and possibly inter-warp reduction for each row a thread touches. |
| // |
| // Since the grid's max x dimension is much larger than its max y dimension, |
| // letting the x dimension correspond to our row index would let us size our |
| // grid so that one thread never has to process more than one row. But at the |
| // moment this doesn't seem to matter because nobody seems to be reducing huge |
| // inputs. |
| // |
| // Note that if we do this, we could only switch the meaning of our *block* |
| // indices. The thread indices would have to remain as-is, in order to ensure |
| // that all threads in a warp touch the same row. |
| |
| const int grid_x = divup<int>(cols, ElemsPerRow * block_size.x); |
| // grid_x > MaxGridXDim implies that we have an input with at least |
| // warp_size * MaxWarpsPerBlock * ElemsPerRow * MaxGridXDim (about 2^41) |
| // elements. Until we have GPUs with 2TB of memory, we don't need to worry |
| // about this. |
| assert(grid_x <= MaxGridXDim && "Unsupported giant input."); |
| |
| const int orig_grid_y = divup<int>(rows, block_size.y); |
| int grid_y = numext::mini(orig_grid_y, MaxGridYDim); |
| |
| const dim3 grid_size(grid_x, grid_y, 1); |
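| // As an illustrative example (hypothetical sizes): for rows = 10000 and |
| // cols = 5000, block_x_warps = min(4, divup(5000, 256)) = 4, so block_size is |
| // (128, 1, 1); grid_x = divup(5000, 8 * 128) = 5 and grid_y = |
| // min(divup(10000, 1), 65535) = 10000, so grid_size is (5, 10000, 1) and the |
| // MANY_BLOCKS_PER_ROW path below is taken. |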
| |
| if (block_size.x == warp_size) { |
| RowReduceCudaLaunchHelper<ONE_WARP_PER_ROW>( |
| grid_size, block_size, reducer, device, input, rows, cols, output); |
| } else if (grid_size.x == 1) { |
| RowReduceCudaLaunchHelper<ONE_BLOCK_PER_ROW>( |
| grid_size, block_size, reducer, device, input, rows, cols, output); |
| } else { |
| // We only need to initialize our output if multiple blocks operate on the |
| // same row. Otherwise, the reduction kernel will handle the initialization |
| // itself. |
| LAUNCH_CUDA_KERNEL(InitVector<Output>, 32, 1024, 0, device, |
| reducer.bottom_value(), rows, output); |
| RowReduceCudaLaunchHelper<MANY_BLOCKS_PER_ROW>( |
| grid_size, block_size, reducer, device, input, rows, cols, output); |
| } |
| } |
| |
| // Provides arbitrary sum reductions: the functor F is applied to each |
| // element being reduced (the right-hand argument) before it is added to the |
| // running sum. |
| template <typename F> |
| struct FnSumReducer { |
| __host__ __device__ FnSumReducer(F f) : f_(f) {} |
| __host__ __device__ float bottom_value() { return 0.0f; } |
| __device__ float operator()(float x, float y) const { return x + f_(y); } |
| __device__ void atomic_reduce(float* x, float y) const { atomicAdd(x, y); } |
| |
| F f_; |
| }; |
| |
| // Identity is used for the basic SumReduction |
| struct Identity { |
| __device__ float operator()(float x) const { return x; } |
| }; |
| |
| struct CudaSumReducer : FnSumReducer<Identity> { |
| __host__ __device__ CudaSumReducer() : FnSumReducer(Identity()) {} |
| }; |
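| // A custom transform can be plugged in the same way. For instance, a |
| // hypothetical sum-of-squares reducer (not used in this file) could look like: |
| //   struct Square { |
| //     __device__ float operator()(float x) const { return x * x; } |
| //   }; |
| //   struct CudaSquaredSumReducer : FnSumReducer<Square> { |
| //     __host__ __device__ CudaSquaredSumReducer() : FnSumReducer(Square()) {} |
| //   }; |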
| |
| struct CudaMaxReducer { |
| // nvcc doesn't recognize numeric_limits<float>::lowest for some reason. |
| CudaMaxReducer() { |
| bottom_value_ = -3.40282347E+38F; // std::numeric_limits<float>::lowest(); |
| } |
| __host__ __device__ float bottom_value() { return bottom_value_; } |
| __device__ float operator()(float x, float y) const { return fmax(x, y); } |
| |
| // This is equivalent to atomicMax(x, y), but CUDA does not provide atomicMax |
| // for the float data type. Instead, this repeatedly compare-and-swaps y into |
| // *x: the loop finishes once the value observed at *x is already at least y |
| // (nothing left to do) or once a CAS succeeds in storing y; if another thread |
| // updated *x in the meantime, it retries with the freshly observed value. |
| __device__ void atomic_reduce(float* x, float y) { |
| unsigned int old_val = *reinterpret_cast<unsigned int*>(x); |
| while (*reinterpret_cast<float*>(&old_val) < y) { |
| unsigned int current_val = |
| atomicCAS(reinterpret_cast<unsigned int*>(x), old_val, |
| *reinterpret_cast<unsigned int*>(&y)); |
| if (old_val == current_val) { |
| break; |
| } |
| old_val = current_val; |
| } |
| } |
| float bottom_value_; |
| }; |
| |
| } // end namespace |
| |
| template <typename Op> |
| struct IsFloatSumReduction { |
| static const bool value = false; |
| }; |
| |
| template <> |
| struct IsFloatSumReduction<SumReducer<float> > { |
| static const bool value = true; |
| }; |
| |
| template <typename Op> |
| struct IsFloatMaxReduction { |
| static const bool value = false; |
| }; |
| |
| template <> |
| struct IsFloatMaxReduction<MaxReducer<float> > { |
| static const bool value = true; |
| }; |
| |
| template <typename Op> |
| struct SumOrMaxOfFloat { |
| static const bool value = |
| IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value; |
| }; |
| |
| enum ReductionType { ROW_REDUCE, COL_REDUCE, UNOPTIMIZED }; |
| |
| template <typename Op, typename Expr, typename ReductionExpr> |
| ReductionType GetReductionType(const Expr& expr, |
| const ReductionExpr& reduction_expr, |
| const GpuDevice& device, std::size_t* rows, |
| std::size_t* cols) { |
| typedef TensorEvaluator<const Expr, GpuDevice> EvalExpr; |
| typedef TensorEvaluator<const ReductionExpr, GpuDevice> ReductionEvalExpr; |
| |
| if (device.majorDeviceVersion() < 3) { |
| return UNOPTIMIZED; |
| } |
| const EvalExpr eval_expr(expr, device); |
| |
| // We only have fast reductions for sum/max of float. |
| if (!SumOrMaxOfFloat<Op>::value) { |
| return UNOPTIMIZED; |
| } |
| |
| if (ReductionEvalExpr::NumReducedDims == ReductionEvalExpr::NumInputDims) { |
| return UNOPTIMIZED; |
| } |
| |
| if (ReductionEvalExpr::NumReducedDims > 1) { |
| return UNOPTIMIZED; |
| } |
| |
| const std::size_t total_size = array_prod(eval_expr.dimensions()); |
| if (total_size == 0) { |
| return UNOPTIMIZED; |
| } |
| |
| const int dim = reduction_expr.dims()[0]; |
| if (static_cast<int>(ReductionEvalExpr::Layout) == |
| static_cast<int>(RowMajor)) { |
| if (dim == ReductionEvalExpr::NumInputDims - 1) { |
| *rows = total_size / |
| eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1]; |
| *cols = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1]; |
| if (*cols < 32) return UNOPTIMIZED; |
| return ROW_REDUCE; |
| } else if (dim == 0) { |
| *rows = eval_expr.dimensions()[0]; |
| *cols = total_size / eval_expr.dimensions()[0]; |
| if (*rows < 32) return UNOPTIMIZED; |
| return COL_REDUCE; |
| } |
| } else if (static_cast<int>(ReductionEvalExpr::Layout) == |
| static_cast<int>(ColMajor)) { |
| if (dim == ReductionEvalExpr::NumInputDims - 1) { |
| *rows = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1]; |
| *cols = total_size / |
| eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1]; |
| if (*rows < 32) return UNOPTIMIZED; |
| return COL_REDUCE; |
| } else if (dim == 0) { |
| *rows = total_size / eval_expr.dimensions()[0]; |
| *cols = eval_expr.dimensions()[0]; |
| if (*cols < 32) return UNOPTIMIZED; |
| return ROW_REDUCE; |
| } |
| } |
| return UNOPTIMIZED; |
| } |
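| // As an illustrative example (hypothetical shapes): reducing a RowMajor |
| // 64 x 128 float tensor with SumReducer<float> along dimension 1 (the inner |
| // dimension) yields *rows = 64, *cols = 128 and ROW_REDUCE, while reducing it |
| // along dimension 0 yields *rows = 64, *cols = 128 and COL_REDUCE. Reducing |
| // over all dimensions, reducing a non-float type, or running on a pre-sm_30 |
| // device falls back to UNOPTIMIZED. |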
| |
| template <typename Expression, typename Index, bool Vectorizable> |
| struct LaunchKernel; |
| |
| template <typename Expression, typename Index> |
| struct LaunchKernel<Expression, Index, true> { |
| static void launch(int num_blocks, int block_size, const GpuDevice& device, |
| const TensorEvaluator<Expression, GpuDevice>& evaluator, |
| Index size) { |
| LAUNCH_CUDA_KERNEL( |
| (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), |
| num_blocks, block_size, 0, device, evaluator, size); |
| } |
| }; |
| |
| template <typename Expression, typename Index> |
| struct LaunchKernel<Expression, Index, false> { |
| static void launch(int num_blocks, int block_size, const GpuDevice& device, |
| const TensorEvaluator<Expression, GpuDevice>& evaluator, |
| Index size) { |
| LAUNCH_CUDA_KERNEL( |
| (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), |
| num_blocks, block_size, 0, device, evaluator, size); |
| } |
| }; |
| |
| template <typename F, typename LHS, typename RHS, bool Compatible> |
| struct LaunchRowReduce; |
| |
| template <typename F, typename LHS, typename RHS> |
| struct LaunchRowReduce<F, LHS, RHS, true> { |
| static void launch(const GpuDevice& device, RHS input, std::size_t rows, |
| std::size_t cols, LHS output) { |
| RowReduceCuda(F(), device, input, rows, cols, output); |
| } |
| }; |
| |
| template <typename F, typename LHS, typename RHS> |
| struct LaunchRowReduce<F, LHS, RHS, false> { |
| static void launch(const GpuDevice& device, RHS input, std::size_t rows, |
| std::size_t cols, LHS output) {} |
| }; |
| |
| template <typename F, typename LHS, typename RHS, bool Compatible> |
| struct LaunchColReduce; |
| |
| template <typename F, typename LHS, typename RHS> |
| struct LaunchColReduce<F, LHS, RHS, true> { |
| static void launch(const GpuDevice& device, RHS input, std::size_t rows, |
| std::size_t cols, LHS output) { |
| ColumnReduceCuda(F(), device, input, rows, cols, output); |
| } |
| }; |
| |
| template <typename F, typename LHS, typename RHS> |
| struct LaunchColReduce<F, LHS, RHS, false> { |
| static void launch(const GpuDevice& device, RHS input, std::size_t rows, |
| std::size_t cols, LHS output) {} |
| }; |
| |
| template <typename Expression, typename Device, bool Vectorizable> |
| class TensorAssignExecutorHelper; |
| |
| template <typename OutExpr, typename InExpr, typename Op, typename Indices, |
| bool Vectorizable> |
| class TensorAssignExecutorHelper< |
| const TensorAssignOp< |
| OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, |
| GpuDevice, Vectorizable> { |
| public: |
| typedef const TensorAssignOp< |
| OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const> |
| Expression; |
| |
| typedef typename Expression::Index Index; |
| typedef TensorEvaluator<OutExpr, GpuDevice> LHSEval; |
| typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval; |
| static inline void run(const Expression& expr, const GpuDevice& device) { |
| std::size_t rows, cols; |
| const ReductionType reduction_type = |
| GetReductionType<Op>(expr.rhsExpression().expression(), |
| expr.rhsExpression(), device, &rows, &cols); |
| if (reduction_type == UNOPTIMIZED) { |
| TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); |
| const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); |
| if (needs_assign) { |
| const int num_blocks = device.getNumCudaMultiProcessors() * |
| device.maxCudaThreadsPerMultiProcessor() / |
| device.maxCudaThreadsPerBlock(); |
| const int block_size = device.maxCudaThreadsPerBlock(); |
| const Index size = array_prod(evaluator.dimensions()); |
| LaunchKernel<Expression, Index, Vectorizable>::launch( |
| num_blocks, block_size, device, evaluator, size); |
| } |
| evaluator.cleanup(); |
| } else { |
| LHSEval output(expr.lhsExpression(), device); |
| RHSEval input(expr.rhsExpression().expression(), device); |
| bool lhs_needs_assign = output.evalSubExprsIfNeeded(NULL); |
| bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL); |
| if (lhs_needs_assign && rhs_needs_assign) { |
| const bool Compatible = |
| IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value; |
| if (reduction_type == ROW_REDUCE) { |
| if (IsFloatSumReduction<Op>::value) { |
| LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval, |
| Compatible>::launch(device, input, rows, cols, |
| output); |
| } else if (IsFloatMaxReduction<Op>::value) { |
| LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval, |
| Compatible>::launch(device, input, rows, cols, |
| output); |
| } else { |
| // Unsupported reduction type |
| assert(false && "Unsupported reduction function for ROW_REDUCE"); |
| } |
| } else { |
| if (IsFloatSumReduction<Op>::value) { |
| LaunchColReduce<CudaSumReducer, LHSEval, RHSEval, |
| Compatible>::launch(device, input, rows, cols, |
| output); |
| } else if (IsFloatMaxReduction<Op>::value) { |
| LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval, |
| Compatible>::launch(device, input, rows, cols, |
| output); |
| } else { |
| // Unsupported reduction type |
| assert(false && "Unsupported reduction function for COL_REDUCE"); |
| } |
| } |
| } |
| input.cleanup(); |
| output.cleanup(); |
| } |
| } |
| }; |
| |
| template <typename OutExpr, typename InExpr, typename Op, typename Indices, |
| bool Tileable> |
| inline void TensorExecutor< |
| const TensorAssignOp< |
| OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, |
| GpuDevice, false, Tileable>::run(const Expression& expr, |
| const GpuDevice& device) { |
| TensorAssignExecutorHelper< |
| const TensorAssignOp< |
| OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, |
| GpuDevice, false>::run(expr, device); |
| } |
| |
| template <typename OutExpr, typename InExpr, typename Op, typename Indices, |
| bool Tileable> |
| inline void TensorExecutor< |
| const TensorAssignOp< |
| OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, |
| GpuDevice, true, Tileable>::run(const Expression& expr, |
| const GpuDevice& device) { |
| TensorAssignExecutorHelper< |
| const TensorAssignOp< |
| OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>, |
| GpuDevice, true>::run(expr, device); |
| } |
| |
| template <typename T, typename Index> |
| struct PtrWrapper { |
| EIGEN_DEVICE_FUNC PtrWrapper(T* ptr) : m_ptr(ptr) {} |
| EIGEN_DEVICE_FUNC T& coeffRef(Index i) { return *(m_ptr + i); } |
| T* m_ptr; |
| }; |
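| // PtrWrapper presents a raw device buffer through the same coeffRef() |
| // interface the reduction kernels use for TensorEvaluator outputs, so the |
| // EvalTo path below can reuse RowReduceCuda / ColumnReduceCuda unchanged. |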
| |
| template <typename Expression, typename Device, bool Vectorizable> |
| class TensorEvalToExecutorHelper; |
| |
| template <typename InExpr, typename Op, typename Indices, bool Vectorizable> |
| class TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp< |
| Op, const Indices, const InExpr> >, |
| GpuDevice, Vectorizable> { |
| public: |
| typedef const TensorEvalToOp<const TensorReductionOp< |
| Op, const Indices, const InExpr> > Expression; |
| typedef typename Expression::Index Index; |
| typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval; |
| |
| static inline void run(const Expression& expr, const GpuDevice& device) { |
| std::size_t rows, cols; |
| const ReductionType reduction_type = |
| GetReductionType<Op>(expr.expression().expression(), expr.expression(), |
| device, &rows, &cols); |
| if (reduction_type == UNOPTIMIZED) { |
| TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); |
| const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); |
| if (needs_assign) { |
| const int num_blocks = device.getNumCudaMultiProcessors() * |
| device.maxCudaThreadsPerMultiProcessor() / |
| device.maxCudaThreadsPerBlock(); |
| const int block_size = device.maxCudaThreadsPerBlock(); |
| const Index size = array_prod(evaluator.dimensions()); |
| LaunchKernel<Expression, Index, Vectorizable>::launch( |
| num_blocks, block_size, device, evaluator, size); |
| } |
| evaluator.cleanup(); |
| } else { |
| typedef typename internal::remove_const<typename Expression::Scalar>::type Scalar; |
| PtrWrapper<Scalar, Index> output(expr.buffer()); |
| TensorEvaluator<const InExpr, GpuDevice> input( |
| expr.expression().expression(), device); |
| typedef PtrWrapper<Scalar, Index> LHSEval; |
| typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval; |
| bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL); |
| if (rhs_needs_assign) { |
| const bool Compatible = |
| IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value; |
| if (reduction_type == ROW_REDUCE) { |
| if (IsFloatSumReduction<Op>::value) { |
| LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval, |
| Compatible>::launch(device, input, rows, cols, |
| output); |
| } else if (IsFloatMaxReduction<Op>::value) { |
| LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval, |
| Compatible>::launch(device, input, rows, cols, |
| output); |
| } |
| } else { |
| if (IsFloatSumReduction<Op>::value) { |
| LaunchColReduce<CudaSumReducer, LHSEval, RHSEval, |
| Compatible>::launch(device, input, rows, cols, |
| output); |
| } else if (IsFloatMaxReduction<Op>::value) { |
| LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval, |
| Compatible>::launch(device, input, rows, cols, |
| output); |
| } |
| } |
| } |
| input.cleanup(); |
| } |
| } |
| }; |
| |
| template <typename InExpr, typename Op, typename Indices, bool Tileable> |
| inline void |
| TensorExecutor<const TensorEvalToOp< |
| const TensorReductionOp<Op, const Indices, const InExpr> >, |
| GpuDevice, false, Tileable>::run(const Expression& expr, |
| const GpuDevice& device) { |
| TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp< |
| Op, const Indices, const InExpr> >, |
| GpuDevice, false>::run(expr, device); |
| } |
| |
| template <typename InExpr, typename Op, typename Indices, bool Tileable> |
| inline void |
| TensorExecutor<const TensorEvalToOp< |
| const TensorReductionOp<Op, const Indices, const InExpr> >, |
| GpuDevice, true, Tileable>::run(const Expression& expr, |
| const GpuDevice& device) { |
| TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp< |
| Op, const Indices, const InExpr> >, |
| GpuDevice, true>::run(expr, device); |
| } |
| |
| } // end namespace internal |
| |
| } // end namespace Eigen |
| |
| #endif // __CUDACC__ |
| #endif // EIGEN_USE_GPU |
| #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H |