// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Manjunath Kudlur <keveman@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#if defined(EIGEN_USE_GPU)
namespace Eigen {
namespace internal {
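// Partial specializations of TensorExecutor for expressions that assign or
// evaluate a tensor reduction on the GPU. Only run() is declared here; the
// definitions follow below, guarded by __CUDACC__.
//
// Sketch of the kind of user-level expression that is dispatched here
// (d_in and d_out are hypothetical device pointers, gpu_device a GpuDevice):
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, rows, cols);
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, cols);
//   Eigen::array<int, 1> reduce_dims{{0}};
//   out.device(gpu_device) = in.sum(reduce_dims);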
template <typename OutExpr, typename InExpr, typename Op, typename Indices,
bool Tileable>
class TensorExecutor<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, false, Tileable> {
public:
typedef const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>
Expression;
static void run(const Expression& expr, const GpuDevice& device);
};
template <typename OutExpr, typename InExpr, typename Op, typename Indices,
bool Tileable>
class TensorExecutor<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, true, Tileable> {
public:
typedef const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>
Expression;
static void run(const Expression& expr, const GpuDevice& device);
};
template <typename InExpr, typename Op, typename Indices, bool Tileable>
class TensorExecutor<const TensorEvalToOp<const TensorReductionOp<
Op, const Indices, const InExpr> >,
GpuDevice, false, Tileable> {
public:
typedef const TensorEvalToOp<
const TensorReductionOp<Op, const Indices, const InExpr> > Expression;
static void run(const Expression& expr, const GpuDevice& device);
};
template <typename InExpr, typename Op, typename Indices, bool Tileable>
class TensorExecutor<const TensorEvalToOp<const TensorReductionOp<
Op, const Indices, const InExpr> >,
GpuDevice, true, Tileable> {
public:
typedef const TensorEvalToOp<
const TensorReductionOp<Op, const Indices, const InExpr> > Expression;
static void run(const Expression& expr, const GpuDevice& device);
};
} // end namespace internal
} // end namespace Eigen
#if defined(__CUDACC__)
namespace Eigen {
namespace internal {
namespace {
// Initialize output[0..size-1] with val
template <typename Output>
__global__ void InitVector(const float val, int size, Output output) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = idx; i < size; i += gridDim.x * blockDim.x) {
output.coeffRef(i) = val;
}
}
// -----------------------------------------------------------------------------
// Column Reduction kernels
// -----------------------------------------------------------------------------
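// Reduces along a matrix's columns to produce a row vector. Each thread
// accumulates NUM_PER_THREAD consecutive row entries of one column and then
// atomically folds its partial result into output[column]. The output must
// already be initialized with the reducer's bottom_value().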
template <int GRID_DIM, int BLOCK_DIM, int NUM_PER_THREAD, typename Input,
typename Output, typename Reducer>
__global__ void ColumnReduceKernel(Reducer reducer, const Input input, int rows,
int cols, Output output) {
assert(blockDim.x == BLOCK_DIM);
assert(blockDim.y == 1);
assert(blockDim.z == 1);
assert(gridDim.x == GRID_DIM);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
typedef typename Input::Index Index;
const Index num_input_points = divup(rows, NUM_PER_THREAD) * cols;
const int bx = blockIdx.x;
const int tx = threadIdx.x;
for (Index i = bx * BLOCK_DIM + tx; i < num_input_points;
i += BLOCK_DIM * GRID_DIM) {
const Index input_col = i % cols;
const Index input_row_begin =
((i / cols) % divup(rows, NUM_PER_THREAD)) * NUM_PER_THREAD;
float reduced_val = reducer.bottom_value();
for (int j = 0; j < NUM_PER_THREAD; ++j) {
float val = ((input_col < cols) && (input_row_begin + j < rows))
? input.coeff((input_row_begin + j) * cols + input_col)
: reducer.bottom_value();
reduced_val = reducer(reduced_val, val);
}
#if __CUDA_ARCH__ >= 300
reducer.atomic_reduce(&output.coeffRef(input_col), reduced_val);
#else
assert(false && "This kernel requires sm_30 or higher.");
#endif
}
}
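// Host-side launcher for the column reduction: fills the output with the
// reducer's bottom_value() via InitVector, then launches ColumnReduceKernel
// with a fixed grid/block/per-thread configuration.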
template <typename Input, typename Output, typename Reducer>
void ColumnReduceCuda(Reducer reducer, const GpuDevice& device,
const Input input, int rows, int cols, Output output) {
const int block_size = 256;
const int grid_size = 128;
const int num_per_thread = 16;
LAUNCH_CUDA_KERNEL(InitVector<Output>, 32, 1024, 0, device, reducer.bottom_value(),
cols, output);
LAUNCH_CUDA_KERNEL(
(ColumnReduceKernel<grid_size, block_size, num_per_thread, Input, Output, Reducer>), grid_size,
block_size, 0, device, reducer, input, rows, cols, output);
}
// -----------------------------------------------------------------------------
// Row Reduction kernel
// -----------------------------------------------------------------------------
// Template parameter to RowReduceKernel derived from the kernel's block and
// grid dimensions.
enum RowReduceDims {
ONE_WARP_PER_ROW, // gridDim.x == 1 && blockDim.x == warpSize
ONE_BLOCK_PER_ROW, // gridDim.x == 1 && blockDim.x > warpSize
MANY_BLOCKS_PER_ROW // gridDim.x > 1
};
// Reduces along a matrix's rows to produce a column vector.
template <RowReduceDims Dims, bool MultipleRowsPerThread, typename Input,
typename Output, typename Reducer>
__global__ void RowReduceKernel(Reducer reducer, const Input input, int rows,
int cols, Output output) {
typedef typename Input::Index Index;
// Keep this in sync with the constant in RowReduceCuda.
constexpr int ElemsPerRow = 8;
// All threads in a warp must have the same threadIdx.y so they operate on the
// same row.
assert(blockDim.x % warpSize == 0);
// This is a 2D kernel.
assert(blockDim.z == 1);
assert(gridDim.z == 1);
// Dims should be entirely determined by our block and grid dimensions.
if (blockDim.x == warpSize) {
assert(Dims == ONE_WARP_PER_ROW);
assert(gridDim.x == 1);
} else if (gridDim.x == 1) {
assert(Dims == ONE_BLOCK_PER_ROW);
} else {
assert(Dims == MANY_BLOCKS_PER_ROW);
}
// The MultipleRowsPerThread template parameter should be entirely determined
// by our block/grid dims and the number of rows.
assert(MultipleRowsPerThread == (gridDim.y * blockDim.y < rows));
const Index first_row = blockIdx.y * blockDim.y + threadIdx.y;
const Index row_step = blockDim.y * gridDim.y;
#pragma nounroll
for (Index row = first_row; row < rows; row += row_step) {
// We have to be careful how we tell the compiler that we only make one trip
// through the loop if MultipleRowsPerThread is false: if Index is unsigned,
// the compiler won't assume that e.g. first_row + 1 does not overflow, but
// clang is smart enough to understand the explicit break below.
if (!MultipleRowsPerThread && row != first_row)
break;
float val = reducer.bottom_value();
Index col_start = ElemsPerRow * (blockIdx.x * blockDim.x + threadIdx.x);
if (col_start + ElemsPerRow - 1 < cols) {
// Manually unrolling this loop and doing a tree reduction instead of a
// straight-line reduction doesn't have a noticeable performance impact on
// Kepler.
#pragma unroll 8
for (Index col = col_start; col < col_start + ElemsPerRow; ++col) {
val = reducer(input.coeff(row * cols + col), val);
}
}
else {
#pragma nounroll
for (Index col = col_start; col < cols; ++col) {
val = reducer(input.coeff(row * cols + col), val);
}
}
val = reducer(__shfl_down(val, 16), val);
val = reducer(__shfl_down(val, 8), val);
val = reducer(__shfl_down(val, 4), val);
val = reducer(__shfl_down(val, 2), val);
val = reducer(__shfl_down(val, 1), val);
const int lane_id = threadIdx.x & (warpSize - 1);
// At this point, lane 0 of each warp holds the value reduced across all
// threads in the warp. Time for us to write that value into output.
//
// If ONE_WARP_PER_ROW, only one warp wants to write to output[row], so we
// just write directly into output. If ONE_BLOCK_PER_ROW, we initialize
// output and then atomically reduce into it. And if MANY_BLOCKS_PER_ROW, we
// just do the atomic reduction -- our output was already initialized before
// this kernel started.
//
// (We could use shared memory to do the ONE_BLOCK_PER_ROW reduction, but the
// proper atomic reduction seems to be faster, at least on Kepler.)
if (Dims == ONE_WARP_PER_ROW) {
if (lane_id == 0) output.coeffRef(row) = val;
} else {
if (Dims == ONE_BLOCK_PER_ROW) {
output.coeffRef(row) = reducer.bottom_value();
__syncthreads();
}
if (lane_id == 0) {
#if __CUDA_ARCH__ >= 300
reducer.atomic_reduce(&output.coeffRef(row), val);
#else
assert(false && "This kernel requires sm_30 or higher.");
#endif
}
}
}
}
template <RowReduceDims Dims, typename Input, typename Output, typename Reducer>
void RowReduceCudaLaunchHelper(dim3 grid_size, dim3 block_size, Reducer reducer,
const GpuDevice& device, const Input input,
int rows, int cols, Output output) {
if (rows <= grid_size.y * block_size.y) {
LAUNCH_CUDA_KERNEL(
(RowReduceKernel<Dims, /* MultipleRowsPerThread = */ false, Input,
Output, Reducer>),
grid_size, block_size, 0, device, reducer, input, rows, cols, output);
} else {
LAUNCH_CUDA_KERNEL(
(RowReduceKernel<Dims, /* MultipleRowsPerThread = */ true, Input,
Output, Reducer>),
grid_size, block_size, 0, device, reducer, input, rows, cols, output);
}
}
template <typename Input, typename Output, typename Reducer>
void RowReduceCuda(Reducer reducer, const GpuDevice& device, const Input input,
int rows, int cols, Output output) {
// Each thread in the kernel processes ElemsPerRow elements per row. This
// value must be kept in sync with the constant inside RowReduceKernel.
constexpr int ElemsPerRow = 8;
// Maximum width and height of our grid of thread blocks. (Devices before
// sm_30 have a smaller limit on the x dimension, but this kernel requires
// sm_30+.)
constexpr uint32 MaxGridXDim = (static_cast<uint32>(1) << 31) - 1;
constexpr int32 MaxGridYDim = (static_cast<uint32>(1) << 16) - 1;
// The maximum number of warps we'll put in a thread block.
constexpr int MaxWarpsPerBlock = 4;
constexpr int warp_size = 32; // gcudacc doesn't define warpSize on the host.
// We choose block_size.x so one block contains a whole row, if possible. If
// each row is processed by a single block, then we can avoid launching an
// InitVector kernel and instead initialize our output within the reduction
// kernel. If each row is processed by a single *warp*, we can further avoid
// doing an inter-warp reduction.
const int block_x_warps =
numext::mini(MaxWarpsPerBlock, divup(cols, ElemsPerRow * warp_size));
const int block_y = MaxWarpsPerBlock / block_x_warps;
assert(block_y > 0);
const dim3 block_size = dim3(block_x_warps * warp_size, block_y, 1);
// TODO(jlebar): Consider swapping the meaning of our block indices, such that
// the block at (x, y) actually processes the data at (y, x).
//
// Right now very large inputs cause us to process multiple rows per thread,
// but one thread never processes more than ElemsPerRow elements in a
// particular row. Given the option, we would rather reverse this and let a
// thread touch more elements in just one row, because we have to do an
// intra- and possibly inter-warp reduction for each row a thread touches.
//
// Since the grid's max x dimension is much larger than its max y dimension,
// letting the x dimension correspond to our row index would let us size our
// grid so that one thread never has to process more than one row. But at the
// moment this doesn't seem to matter because nobody seems to be reducing huge
// inputs.
//
// Note that if we do this, we could only switch the meaning of our *block*
// indices. The thread indices would have to remain as-is, in order to ensure
// that all threads in a warp touch the same row.
const int grid_x = divup<int>(cols, ElemsPerRow * block_size.x);
// grid_x > MaxGridXDim implies that each row contains at least
// warp_size * MaxWarpsPerBlock * ElemsPerRow * MaxGridXDim elements, which is
// roughly 2^41. Until we have GPUs with 2TB of memory, we don't need to worry
// about this.
assert(grid_x <= MaxGridXDim && "Unsupported giant input.");
const int orig_grid_y = divup<int>(rows, block_size.y);
int grid_y = numext::mini(orig_grid_y, MaxGridYDim);
const dim3 grid_size(grid_x, grid_y, 1);
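// Illustration of the dispatch below (warp_size = 32, ElemsPerRow = 8,
// MaxWarpsPerBlock = 4): cols = 100 gives block_size.x = 32, so each row is
// handled by a single warp; cols = 1000 gives block_size.x = 128 and
// grid_x = 1, so one block per row; cols = 5000 gives grid_x = 5, so several
// blocks cooperate on each row and the output must be pre-initialized.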
if (block_size.x == warp_size) {
RowReduceCudaLaunchHelper<ONE_WARP_PER_ROW>(
grid_size, block_size, reducer, device, input, rows, cols, output);
} else if (grid_size.x == 1) {
RowReduceCudaLaunchHelper<ONE_BLOCK_PER_ROW>(
grid_size, block_size, reducer, device, input, rows, cols, output);
} else {
// We only need to initialize our output if multiple blocks operate on the
// same row. Otherwise, the reduction kernel will handle the initialization
// itself.
LAUNCH_CUDA_KERNEL(InitVector<Output>, 32, 1024, 0, device,
reducer.bottom_value(), rows, output);
RowReduceCudaLaunchHelper<MANY_BLOCKS_PER_ROW>(
grid_size, block_size, reducer, device, input, rows, cols, output);
}
}
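// The reducers below implement the interface expected by the kernels above:
// bottom_value() returns the reduction's identity element, operator() folds a
// value into an accumulator, and atomic_reduce() folds a value into a memory
// location that may be shared between threads.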
// Sum reducer that applies a function f to each value being reduced (the
// right-hand argument of operator()) before adding it to the accumulator.
template <typename F>
struct FnSumReducer {
__host__ __device__ FnSumReducer(F f) : f_(f) {}
__host__ __device__ float bottom_value() { return 0.0f; }
__device__ float operator()(float x, float y) const { return x + f_(y); }
__device__ void atomic_reduce(float* x, float y) const { atomicAdd(x, y); }
F f_;
};
// Identity functor; plugging it into FnSumReducer gives the basic sum reduction.
struct Identity {
__device__ float operator()(float x) const { return x; }
};
struct CudaSumReducer : FnSumReducer<Identity> {
__host__ __device__ CudaSumReducer() : FnSumReducer(Identity()) {}
};
struct CudaMaxReducer {
// nvcc doesn't recognize numeric_limits<float>::lowest for some reason.
CudaMaxReducer() {
bottom_value_ = -3.40282347E+38F; // std::numeric_limits<float>::lowest();
}
__host__ __device__ float bottom_value() { return bottom_value_; }
__device__ float operator()(float x, float y) const { return fmax(x, y); }
// This is equivalent to atomicMax(x, y), but CUDA does not provide atomicMax
// for the float data type. Instead, this atomically compares-and-swaps y into
// *x: if the CAS succeeds, or if the value observed at *x is already at least
// y, we are done; otherwise we retry with the newly observed value.
__device__ void atomic_reduce(float* x, float y) {
unsigned int old_val = *reinterpret_cast<unsigned int*>(x);
while (*reinterpret_cast<float*>(&old_val) < y) {
unsigned int current_val =
atomicCAS(reinterpret_cast<unsigned int*>(x), old_val,
*reinterpret_cast<unsigned int*>(&y));
if (old_val == current_val) {
break;
}
old_val = current_val;
}
}
float bottom_value_;
};
} // end namespace
template <typename Op>
struct IsFloatSumReduction {
static const bool value = false;
};
template <>
struct IsFloatSumReduction<SumReducer<float> > {
static const bool value = true;
};
template <typename Op>
struct IsFloatMaxReduction {
static const bool value = false;
};
template <>
struct IsFloatMaxReduction<MaxReducer<float> > {
static const bool value = true;
};
template <typename Op>
struct SumOrMaxOfFloat {
static const bool value =
IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
};
enum ReductionType { ROW_REDUCE, COL_REDUCE, UNOPTIMIZED };
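// Decides whether a reduction expression can use the specialized row/column
// kernels above. Only float sum/max reductions over a single dimension
// qualify, the reduced dimension must be the first or last one, the device
// must be sm_30 or newer, and the extent being reduced must be at least 32;
// everything else falls back to the generic (UNOPTIMIZED) path. On success,
// *rows and *cols receive the shape of the implied 2D reduction.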
template <typename Op, typename Expr, typename ReductionExpr>
ReductionType GetReductionType(const Expr& expr,
const ReductionExpr& reduction_expr,
const GpuDevice& device, std::size_t* rows,
std::size_t* cols) {
typedef TensorEvaluator<const Expr, GpuDevice> EvalExpr;
typedef TensorEvaluator<const ReductionExpr, GpuDevice> ReductionEvalExpr;
if (device.majorDeviceVersion() < 3) {
return UNOPTIMIZED;
}
const EvalExpr eval_expr(expr, device);
// We only have fast reductions for sum/max of float.
if (!SumOrMaxOfFloat<Op>::value) {
return UNOPTIMIZED;
}
if (ReductionEvalExpr::NumReducedDims == ReductionEvalExpr::NumInputDims) {
return UNOPTIMIZED;
}
if (ReductionEvalExpr::NumReducedDims > 1) {
return UNOPTIMIZED;
}
const std::size_t total_size = array_prod(eval_expr.dimensions());
if (total_size == 0) {
return UNOPTIMIZED;
}
const int dim = reduction_expr.dims()[0];
if (static_cast<int>(ReductionEvalExpr::Layout) ==
static_cast<int>(RowMajor)) {
if (dim == ReductionEvalExpr::NumInputDims - 1) {
*rows = total_size /
eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
*cols = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
if (*cols < 32) return UNOPTIMIZED;
return ROW_REDUCE;
} else if (dim == 0) {
*rows = eval_expr.dimensions()[0];
*cols = total_size / eval_expr.dimensions()[0];
if (*rows < 32) return UNOPTIMIZED;
return COL_REDUCE;
}
} else if (static_cast<int>(ReductionEvalExpr::Layout) ==
static_cast<int>(ColMajor)) {
if (dim == ReductionEvalExpr::NumInputDims - 1) {
*rows = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
*cols = total_size /
eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
if (*rows < 32) return UNOPTIMIZED;
return COL_REDUCE;
} else if (dim == 0) {
*rows = total_size / eval_expr.dimensions()[0];
*cols = eval_expr.dimensions()[0];
if (*cols < 32) return UNOPTIMIZED;
return ROW_REDUCE;
}
}
return UNOPTIMIZED;
}
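// Generic fallback: evaluate the whole expression with EigenMetaKernel. Both
// the vectorizable and non-vectorizable specializations currently launch the
// same kernel.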
template <typename Expression, typename Index, bool Vectorizable>
struct LaunchKernel;
template <typename Expression, typename Index>
struct LaunchKernel<Expression, Index, true> {
static void launch(int num_blocks, int block_size, const GpuDevice& device,
const TensorEvaluator<Expression, GpuDevice>& evaluator,
Index size) {
LAUNCH_CUDA_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
num_blocks, block_size, 0, device, evaluator, size);
}
};
template <typename Expression, typename Index>
struct LaunchKernel<Expression, Index, false> {
static void launch(int num_blocks, int block_size, const GpuDevice& device,
const TensorEvaluator<Expression, GpuDevice>& evaluator,
Index size) {
LAUNCH_CUDA_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
num_blocks, block_size, 0, device, evaluator, size);
}
};
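// The Compatible flag is true only for float sum/max reductions. The 'false'
// specializations are no-ops, so the branches that are never taken at runtime
// compile without instantiating RowReduceCuda/ColumnReduceCuda for unsupported
// reducer/output combinations.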
template <typename F, typename LHS, typename RHS, bool Compatible>
struct LaunchRowReduce;
template <typename F, typename LHS, typename RHS>
struct LaunchRowReduce<F, LHS, RHS, true> {
static void launch(const GpuDevice& device, RHS input, std::size_t rows,
std::size_t cols, LHS output) {
RowReduceCuda(F(), device, input, rows, cols, output);
}
};
template <typename F, typename LHS, typename RHS>
struct LaunchRowReduce<F, LHS, RHS, false> {
static void launch(const GpuDevice& device, RHS input, std::size_t rows,
std::size_t cols, LHS output) {}
};
template <typename F, typename LHS, typename RHS, bool Compatible>
struct LaunchColReduce;
template <typename F, typename LHS, typename RHS>
struct LaunchColReduce<F, LHS, RHS, true> {
static void launch(const GpuDevice& device, RHS input, std::size_t rows,
std::size_t cols, LHS output) {
ColumnReduceCuda(F(), device, input, rows, cols, output);
}
};
template <typename F, typename LHS, typename RHS>
struct LaunchColReduce<F, LHS, RHS, false> {
static void launch(const GpuDevice& device, RHS input, std::size_t rows,
std::size_t cols, LHS output) {}
};
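// Helper for assignment expressions: uses GetReductionType to pick between the
// generic EigenMetaKernel path and the specialized row/column reduction
// kernels.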
template <typename Expression, typename Device, bool Vectorizable>
class TensorAssignExecutorHelper;
template <typename OutExpr, typename InExpr, typename Op, typename Indices,
bool Vectorizable>
class TensorAssignExecutorHelper<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, Vectorizable> {
public:
typedef const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>
Expression;
typedef typename Expression::Index Index;
typedef TensorEvaluator<OutExpr, GpuDevice> LHSEval;
typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval;
static inline void run(const Expression& expr, const GpuDevice& device) {
std::size_t rows, cols;
const ReductionType reduction_type =
GetReductionType<Op>(expr.rhsExpression().expression(),
expr.rhsExpression(), device, &rows, &cols);
if (reduction_type == UNOPTIMIZED) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
const int num_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() /
device.maxCudaThreadsPerBlock();
const int block_size = device.maxCudaThreadsPerBlock();
const Index size = array_prod(evaluator.dimensions());
LaunchKernel<Expression, Index, Vectorizable>::launch(
num_blocks, block_size, device, evaluator, size);
}
evaluator.cleanup();
} else {
LHSEval output(expr.lhsExpression(), device);
RHSEval input(expr.rhsExpression().expression(), device);
bool lhs_needs_assign = output.evalSubExprsIfNeeded(NULL);
bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL);
if (lhs_needs_assign && rhs_needs_assign) {
const bool Compatible =
IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
if (reduction_type == ROW_REDUCE) {
if (IsFloatSumReduction<Op>::value) {
LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
} else if (IsFloatMaxReduction<Op>::value) {
LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
} else {
// Unsupported reduction type
assert(false && "Unsupported reduction function for ROW_REDUCE");
}
} else {
if (IsFloatSumReduction<Op>::value) {
LaunchColReduce<CudaSumReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
} else if (IsFloatMaxReduction<Op>::value) {
LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
} else {
// Unsupported reduction type
assert(false && "Unsupported reduction function for COL_REDUCE");
}
}
}
input.cleanup();
output.cleanup();
}
}
};
template <typename OutExpr, typename InExpr, typename Op, typename Indices,
bool Tileable>
inline void TensorExecutor<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, false, Tileable>::run(const Expression& expr,
const GpuDevice& device) {
TensorAssignExecutorHelper<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, false>::run(expr, device);
}
template <typename OutExpr, typename InExpr, typename Op, typename Indices,
bool Tileable>
inline void TensorExecutor<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, true, Tileable>::run(const Expression& expr,
const GpuDevice& device) {
TensorAssignExecutorHelper<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, true>::run(expr, device);
}
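// Minimal output adaptor exposing coeffRef() over a raw pointer, so the
// reduction kernels can write directly into the buffer of a TensorEvalToOp.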
template <typename T, typename Index>
struct PtrWrapper {
EIGEN_DEVICE_FUNC PtrWrapper(T* ptr) : m_ptr(ptr) {}
EIGEN_DEVICE_FUNC T& coeffRef(Index i) { return *(m_ptr + i); }
T* m_ptr;
};
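// Same dispatch as TensorAssignExecutorHelper, but for TensorEvalToOp: the
// result is written directly into the expression's buffer through PtrWrapper.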
template <typename Expression, typename Device, bool Vectorizable>
class TensorEvalToExecutorHelper;
template <typename InExpr, typename Op, typename Indices, bool Vectorizable>
class TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp<
Op, const Indices, const InExpr> >,
GpuDevice, Vectorizable> {
public:
typedef const TensorEvalToOp<const TensorReductionOp<
Op, const Indices, const InExpr> > Expression;
typedef typename Expression::Index Index;
typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval;
static inline void run(const Expression& expr, const GpuDevice& device) {
std::size_t rows, cols;
const ReductionType reduction_type =
GetReductionType<Op>(expr.expression().expression(), expr.expression(),
device, &rows, &cols);
if (reduction_type == UNOPTIMIZED) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
const int num_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() /
device.maxCudaThreadsPerBlock();
const int block_size = device.maxCudaThreadsPerBlock();
const Index size = array_prod(evaluator.dimensions());
LaunchKernel<Expression, Index, Vectorizable>::launch(
num_blocks, block_size, device, evaluator, size);
}
evaluator.cleanup();
} else {
typedef typename internal::remove_const<typename Expression::Scalar>::type Scalar;
typedef PtrWrapper<Scalar, Index> LHSEval;
LHSEval output(expr.buffer());
RHSEval input(expr.expression().expression(), device);
bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL);
if (rhs_needs_assign) {
const bool Compatible =
IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
if (reduction_type == ROW_REDUCE) {
if (IsFloatSumReduction<Op>::value) {
LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
} else if (IsFloatMaxReduction<Op>::value) {
LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
}
} else {
if (IsFloatSumReduction<Op>::value) {
LaunchColReduce<CudaSumReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
} else if (IsFloatMaxReduction<Op>::value) {
LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
}
}
}
input.cleanup();
}
}
};
template <typename InExpr, typename Op, typename Indices, bool Tileable>
inline void
TensorExecutor<const TensorEvalToOp<
const TensorReductionOp<Op, const Indices, const InExpr> >,
GpuDevice, false, Tileable>::run(const Expression& expr,
const GpuDevice& device) {
TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp<
Op, const Indices, const InExpr> >,
GpuDevice, false>::run(expr, device);
}
template <typename InExpr, typename Op, typename Indices, bool Tileable>
inline void
TensorExecutor<const TensorEvalToOp<
const TensorReductionOp<Op, const Indices, const InExpr> >,
GpuDevice, true, Tileable>::run(const Expression& expr,
const GpuDevice& device) {
TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp<
Op, const Indices, const InExpr> >,
GpuDevice, true>::run(expr, device);
}
} // end namespace internal
} // end namespace Eigen
#endif // __CUDACC__
#endif // EIGEN_USE_GPU
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H