// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Manjunath Kudlur <keveman@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#if defined(EIGEN_USE_GPU)
namespace Eigen {
namespace internal {
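// Partial specializations of TensorExecutor for expressions that assign or
// evaluate a tensor reduction on the GPU. Only run() is declared here; the
// definitions follow below, guarded by __CUDACC__.
//
// Sketch of the kind of user-level expression that is dispatched here
// (d_in and d_out are hypothetical device pointers, gpu_device a GpuDevice):
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, rows, cols);
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, cols);
//   Eigen::array<int, 1> reduce_dims{{0}};
//   out.device(gpu_device) = in.sum(reduce_dims);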
template <typename OutExpr, typename InExpr, typename Op, typename Indices,
bool Tileable>
class TensorExecutor<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, false, Tileable> {
public:
typedef const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>
Expression;
static void run(const Expression& expr, const GpuDevice& device);
};
template <typename OutExpr, typename InExpr, typename Op, typename Indices,
bool Tileable>
class TensorExecutor<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, true, Tileable> {
public:
typedef const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>
Expression;
static void run(const Expression& expr, const GpuDevice& device);
};
template <typename InExpr, typename Op, typename Indices, bool Tileable>
class TensorExecutor<const TensorEvalToOp<const TensorReductionOp<
Op, const Indices, const InExpr> >,
GpuDevice, false, Tileable> {
public:
typedef const TensorEvalToOp<
const TensorReductionOp<Op, const Indices, const InExpr> > Expression;
static void run(const Expression& expr, const GpuDevice& device);
};
template <typename InExpr, typename Op, typename Indices, bool Tileable>
class TensorExecutor<const TensorEvalToOp<const TensorReductionOp<
Op, const Indices, const InExpr> >,
GpuDevice, true, Tileable> {
public:
typedef const TensorEvalToOp<
const TensorReductionOp<Op, const Indices, const InExpr> > Expression;
static void run(const Expression& expr, const GpuDevice& device);
};
} // end namespace internal
} // end namespace Eigen
#if defined(__CUDACC__)
namespace Eigen {
namespace internal {
namespace {
// Initialize output[0..size-1] with val
template <typename Output>
__global__ void InitVector(const float val, int size, Output output) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = idx; i < size; i += gridDim.x * blockDim.x) {
output.coeffRef(i) = val;
}
}
// -----------------------------------------------------------------------------
// Column Reduction kernels
// -----------------------------------------------------------------------------
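// Reduces along a matrix's columns to produce a row vector. Each thread
// accumulates NUM_PER_THREAD consecutive row entries of one column and then
// atomically folds its partial result into output[column]. The output must
// already be initialized with the reducer's bottom_value().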
template <int GRID_DIM, int BLOCK_DIM, int NUM_PER_THREAD, typename Input,
typename Output, typename Reducer>
__global__ void ColumnReduceKernel(Reducer reducer, const Input input, int rows,
int cols, Output output) {
assert(blockDim.x == BLOCK_DIM);
assert(blockDim.y == 1);
assert(blockDim.z == 1);
assert(gridDim.x == GRID_DIM);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
typedef typename Input::Index Index;
const Index num_input_points = divup(rows, NUM_PER_THREAD) * cols;
const int bx = blockIdx.x;
const int tx = threadIdx.x;
for (Index i = bx * BLOCK_DIM + tx; i < num_input_points;
i += BLOCK_DIM * GRID_DIM) {
const Index input_col = i % cols;
const Index input_row_begin =
((i / cols) % divup(rows, NUM_PER_THREAD)) * NUM_PER_THREAD;
float reduced_val = reducer.bottom_value();
for (int j = 0; j < NUM_PER_THREAD; ++j) {
float val = ((input_col < cols) && (input_row_begin + j < rows))
? input.coeff((input_row_begin + j) * cols + input_col)
: reducer.bottom_value();
reduced_val = reducer(reduced_val, val);
}
#if __CUDA_ARCH__ >= 300
reducer.atomic_reduce(&output.coeffRef(input_col), reduced_val);
#else
assert(false && "This kernel requires sm_30 or higher.");
#endif
}
}
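// Host-side launcher for the column reduction: fills the output with the
// reducer's bottom_value() via InitVector, then launches ColumnReduceKernel
// with a fixed grid/block/per-thread configuration.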
template <typename Input, typename Output, typename Reducer>
void ColumnReduceCuda(Reducer reducer, const GpuDevice& device,
const Input input, int rows, int cols, Output output) {
const int block_size = 256;
const int grid_size = 128;
const int num_per_thread = 16;
LAUNCH_CUDA_KERNEL(InitVector<Output>, 32, 1024, 0, device, reducer.bottom_value(),
cols, output);
LAUNCH_CUDA_KERNEL(
(ColumnReduceKernel<grid_size, block_size, num_per_thread, Input, Output, Reducer>), grid_size,
block_size, 0, device, reducer, input, rows, cols, output);
}
// -----------------------------------------------------------------------------
// Row Reduction kernel
// -----------------------------------------------------------------------------
// Template parameter to RowReduceKernel derived from the kernel's block and
// grid dimensions.
enum RowReduceDims {
ONE_WARP_PER_ROW, // gridDim.x == 1 && blockDim.x == warpSize
ONE_BLOCK_PER_ROW, // gridDim.x == 1 && blockDim.x > warpSize
MANY_BLOCKS_PER_ROW // gridDim.x > 1
};
// Reduces along a matrix's rows to produce a column vector.
template <RowReduceDims Dims, bool MultipleRowsPerThread, typename Input,
typename Output, typename Reducer>
__global__ void RowReduceKernel(Reducer reducer, const Input input, int rows,
int cols, Output output) {
typedef typename Input::Index Index;
// Keep this in sync with the constant in RowReduceCuda.
constexpr int ElemsPerRow = 8;
// All threads in a warp must have the same threadIdx.y so they operate on the
// same row.
assert(blockDim.x % warpSize == 0);
// This is a 2D kernel.
assert(blockDim.z == 1);
assert(gridDim.z == 1);
// Dims should be entirely determined by our block and grid dimensions.
if (blockDim.x == warpSize) {
assert(Dims == ONE_WARP_PER_ROW);
assert(gridDim.x == 1);
} else if (gridDim.x == 1) {
assert(Dims == ONE_BLOCK_PER_ROW);
} else {
assert(Dims == MANY_BLOCKS_PER_ROW);
}
// The MultipleRowsPerThread template parameter should be entirely determined
// by our block/grid dims and the number of rows.
assert(MultipleRowsPerThread == (gridDim.y * blockDim.y < rows));
const Index first_row = blockIdx.y * blockDim.y + threadIdx.y;
const Index row_step = blockDim.y * gridDim.y;
#pragma nounroll
for (Index row = first_row; row < rows; row += row_step) {
// We have to be careful how we tell the compiler that we only make one trip
// through the loop if MultipleRowsPerThread is false: if Index is unsigned,
// the compiler won't assume that e.g. first_row + 1 does not overflow, but
// clang is smart enough to understand the explicit break below.
if (!MultipleRowsPerThread && row != first_row)
break;
float val = reducer.bottom_value();
Index col_start = ElemsPerRow * (blockIdx.x * blockDim.x + threadIdx.x);
if (col_start + ElemsPerRow - 1 < cols) {
// Manually unrolling this loop and doing a tree reduction instead of a
// straight-line reduction doesn't have a noticeable performance impact on
// Kepler.
#pragma unroll 8
for (Index col = col_start; col < col_start + ElemsPerRow; ++col) {
val = reducer(input.coeff(row * cols + col), val);
}
}
else {
#pragma nounroll
for (Index col = col_start; col < cols; ++col) {
val = reducer(input.coeff(row * cols + col), val);
}
}
val = reducer(__shfl_down(val, 16), val);
val = reducer(__shfl_down(val, 8), val);
val = reducer(__shfl_down(val, 4), val);
val = reducer(__shfl_down(val, 2), val);
val = reducer(__shfl_down(val, 1), val);
const int lane_id = threadIdx.x & (warpSize - 1);
// At this point, lane 0 of each warp holds the value reduced across all
// threads in the warp. Time for us to write that value into output.
//
// If ONE_WARP_PER_ROW, only one warp wants to write to output[row], so we
// just write directly into output. If ONE_BLOCK_PER_ROW, we initialize
// output and then atomically reduce into it. And if MANY_BLOCKS_PER_ROW, we
// just do the atomic reduction -- our output was already initialized before
// this kernel started.
//
// (We could use shared memory to do the ONE_BLOCK_PER_ROW reduction, but the
// proper atomic reduction seems to be faster, at least on Kepler.)
if (Dims == ONE_WARP_PER_ROW) {
if (lane_id == 0) output.coeffRef(row) = val;
} else {
if (Dims == ONE_BLOCK_PER_ROW) {
output.coeffRef(row) = reducer.bottom_value();
__syncthreads();
}
if (lane_id == 0) {
#if __CUDA_ARCH__ >= 300
reducer.atomic_reduce(&output.coeffRef(row), val);
#else
assert(false && "This kernel requires sm_30 or higher.");
#endif
}
}
}
}
template <RowReduceDims Dims, typename Input, typename Output, typename Reducer>
void RowReduceCudaLaunchHelper(dim3 grid_size, dim3 block_size, Reducer reducer,
const GpuDevice& device, const Input input,
int rows, int cols, Output output) {
if (rows <= grid_size.y * block_size.y) {
LAUNCH_CUDA_KERNEL(
(RowReduceKernel<Dims, /* MultipleRowsPerThread = */ false, Input,
Output, Reducer>),
grid_size, block_size, 0, device, reducer, input, rows, cols, output);
} else {
LAUNCH_CUDA_KERNEL(
(RowReduceKernel<Dims, /* MultipleRowsPerThread = */ true, Input,
Output, Reducer>),
grid_size, block_size, 0, device, reducer, input, rows, cols, output);
}
}
template <typename Input, typename Output, typename Reducer>
void RowReduceCuda(Reducer reducer, const GpuDevice& device, const Input input,
int rows, int cols, Output output) {
// Each thread in the kernel processes ElemsPerRow elements per row. This
// value must be kept in sync with the constant inside RowReduceKernel.
constexpr int ElemsPerRow = 8;
// Maximum width and height of our grid of thread blocks. (Devices before
// sm_30 have a smaller limit on the x dimension, but this kernel requires
// sm_30+.)
constexpr uint32 MaxGridXDim = (static_cast<uint32>(1) << 31) - 1;
constexpr int32 MaxGridYDim = (static_cast<uint32>(1) << 16) - 1;
// The maximum number of warps we'll put in a thread block.
constexpr int MaxWarpsPerBlock = 4;
constexpr int warp_size = 32; // gcudacc doesn't define warpSize on the host.
// We choose block_size.x so one block contains a whole row, if possible. If
// each row is processed by a single block, then we can avoid launching an
// InitVector kernel and instead initialize our output within the reduction
// kernel. If each row is processed by a single *warp*, we can further avoid
// doing an inter-warp reduction.
const int block_x_warps =
numext::mini(MaxWarpsPerBlock, divup(cols, ElemsPerRow * warp_size));
const int block_y = MaxWarpsPerBlock / block_x_warps;
assert(block_y > 0);
const dim3 block_size = dim3(block_x_warps * warp_size, block_y, 1);
// TODO(jlebar): Consider swapping the meaning of our block indices, such that
// the block at (x, y) actually processes the data at (y, x).
//
// Right now very large inputs cause us to process multiple rows per thread,
// but one thread never processes more than ElemsPerRow elements in a
// particular row. Given the option, we would rather reverse this and let a
// thread touch more elements in just one row, because we have to do an
// intra- and possibly inter-warp reduction for each row a thread touches.
//
// Since the grid's max x dimension is much larger than its max y dimension,
// letting the x dimension correspond to our row index would let us size our
// grid so that one thread never has to process more than one row. But at the
// moment this doesn't seem to matter because nobody seems to be reducing huge
// inputs.
//
// Note that if we do this, we could only switch the meaning of our *block*
// indices. The thread indices would have to remain as-is, in order to ensure
// that all threads in a warp touch the same row.
const int grid_x = divup<int>(cols, ElemsPerRow * block_size.x);
// grid_x > MaxGridXDim implies that each row contains at least
// warp_size * MaxWarpsPerBlock * ElemsPerRow * MaxGridXDim elements, which is
// roughly 2^41. Until we have GPUs with 2TB of memory, we don't need to worry
// about this.
assert(grid_x <= MaxGridXDim && "Unsupported giant input.");
const int orig_grid_y = divup<int>(rows, block_size.y);
int grid_y = numext::mini(orig_grid_y, MaxGridYDim);
const dim3 grid_size(grid_x, grid_y, 1);
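// Illustration of the dispatch below (warp_size = 32, ElemsPerRow = 8,
// MaxWarpsPerBlock = 4): cols = 100 gives block_size.x = 32, so each row is
// handled by a single warp; cols = 1000 gives block_size.x = 128 and
// grid_x = 1, so one block per row; cols = 5000 gives grid_x = 5, so several
// blocks cooperate on each row and the output must be pre-initialized.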
if (block_size.x == warp_size) {
RowReduceCudaLaunchHelper<ONE_WARP_PER_ROW>(
grid_size, block_size, reducer, device, input, rows, cols, output);
} else if (grid_size.x == 1) {
RowReduceCudaLaunchHelper<ONE_BLOCK_PER_ROW>(
grid_size, block_size, reducer, device, input, rows, cols, output);
} else {
// We only need to initialize our output if multiple blocks operate on the
// same row. Otherwise, the reduction kernel will handle the initialization
// itself.
LAUNCH_CUDA_KERNEL(InitVector<Output>, 32, 1024, 0, device,
reducer.bottom_value(), rows, output);
RowReduceCudaLaunchHelper<MANY_BLOCKS_PER_ROW>(
grid_size, block_size, reducer, device, input, rows, cols, output);
}
}
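// The reducers below implement the interface expected by the kernels above:
// bottom_value() returns the reduction's identity element, operator() folds a
// value into an accumulator, and atomic_reduce() folds a value into a memory
// location that may be shared between threads.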
// Sum reducer that applies a function f to each value being reduced (the
// right-hand argument of operator()) before adding it to the accumulator.
template <typename F>
struct FnSumReducer {
__host__ __device__ FnSumReducer(F f) : f_(f) {}
__host__ __device__ float bottom_value() { return 0.0f; }
__device__ float operator()(float x, float y) const { return x + f_(y); }
__device__ void atomic_reduce(float* x, float y) const { atomicAdd(x, y); }
F f_;
};
// Identity functor; plugging it into FnSumReducer gives the basic sum reduction.
struct Identity {
__device__ float operator()(float x) const { return x; }
};
struct CudaSumReducer : FnSumReducer<Identity> {
__host__ __device__ CudaSumReducer() : FnSumReducer(Identity()) {}
};
struct CudaMaxReducer {
// nvcc doesn't recognize numeric_limits<float>::lowest for some reason.
CudaMaxReducer() {
bottom_value_ = -3.40282347E+38F; // std::numeric_limits<float>::lowest();
}
__host__ __device__ float bottom_value() { return bottom_value_; }
__device__ float operator()(float x, float y) const { return fmax(x, y); }
// This is equivalent to atomicMax(x, y), but CUDA does not provide atomicMax
// for the float data type. Instead, this atomically compares-and-swaps y into
// *x: if the CAS succeeds, or if the value observed at *x is already at least
// y, we are done; otherwise we retry with the newly observed value.
__device__ void atomic_reduce(float* x, float y) {
unsigned int old_val = *reinterpret_cast<unsigned int*>(x);
while (*reinterpret_cast<float*>(&old_val) < y) {
unsigned int current_val =
atomicCAS(reinterpret_cast<unsigned int*>(x), old_val,
*reinterpret_cast<unsigned int*>(&y));
if (old_val == current_val) {
break;
}
old_val = current_val;
}
}
float bottom_value_;
};
} // end namespace
template <typename Op>
struct IsFloatSumReduction {
static const bool value = false;
};
template <>
struct IsFloatSumReduction<SumReducer<float> > {
static const bool value = true;
};
template <typename Op>
struct IsFloatMaxReduction {
static const bool value = false;
};
template <>
struct IsFloatMaxReduction<MaxReducer<float> > {
static const bool value = true;
};
template <typename Op>
struct SumOrMaxOfFloat {
static const bool value =
IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
};
enum ReductionType { ROW_REDUCE, COL_REDUCE, UNOPTIMIZED };
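// Decides whether a reduction expression can use the specialized row/column
// kernels above. Only float sum/max reductions over a single dimension
// qualify, the reduced dimension must be the first or last one, the device
// must be sm_30 or newer, and the extent being reduced must be at least 32;
// everything else falls back to the generic (UNOPTIMIZED) path. On success,
// *rows and *cols receive the shape of the implied 2D reduction.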
template <typename Op, typename Expr, typename ReductionExpr>
ReductionType GetReductionType(const Expr& expr,
const ReductionExpr& reduction_expr,
const GpuDevice& device, std::size_t* rows,
std::size_t* cols) {
typedef TensorEvaluator<const Expr, GpuDevice> EvalExpr;
typedef TensorEvaluator<const ReductionExpr, GpuDevice> ReductionEvalExpr;
if (device.majorDeviceVersion() < 3) {
return UNOPTIMIZED;
}
const EvalExpr eval_expr(expr, device);
// We only have fast reductions for sum/max of float.
if (!SumOrMaxOfFloat<Op>::value) {
return UNOPTIMIZED;
}
if (ReductionEvalExpr::NumReducedDims == ReductionEvalExpr::NumInputDims) {
return UNOPTIMIZED;
}
if (ReductionEvalExpr::NumReducedDims > 1) {
return UNOPTIMIZED;
}
const std::size_t total_size = array_prod(eval_expr.dimensions());
if (total_size == 0) {
return UNOPTIMIZED;
}
const int dim = reduction_expr.dims()[0];
if (static_cast<int>(ReductionEvalExpr::Layout) ==
static_cast<int>(RowMajor)) {
if (dim == ReductionEvalExpr::NumInputDims - 1) {
*rows = total_size /
eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
*cols = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
if (*cols < 32) return UNOPTIMIZED;
return ROW_REDUCE;
} else if (dim == 0) {
*rows = eval_expr.dimensions()[0];
*cols = total_size / eval_expr.dimensions()[0];
if (*rows < 32) return UNOPTIMIZED;
return COL_REDUCE;
}
} else if (static_cast<int>(ReductionEvalExpr::Layout) ==
static_cast<int>(ColMajor)) {
if (dim == ReductionEvalExpr::NumInputDims - 1) {
*rows = eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
*cols = total_size /
eval_expr.dimensions()[ReductionEvalExpr::NumInputDims - 1];
if (*rows < 32) return UNOPTIMIZED;
return COL_REDUCE;
} else if (dim == 0) {
*rows = total_size / eval_expr.dimensions()[0];
*cols = eval_expr.dimensions()[0];
if (*cols < 32) return UNOPTIMIZED;
return ROW_REDUCE;
}
}
return UNOPTIMIZED;
}
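// Generic fallback: evaluate the whole expression with EigenMetaKernel. Both
// the vectorizable and non-vectorizable specializations currently launch the
// same kernel.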
template <typename Expression, typename Index, bool Vectorizable>
struct LaunchKernel;
template <typename Expression, typename Index>
struct LaunchKernel<Expression, Index, true> {
static void launch(int num_blocks, int block_size, const GpuDevice& device,
const TensorEvaluator<Expression, GpuDevice>& evaluator,
Index size) {
LAUNCH_CUDA_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
num_blocks, block_size, 0, device, evaluator, size);
}
};
template <typename Expression, typename Index>
struct LaunchKernel<Expression, Index, false> {
static void launch(int num_blocks, int block_size, const GpuDevice& device,
const TensorEvaluator<Expression, GpuDevice>& evaluator,
Index size) {
LAUNCH_CUDA_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
num_blocks, block_size, 0, device, evaluator, size);
}
};
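// The Compatible flag is true only for float sum/max reductions. The 'false'
// specializations are no-ops, so the branches that are never taken at runtime
// compile without instantiating RowReduceCuda/ColumnReduceCuda for unsupported
// reducer/output combinations.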
template <typename F, typename LHS, typename RHS, bool Compatible>
struct LaunchRowReduce;
template <typename F, typename LHS, typename RHS>
struct LaunchRowReduce<F, LHS, RHS, true> {
static void launch(const GpuDevice& device, RHS input, std::size_t rows,
std::size_t cols, LHS output) {
RowReduceCuda(F(), device, input, rows, cols, output);
}
};
template <typename F, typename LHS, typename RHS>
struct LaunchRowReduce<F, LHS, RHS, false> {
static void launch(const GpuDevice& device, RHS input, std::size_t rows,
std::size_t cols, LHS output) {}
};
template <typename F, typename LHS, typename RHS, bool Compatible>
struct LaunchColReduce;
template <typename F, typename LHS, typename RHS>
struct LaunchColReduce<F, LHS, RHS, true> {
static void launch(const GpuDevice& device, RHS input, std::size_t rows,
std::size_t cols, LHS output) {
ColumnReduceCuda(F(), device, input, rows, cols, output);
}
};
template <typename F, typename LHS, typename RHS>
struct LaunchColReduce<F, LHS, RHS, false> {
static void launch(const GpuDevice& device, RHS input, std::size_t rows,
std::size_t cols, LHS output) {}
};
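// Helper for assignment expressions: uses GetReductionType to pick between the
// generic EigenMetaKernel path and the specialized row/column reduction
// kernels.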
template <typename Expression, typename Device, bool Vectorizable>
class TensorAssignExecutorHelper;
template <typename OutExpr, typename InExpr, typename Op, typename Indices,
bool Vectorizable>
class TensorAssignExecutorHelper<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, Vectorizable> {
public:
typedef const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>
Expression;
typedef typename Expression::Index Index;
typedef TensorEvaluator<OutExpr, GpuDevice> LHSEval;
typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval;
static inline void run(const Expression& expr, const GpuDevice& device) {
std::size_t rows, cols;
const ReductionType reduction_type =
GetReductionType<Op>(expr.rhsExpression().expression(),
expr.rhsExpression(), device, &rows, &cols);
if (reduction_type == UNOPTIMIZED) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
const int num_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() /
device.maxCudaThreadsPerBlock();
const int block_size = device.maxCudaThreadsPerBlock();
const Index size = array_prod(evaluator.dimensions());
LaunchKernel<Expression, Index, Vectorizable>::launch(
num_blocks, block_size, device, evaluator, size);
}
evaluator.cleanup();
} else {
LHSEval output(expr.lhsExpression(), device);
RHSEval input(expr.rhsExpression().expression(), device);
bool lhs_needs_assign = output.evalSubExprsIfNeeded(NULL);
bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL);
if (lhs_needs_assign && rhs_needs_assign) {
const bool Compatible =
IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
if (reduction_type == ROW_REDUCE) {
if (IsFloatSumReduction<Op>::value) {
LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
} else if (IsFloatMaxReduction<Op>::value) {
LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
} else {
// Unsupported reduction type
assert(false && "Unsupported reduction function for ROW_REDUCE");
}
} else {
if (IsFloatSumReduction<Op>::value) {
LaunchColReduce<CudaSumReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
} else if (IsFloatMaxReduction<Op>::value) {
LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
} else {
// Unsupported reduction type
assert(false && "Unsupported reduction function for COL_REDUCE");
}
}
}
input.cleanup();
output.cleanup();
}
}
};
template <typename OutExpr, typename InExpr, typename Op, typename Indices,
bool Tileable>
inline void TensorExecutor<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, false, Tileable>::run(const Expression& expr,
const GpuDevice& device) {
TensorAssignExecutorHelper<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, false>::run(expr, device);
}
template <typename OutExpr, typename InExpr, typename Op, typename Indices,
bool Tileable>
inline void TensorExecutor<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, true, Tileable>::run(const Expression& expr,
const GpuDevice& device) {
TensorAssignExecutorHelper<
const TensorAssignOp<
OutExpr, TensorReductionOp<Op, Indices const, InExpr const> const>,
GpuDevice, true>::run(expr, device);
}
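// Minimal output adaptor exposing coeffRef() over a raw pointer, so the
// reduction kernels can write directly into the buffer of a TensorEvalToOp.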
template <typename T, typename Index>
struct PtrWrapper {
EIGEN_DEVICE_FUNC PtrWrapper(T* ptr) : m_ptr(ptr) {}
EIGEN_DEVICE_FUNC T& coeffRef(Index i) { return *(m_ptr + i); }
T* m_ptr;
};
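// Same dispatch as TensorAssignExecutorHelper, but for TensorEvalToOp: the
// result is written directly into the expression's buffer through PtrWrapper.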
template <typename Expression, typename Device, bool Vectorizable>
class TensorEvalToExecutorHelper;
template <typename InExpr, typename Op, typename Indices, bool Vectorizable>
class TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp<
Op, const Indices, const InExpr> >,
GpuDevice, Vectorizable> {
public:
typedef const TensorEvalToOp<const TensorReductionOp<
Op, const Indices, const InExpr> > Expression;
typedef typename Expression::Index Index;
typedef TensorEvaluator<const InExpr, GpuDevice> RHSEval;
static inline void run(const Expression& expr, const GpuDevice& device) {
std::size_t rows, cols;
const ReductionType reduction_type =
GetReductionType<Op>(expr.expression().expression(), expr.expression(),
device, &rows, &cols);
if (reduction_type == UNOPTIMIZED) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
const int num_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() /
device.maxCudaThreadsPerBlock();
const int block_size = device.maxCudaThreadsPerBlock();
const Index size = array_prod(evaluator.dimensions());
LaunchKernel<Expression, Index, Vectorizable>::launch(
num_blocks, block_size, device, evaluator, size);
}
evaluator.cleanup();
} else {
typedef typename internal::remove_const<typename Expression::Scalar>::type Scalar;
typedef PtrWrapper<Scalar, Index> LHSEval;
LHSEval output(expr.buffer());
RHSEval input(expr.expression().expression(), device);
bool rhs_needs_assign = input.evalSubExprsIfNeeded(NULL);
if (rhs_needs_assign) {
const bool Compatible =
IsFloatSumReduction<Op>::value || IsFloatMaxReduction<Op>::value;
if (reduction_type == ROW_REDUCE) {
if (IsFloatSumReduction<Op>::value) {
LaunchRowReduce<CudaSumReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
} else if (IsFloatMaxReduction<Op>::value) {
LaunchRowReduce<CudaMaxReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
}
} else {
if (IsFloatSumReduction<Op>::value) {
LaunchColReduce<CudaSumReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
} else if (IsFloatMaxReduction<Op>::value) {
LaunchColReduce<CudaMaxReducer, LHSEval, RHSEval,
Compatible>::launch(device, input, rows, cols,
output);
}
}
}
input.cleanup();
}
}
};
template <typename InExpr, typename Op, typename Indices, bool Tileable>
inline void
TensorExecutor<const TensorEvalToOp<
const TensorReductionOp<Op, const Indices, const InExpr> >,
GpuDevice, false, Tileable>::run(const Expression& expr,
const GpuDevice& device) {
TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp<
Op, const Indices, const InExpr> >,
GpuDevice, false>::run(expr, device);
}
template <typename InExpr, typename Op, typename Indices, bool Tileable>
inline void
TensorExecutor<const TensorEvalToOp<
const TensorReductionOp<Op, const Indices, const InExpr> >,
GpuDevice, true, Tileable>::run(const Expression& expr,
const GpuDevice& device) {
TensorEvalToExecutorHelper<const TensorEvalToOp<const TensorReductionOp<
Op, const Indices, const InExpr> >,
GpuDevice, true>::run(expr, device);
}
} // end namespace internal
} // end namespace Eigen
#endif // __CUDACC__
#endif // EIGEN_USE_GPU
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H