unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h - eigen - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Mehdi Goli    Codeplay Software Ltd.
 // Ralph Potter  Codeplay Software Ltd.
 // Luke Iwanski  Codeplay Software Ltd.
 // Contact: <eigen@codeplay.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

 /*****************************************************************
  * TensorReductionSycl.h
  *
  * \brief:
  *  This is the specialization of the reduction operation. Two phase reduction approach
  * is used since the GPU does not have Global Synchronization for global memory among
  * different work-group/thread block. To solve the problem, we need to create two kernels
  * to reduce the data, where the first kernel reduce the data locally and each local
  * workgroup/thread-block save the input data into global memory. In the second phase (global reduction)
  * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element.
  * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
  * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
  *
  *****************************************************************/

 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
 // IWYU pragma: private
 #include "./InternalHeaderCheck.h"

 namespace Eigen {
 namespace TensorSycl {
 namespace internal {

 template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
 struct OpDefiner {
   typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
   typedef Op type;
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }

   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                             const Index &) {
     return accumulator;
   }
 };

 template <typename CoeffReturnType, typename Index>
 struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
   typedef Eigen::internal::SumReducer<CoeffReturnType> type;
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
     return type();
   }

   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                            const Index &scale) {
     ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
     return quotient_op(accumulator, CoeffReturnType(scale));
   }
 };

 template <typename CoeffReturnType, typename Index>
 struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
   typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
   typedef Eigen::internal::SumReducer<CoeffReturnType> type;
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
     return type();
   }

   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                             const Index &scale) {
     return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
   }
 };

 template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
           Index local_range>
 struct SecondStepFullReducer {
   typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
       LocalAccessor;
   typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
   typedef typename OpDef::type Op;
   LocalAccessor scratch;
   InputAccessor aI;
   OutputAccessor outAcc;
   Op op;
   SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
       : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

   void operator()(cl::sycl::nd_item<1> itemID) const {
     // Our empirical research shows that the best performance will be achieved
     // when there is only one element per thread to reduce in the second step.
     // in this step the second step reduction time is almost negligible.
     // Hence, in the second step of reduction the input size is fixed to the
     // local size, thus, there is only one element read per thread. The
     // algorithm must be changed if the number of reduce per thread in the
     // second step is greater than 1. Otherwise, the result will be wrong.
     const Index localid = itemID.get_local_id(0);
     auto aInPtr = aI + localid;
     auto aOutPtr = outAcc;
     CoeffReturnType *scratchptr = scratch.get_pointer();
     CoeffReturnType accumulator = *aInPtr;

     scratchptr[localid] = op.finalize(accumulator);
     for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
       itemID.barrier(cl::sycl::access::fence_space::local_space);
       if (localid < offset) {
         op.reduce(scratchptr[localid + offset], &accumulator);
         scratchptr[localid] = op.finalize(accumulator);
       }
     }
     if (localid == 0) *aOutPtr = op.finalize(accumulator);
   }
 };

 // Full reduction first phase. In this version the vectorization is true and the reduction accept
 // any generic reducerOp  e.g( max, min, sum, mean, iamax, iamin, etc ).
 template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
 class FullReductionKernelFunctor {
  public:
   typedef typename Evaluator::CoeffReturnType CoeffReturnType;
   typedef typename Evaluator::Index Index;
   typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
                     (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
       OpDef;

   typedef typename OpDef::type Op;
   typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
   typedef typename Evaluator::PacketReturnType PacketReturnType;
   typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), PacketReturnType,
                              CoeffReturnType>
       OutType;
   typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
       LocalAccessor;
   LocalAccessor scratch;
   Evaluator evaluator;
   EvaluatorPointerType final_output;
   Index rng;
   Op op;

   FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
                              Index rng_, OpType op_)
       : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

   void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }

   template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
       const cl::sycl::nd_item<1> &itemID) const {
     auto output_ptr = final_output;
     Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
     Index globalid = itemID.get_global_id(0);
     Index localid = itemID.get_local_id(0);
     Index step = Evaluator::PacketSize * itemID.get_global_range(0);
     Index start = Evaluator::PacketSize * globalid;
     // vectorizable parts
     PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
     for (Index i = start; i < VectorizedRange; i += step) {
       op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
     }
     globalid += VectorizedRange;
     // non vectorizable parts
     for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
       op.template reducePacket<PacketReturnType>(
           ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
               evaluator.impl().coeff(i), op.initialize()),
           &packetAccumulator);
     }
     scratch[localid] = packetAccumulator =
         OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
     // reduction parts // Local size is always power of 2
     EIGEN_UNROLL_LOOP
     for (Index offset = local_range / 2; offset > 0; offset /= 2) {
       itemID.barrier(cl::sycl::access::fence_space::local_space);
       if (localid < offset) {
         op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
         scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
       }
     }
     if (localid == 0) {
       output_ptr[itemID.get_group(0)] =
           op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
     }
   }

   template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
       const cl::sycl::nd_item<1> &itemID) const {
     auto output_ptr = final_output;
     Index globalid = itemID.get_global_id(0);
     Index localid = itemID.get_local_id(0);
     // vectorizable parts
     CoeffReturnType accumulator = op.initialize();
     // non vectorizable parts
     for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
       op.reduce(evaluator.impl().coeff(i), &accumulator);
     }
     scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);

     // reduction parts. the local size is always power of 2
     EIGEN_UNROLL_LOOP
     for (Index offset = local_range / 2; offset > 0; offset /= 2) {
       itemID.barrier(cl::sycl::access::fence_space::local_space);
       if (localid < offset) {
         op.reduce(scratch[localid + offset], &accumulator);
         scratch[localid] = op.finalize(accumulator);
       }
     }
     if (localid == 0) {
       output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
     }
   }
 };

 template <typename Evaluator, typename OpType>
 class GenericNondeterministicReducer {
  public:
   typedef typename Evaluator::CoeffReturnType CoeffReturnType;
   typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
   typedef typename Evaluator::Index Index;
   typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
   typedef typename OpDef::type Op;
   template <typename Scratch>
   GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                  Index range_, Index num_values_to_reduce_)
       : evaluator(evaluator_),
         output_accessor(output_accessor_),
         functor(OpDef::get_op(functor_)),
         range(range_),
         num_values_to_reduce(num_values_to_reduce_) {}

   void operator()(cl::sycl::nd_item<1> itemID) const {
     // This is to bypass the statefull condition in Eigen meanReducer
     Op non_const_functor;
     std::memcpy(&non_const_functor, &functor, sizeof(Op));
     auto output_accessor_ptr = output_accessor;
     Index globalid = static_cast<Index>(itemID.get_global_linear_id());
     if (globalid < range) {
       CoeffReturnType accum = functor.initialize();
       Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
           evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
       output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
     }
   }

  private:
   Evaluator evaluator;
   EvaluatorPointerType output_accessor;
   Op functor;
   Index range;
   Index num_values_to_reduce;
 };

 enum class reduction_dim { inner_most, outer_most };
 // default is preserver
 template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
 struct PartialReductionKernel {
   typedef typename Evaluator::CoeffReturnType CoeffReturnType;
   typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
   typedef typename Evaluator::Index Index;
   typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
   typedef typename OpDef::type Op;
   typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
       ScratchAcc;
   ScratchAcc scratch;
   Evaluator evaluator;
   EvaluatorPointerType output_accessor;
   Op op;
   const Index preserve_elements_num_groups;
   const Index reduce_elements_num_groups;
   const Index num_coeffs_to_preserve;
   const Index num_coeffs_to_reduce;

   PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
                          const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                          const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
       : scratch(scratch_),
         evaluator(evaluator_),
         output_accessor(output_accessor_),
         op(OpDef::get_op(op_)),
         preserve_elements_num_groups(preserve_elements_num_groups_),
         reduce_elements_num_groups(reduce_elements_num_groups_),
         num_coeffs_to_preserve(num_coeffs_to_preserve_),
         num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                  CoeffReturnType &accumulator) const {
     if (globalPId >= num_coeffs_to_preserve) {
       return;
     }
     Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
                                                           : globalRId + (globalPId * num_coeffs_to_reduce);
     Index localOffset = globalRId;

     const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
     const Index per_thread_global_stride =
         rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
     for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
       op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
       localOffset += per_thread_local_stride;
       global_offset += per_thread_global_stride;
     }
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
     const Index linearLocalThreadId = itemID.get_local_id(0);
     Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                            : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
     Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                            : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
     const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                            : itemID.get_group(0) / reduce_elements_num_groups;
     const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                            : itemID.get_group(0) % reduce_elements_num_groups;

     Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
     const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
     CoeffReturnType *scratchPtr = scratch.get_pointer();
     auto outPtr = output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
     CoeffReturnType accumulator = op.initialize();

     element_wise_reduce(globalRId, globalPId, accumulator);

     accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
     scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
         accumulator;
     if (rt == reduction_dim::inner_most) {
       pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
       rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
       globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
     }

     /* Apply the reduction operation between the current local
      * id and the one on the other half of the vector. */
     auto out_scratch_ptr =
         scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
     itemID.barrier(cl::sycl::access::fence_space::local_space);
     if (rt == reduction_dim::inner_most) {
       accumulator = *out_scratch_ptr;
     }
     // The Local LocalThreadSizeR is always power of 2
     EIGEN_UNROLL_LOOP
     for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
       if (rLocalThreadId < offset) {
         op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
         // The result has already been divided for mean reducer in the
         // previous reduction so no need to divide furthermore
         *out_scratch_ptr = op.finalize(accumulator);
       }
       /* All threads collectively read from global memory into local.
        * The barrier ensures all threads' IO is resolved before
        * execution continues (strictly speaking, all threads within
        * a single work-group - there is no co-ordination between
        * work-groups, only work-items). */
       itemID.barrier(cl::sycl::access::fence_space::local_space);
     }

     if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
       outPtr[globalPId] = op.finalize(accumulator);
     }
   }
 };

 template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
 struct SecondStepPartialReduction {
   typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
   typedef typename OpDef::type Op;
   typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
       ScratchAccessor;
   InputAccessor input_accessor;
   OutputAccessor output_accessor;
   Op op;
   const Index num_coeffs_to_preserve;
   const Index num_coeffs_to_reduce;

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
                                                                    OutputAccessor output_accessor_, OpType op_,
                                                                    const Index num_coeffs_to_preserve_,
                                                                    const Index num_coeffs_to_reduce_)
       : input_accessor(input_accessor_),
         output_accessor(output_accessor_),
         op(OpDef::get_op(op_)),
         num_coeffs_to_preserve(num_coeffs_to_preserve_),
         num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
     const Index globalId = itemID.get_global_id(0);

     if (globalId >= num_coeffs_to_preserve) return;

     auto in_ptr = input_accessor + globalId;

     OutScalar accumulator = op.initialize();
     // num_coeffs_to_reduce is not bigger that 256
     for (Index i = 0; i < num_coeffs_to_reduce; i++) {
       op.reduce(*in_ptr, &accumulator);
       in_ptr += num_coeffs_to_preserve;
     }
     output_accessor[globalId] = op.finalize(accumulator);
   }
 };  // namespace internal

 template <typename Index, Index LTP, Index LTR, bool BC_>
 struct ReductionPannel {
   static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
   static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
   static EIGEN_CONSTEXPR bool BC = BC_;
 };

 template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
 struct PartialReducerLauncher {
   typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
   typedef typename Self::CoeffReturnType CoeffReturnType;
   typedef typename Self::Storage Storage;
   typedef typename Self::Index Index;
   typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
       PannelParameters;

   typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

   static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                   Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
     Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);

     // getPowerOfTwo makes sure local range is power of 2 and <=
     // maxSyclThreadPerBlock this will help us to avoid extra check on the
     // kernel
     static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                     (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                   "The Local thread size must be a power of 2 for the reduction "
                   "operation");

     EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
     // In this step, we force the code not to be more than 2-step reduction:
     // Our empirical research shows that if each thread reduces at least 64
     // elements individually, we get better performance. However, this can change
     // on different platforms. In this step we force the code not to be
     // morthan step reduction: Our empirical research shows that for inner_most
     // dim reducer, it is better to have 8 group in a reduce dimension for sizes
     // > 1024 to achieve the best performance.
     const Index reductionPerThread = 64;
     Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
     const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
     Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
     const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
     const Index globalRange = pNumGroups * rNumGroups * localRange;

     EIGEN_CONSTEXPR Index scratchSize =
         PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
     auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
     if (rNumGroups > 1) {
       CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
           dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
       EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
       dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
              self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
              num_coeffs_to_reduce)
           .wait();
       typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
           SecondStepPartialReductionKernel;
       dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
              temp_accessor, output,
              cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)),
              Index(1), reducer, num_coeffs_to_preserve, rNumGroups)
           .wait();
       self.device().deallocate_temp(temp_pointer);
     } else {
       dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
              self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
              num_coeffs_to_reduce)
           .wait();
     }
     return false;
   }
 };
 }  // namespace internal
 }  // namespace TensorSycl

 namespace internal {

 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
   typedef typename Self::CoeffReturnType CoeffReturnType;
   typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
   static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
   static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
   static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
     typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
     static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                     (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                   "The Local thread size must be a power of 2 for the reduction "
                   "operation");
     EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

     typename Self::Index inputSize = self.impl().dimensions().TotalSize();
     // In this step we force the code not to be more than 2-step reduction:
     // Our empirical research shows that if each thread reduces at least 512
     // elements individually, we get better performance.
     const Index reductionPerThread = 2048;
     // const Index num_work_group =
     Index reductionGroup = dev.getPowerOfTwo(
         (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
     const Index num_work_group = std::min(reductionGroup, local_range);
     // 1
     // ? local_range
     // : 1);
     const Index global_range = num_work_group * local_range;

     auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
     typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
     if (num_work_group > 1) {
       CoeffReturnType *temp_pointer =
           static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
       typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
       dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                       local_range, inputSize, reducer)
           .wait();
       typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                           EvaluatorPointerType, Index, local_range>
           GenericRKernel;
       dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
              tmp_global_accessor, data,
              cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)),
              num_work_group, reducer)
           .wait();
       dev.deallocate_temp(temp_pointer);
     } else {
       dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                       reducer)
           .wait();
     }
   }
 };
 // vectorizable inner_most most dim preserver
 // col reduction
 template <typename Self, typename Op>
 struct OuterReducer<Self, Op, Eigen::SyclDevice> {
   static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

   static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                   typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                   typename Self::Index num_coeffs_to_preserve) {
     return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
         Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                  num_coeffs_to_reduce,
                                                                                  num_coeffs_to_preserve);
   }
 };
 // row reduction
 template <typename Self, typename Op>
 struct InnerReducer<Self, Op, Eigen::SyclDevice> {
   static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

   static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                   typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                   typename Self::Index num_coeffs_to_preserve) {
     return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
         Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                  num_coeffs_to_reduce,
                                                                                  num_coeffs_to_preserve);
   }
 };

 // ArmgMax uses this kernel for partial reduction//
 // TODO(@mehdi.goli) come up with a better kernel
 // generic partial reduction
 template <typename Self, typename Op>
 struct GenericReducer<Self, Op, Eigen::SyclDevice> {
   static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
   static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                   typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                   typename Self::Index num_coeffs_to_preserve) {
     typename Self::Index range, GRange, tileSize;
     dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

     dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                        TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
            self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
            reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1))
         .wait();
     return false;
   }
 };

 }  // namespace internal
 }  // namespace Eigen

 #endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP