// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: eigen@codeplay.com
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Reduction kernel functors for the SYCL target of the Tensor module
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
namespace Eigen {
namespace TensorSycl {
namespace internal {
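/// GenericKernelReducer performs one pass of a work-group tree reduction:
/// each work-item loads one coefficient of aI into local scratch memory, the
/// work-group combines its slice pairwise with OP, and work-item 0 writes the
/// group's partial result back to aI[group]. Once the remaining length fits
/// inside a single work-group, the final value is also written to aOut at
/// out_offset.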
template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
OP op;
OutputAccessor aOut;
ptrdiff_t out_offset;
InputAccessor aI;
LocalAccessor scratch;
size_t length, local;
GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
: op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
void operator()(cl::sycl::nd_item<1> itemID) {
size_t globalid = itemID.get_global(0);
size_t localid = itemID.get_local(0);
/* All work-items collectively read from global memory into local memory.
* The barrier ensures every work-item's load has completed before
* execution continues (strictly speaking, only the work-items within a
* single work-group are synchronised - there is no coordination between
* work-groups). */
if (globalid < length) {
scratch[localid] = aI[globalid];
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
/* Apply the reduction operation between the current local
* id and the one on the other half of the vector. */
if (globalid < length) {
auto min = (length < local) ? length : local;
for (size_t offset = min / 2; offset > 0; offset /= 2) {
if (localid < offset) {
auto accum = op.initialize();
op.reduce(scratch[localid], &accum);
op.reduce(scratch[localid + offset], &accum);
op.finalize(accum);
scratch[localid] = accum;
// For a plain sum reducer this is equivalent to: scratch[localid] += scratch[localid + offset];
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
}
/* The final result will be stored in local id 0. */
if (localid == 0) {
aI[itemID.get_group(0)] = scratch[localid];
if((length <= local) && globalid == 0){
auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
aOutPtr[ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = scratch[0];
}
}
}
}
};
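/* Illustrative sketch (not part of the library): for a plain sum reducer the
 * loop above collapses to the classic local-memory reduction tree, halving
 * the number of active work-items at every level:
 *
 *   for (size_t offset = min / 2; offset > 0; offset /= 2) {
 *     if (localid < offset)
 *       scratch[localid] += scratch[localid + offset];  // pairwise combine
 *     itemID.barrier(cl::sycl::access::fence_space::local_space);
 *   }
 *   // after log2(min) levels, scratch[0] holds the work-group's result
 */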
/// ReductionFunctor: each work-item computes one coefficient of the (partially) reduced output tensor.
template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_, Dims dims_, Op functor_, Index range_, Index)
:output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// The reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
/// it is either the root expression, used to launch the sub-kernel, or a leaf node that passes its
/// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
const auto device_self_expr = Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
/// This is the evaluator for device_self_expr. It is identical to the self expression passed to the run function, except that
/// the device_evaluator is detectable and recognisable on the device.
typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
auto output_accessor_ptr = ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
/// const_cast is used below as a naive workaround for the qualifier-drop error.
auto globalid = static_cast<Index>(itemID.get_global_linear_id());
if (globalid < range) {
typename DeviceSelf::CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
functor.finalize(accum);
output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)] = accum;
}
}
private:
write_accessor output_accessor;
ptrdiff_t out_offset;
FunctorExpr functors;
Tuple_of_Acc tuple_of_accessors;
Dims dims;
Op functor;
Index range;
};
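/* For reference (hypothetical pseudocode, not part of the library): each
 * work-item with globalid < range performs the serial equivalent of
 *
 *   CoeffReturnType accum = functor.initialize();
 *   for (each input coefficient i folded into output coefficient globalid)  // walked by GenericDimReducer
 *     functor.reduce(input(i), &accum);
 *   functor.finalize(accum);
 *   output[globalid + out_offset] = accum;
 *
 * i.e. one work-item per coefficient of the reduced output tensor. */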
template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index>
class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_, Dims dims_,
Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
:output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// The reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
/// it is either the root expression, used to launch the sub-kernel, or a leaf node that passes its
/// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
const auto device_self_expr = Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
/// This is the evaluator for device_self_expr. It is identical to the self expression passed to the run function, except that
/// the device_evaluator is detectable and recognisable on the device.
typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
auto output_accessor_ptr = ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
/// const_cast is used below as a naive workaround for the qualifier-drop error.
auto globalid = static_cast<Index>(itemID.get_global_linear_id());
if (globalid < range) {
typename DeviceSelf::CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
functor.finalize(accum);
output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)] = accum/num_values_to_reduce;
}
}
private:
write_accessor output_accessor;
ptrdiff_t out_offset;
FunctorExpr functors;
Tuple_of_Acc tuple_of_accessors;
Dims dims;
Op functor;
Index range;
Index num_values_to_reduce;
};
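/* Note on the specialization above: on the device the MeanReducer is replaced
 * by a SumReducer, and the mean is recovered at the end by dividing the
 * accumulated sum by num_values_to_reduce. Illustrative arithmetic: reducing
 * the 4 values {1, 2, 3, 6} accumulates 12, and 12 / 4 == 3 is their mean. */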
template<typename CoeffReturnType, typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
class FullReductionKernelFunctor{
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
OutAccessor tmp_global_accessor;
Index rng, remaining, red_factor;
Op op;
Dims dims;
FunctorExpr functors;
TupleType tuple_of_accessors;
FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Op op_, Dims dims_, FunctorExpr functors_, TupleType t_acc)
:tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_), op(op_), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// The reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
/// it is either the root expression, used to launch the sub-kernel, or a leaf node that passes its
/// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
const auto device_self_expr = Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, op);
/// This is the evaluator for device_self_expr. It is identical to the self expression passed to the run function, except that
/// the device_evaluator is detectable and recognisable on the device.
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
/// const_cast is used below as a naive workaround for the qualifier-drop error.
auto globalid = itemID.get_global_linear_id();
tmp_global_accessor.get_pointer()[globalid] = (globalid < rng) ? Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op))
: static_cast<CoeffReturnType>(op.initialize());
if(remaining != 0 && globalid == 0){
// Fold in the tail of the input buffer when the input size is not divisible by red_factor.
auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
auto accum = op.initialize();
op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
op.reduce(remaining_reduce, &accum);
op.finalize(accum);
tmp_global_accessor.get_pointer()[0] = accum;
}
}
};
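/* Illustrative arithmetic (inferred from the parameters above): the input of
 * length n is decomposed as n == rng * red_factor + remaining. Each work-item
 * g < rng reduces the red_factor consecutive coefficients starting at
 * g * red_factor; work-item 0 then folds in the trailing `remaining`
 * coefficients. E.g. n = 103 with red_factor = 10 gives rng = 10 partial
 * results plus a tail of 3 handled by work-item 0. */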
template<typename CoeffReturnType, typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Dims, typename Index, typename TupleType>
class FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Eigen::internal::MeanReducer<CoeffReturnType>, Dims, Index, TupleType>{
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
OutAccessor tmp_global_accessor;
Index rng, remaining, red_factor;
Op op;
Dims dims;
FunctorExpr functors;
TupleType tuple_of_accessors;
FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer<CoeffReturnType>, Dims dims_, FunctorExpr functors_, TupleType t_acc)
:tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_), op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// The reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
/// it is either the root expression, used to launch the sub-kernel, or a leaf node that passes its
/// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
const auto device_self_expr = Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, op);
/// This is the evaluator for device_self_expr. It is identical to the self expression passed to the run function, except that
/// the device_evaluator is detectable and recognisable on the device.
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
/// const_cast is used below as a naive workaround for the qualifier-drop error.
auto globalid = itemID.get_global_linear_id();
auto scale = (rng*red_factor) + remaining;
tmp_global_accessor.get_pointer()[globalid] = (globalid < rng) ? ((Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)))/scale)
: static_cast<CoeffReturnType>(op.initialize())/scale;
if(remaining != 0 && globalid == 0){
// Fold in the tail of the input buffer when the input size is not divisible by red_factor.
auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
auto accum = op.initialize();
tmp_global_accessor.get_pointer()[0] = tmp_global_accessor.get_pointer()[0]*scale;
op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
op.reduce(remaining_reduce, &accum);
op.finalize(accum);
tmp_global_accessor.get_pointer()[0] = accum/scale;
}
}
};
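/* Note on the mean specialization above: every partial result is pre-divided
 * by scale = rng * red_factor + remaining (the total number of reduced
 * coefficients), so a later pass can simply sum the partials. The tail fix-up
 * therefore multiplies tmp_global_accessor[0] back by scale, folds the tail
 * sum in, and divides by scale again, preserving the invariant that stored
 * partials are already scaled. */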
} // namespace internal
} // namespace TensorSycl
} // namespace Eigen
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H