Update Eigen to commit: 464c1d097891a1462ab28bf8bb763c1683883892

CHANGELOG
=========
464c1d097 - Format TensorDeviceThreadPool.h & use if constexpr for c++20.
21223f6bb - Fix addition of different enum types.
350544eb0 - Clean up TensorDeviceThreadPool.h
43810fc1b - Fix extra semicolon in DeviceWrapper
d28041ed5 - refactor AssignmentFunctors.h, unify with existing scalar_op
9a8621403 - Optimize division operations in TensorVolumePatch.h
be5147b09 - Fix STL feature detection for c++20.
179a49684 - Fix CMake BOOST warning

PiperOrigin-RevId: 735405240
Change-Id: Ie55bcb8e50a5323c276230c5a89ac630c547bfb1
diff --git a/Eigen/src/Core/DeviceWrapper.h b/Eigen/src/Core/DeviceWrapper.h
index 3ae8256..75fc8e7 100644
--- a/Eigen/src/Core/DeviceWrapper.h
+++ b/Eigen/src/Core/DeviceWrapper.h
@@ -66,7 +66,7 @@
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src, const Functor& func,
                                                         Device&) {
     Base::run(dst, src, func);
-  };
+  }
 };

 // specialization for coeffcient-wise assignment
diff --git a/Eigen/src/Core/StlIterators.h b/Eigen/src/Core/StlIterators.h
index 25d4575..4576cc0 100644
--- a/Eigen/src/Core/StlIterators.h
+++ b/Eigen/src/Core/StlIterators.h
@@ -325,7 +325,7 @@
  public:
   typedef Index difference_type;
   typedef typename XprType::Scalar value_type;
-#if EIGEN_CPLUSPLUS >= 202002L
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_concepts) && __cpp_lib_concepts >= 202002L
   typedef std::conditional_t<XprType::InnerStrideAtCompileTime == 1, std::contiguous_iterator_tag,
                              std::random_access_iterator_tag>
       iterator_category;
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index 3687bb2..7edcc60 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -27,7 +27,7 @@
   template <int Alignment, typename Packet>
   EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
-    internal::pstoret<DstScalar, Packet, Alignment>(a, b);
+    pstoret<DstScalar, Packet, Alignment>(a, b);
   }
 };
@@ -36,7 +36,7 @@
 struct assign_op<DstScalar, void> {};

 template <typename DstScalar, typename SrcScalar>
-struct functor_traits<assign_op<DstScalar, SrcScalar> > {
+struct functor_traits<assign_op<DstScalar, SrcScalar>> {
   enum {
     Cost = NumTraits<DstScalar>::ReadCost,
     PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::Vectorizable &&
@@ -45,88 +45,70 @@
 };

 /** \internal
- * \brief Template functor for scalar/packet assignment with addition
+ * \brief Template functor for scalar/packet compound assignment
  *
  */
-template <typename DstScalar, typename SrcScalar>
-struct add_assign_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a += b; }
+template <typename DstScalar, typename SrcScalar, typename Func>
+struct compound_assign_op {
+  using traits = functor_traits<compound_assign_op<DstScalar, SrcScalar, Func>>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(DstScalar& a, const SrcScalar& b) const {
+    assign_op<DstScalar, DstScalar>().assignCoeff(a, Func().operator()(a, b));
+  }

   template <int Alignment, typename Packet>
   EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
-    internal::pstoret<DstScalar, Packet, Alignment>(a, internal::padd(internal::ploadt<Packet, Alignment>(a), b));
+    assign_op<DstScalar, DstScalar>().template assignPacket<Alignment, Packet>(
+        a, Func().packetOp(ploadt<Packet, Alignment>(a), b));
   }
 };
-template <typename DstScalar, typename SrcScalar>
-struct functor_traits<add_assign_op<DstScalar, SrcScalar> > {
+
+template <typename DstScalar, typename SrcScalar, typename Func>
+struct functor_traits<compound_assign_op<DstScalar, SrcScalar, Func>> {
   enum {
-    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
-    PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::HasAdd
+    Cost = int(functor_traits<assign_op<DstScalar, DstScalar>>::Cost) + int(functor_traits<Func>::Cost),
+    PacketAccess = functor_traits<assign_op<DstScalar, DstScalar>>::PacketAccess && functor_traits<Func>::PacketAccess
   };
 };

 /** \internal
+ * \brief Template functor for scalar/packet assignment with addition
+ *
+ */
+template <typename DstScalar, typename SrcScalar = DstScalar>
+struct add_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_sum_op<DstScalar, SrcScalar>> {};
+
+template <typename DstScalar, typename SrcScalar>
+struct functor_traits<add_assign_op<DstScalar, SrcScalar>> : add_assign_op<DstScalar, SrcScalar>::traits {};
+
+/** \internal
  * \brief Template functor for scalar/packet assignment with subtraction
  *
  */
-template <typename DstScalar, typename SrcScalar>
-struct sub_assign_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a -= b; }
+template <typename DstScalar, typename SrcScalar = DstScalar>
+struct sub_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_difference_op<DstScalar, SrcScalar>> {};

-  template <int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
-    internal::pstoret<DstScalar, Packet, Alignment>(a, internal::psub(internal::ploadt<Packet, Alignment>(a), b));
-  }
-};
 template <typename DstScalar, typename SrcScalar>
-struct functor_traits<sub_assign_op<DstScalar, SrcScalar> > {
-  enum {
-    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
-    PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::HasSub
-  };
-};
+struct functor_traits<sub_assign_op<DstScalar, SrcScalar>> : sub_assign_op<DstScalar, SrcScalar>::traits {};

 /** \internal
  * \brief Template functor for scalar/packet assignment with multiplication
  *
  */
 template <typename DstScalar, typename SrcScalar = DstScalar>
-struct mul_assign_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a *= b; }
+struct mul_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_product_op<DstScalar, SrcScalar>> {};

-  template <int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
-    internal::pstoret<DstScalar, Packet, Alignment>(a, internal::pmul(internal::ploadt<Packet, Alignment>(a), b));
-  }
-};
 template <typename DstScalar, typename SrcScalar>
-struct functor_traits<mul_assign_op<DstScalar, SrcScalar> > {
-  enum {
-    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
-    PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::HasMul
-  };
-};
+struct functor_traits<mul_assign_op<DstScalar, SrcScalar>> : mul_assign_op<DstScalar, SrcScalar>::traits {};

 /** \internal
- * \brief Template functor for scalar/packet assignment with diviving
+ * \brief Template functor for scalar/packet assignment with dividing
  *
  */
 template <typename DstScalar, typename SrcScalar = DstScalar>
-struct div_assign_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a /= b; }
+struct div_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_quotient_op<DstScalar, SrcScalar>> {};

-  template <int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
-    internal::pstoret<DstScalar, Packet, Alignment>(a, internal::pdiv(internal::ploadt<Packet, Alignment>(a), b));
-  }
-};
 template <typename DstScalar, typename SrcScalar>
-struct functor_traits<div_assign_op<DstScalar, SrcScalar> > {
-  enum {
-    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
-    PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::HasDiv
-  };
-};
+struct functor_traits<div_assign_op<DstScalar, SrcScalar>> : div_assign_op<DstScalar, SrcScalar>::traits {};

 /** \internal
  * \brief Template functor for scalar/packet assignment with swapping
@@ -158,7 +140,7 @@
   }
 };
 template <typename Scalar>
-struct functor_traits<swap_assign_op<Scalar> > {
+struct functor_traits<swap_assign_op<Scalar>> {
   enum {
     Cost = 3 * NumTraits<Scalar>::ReadCost,
     PacketAccess =
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 97cf4f3..0facd26 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -1316,7 +1316,8 @@
  * This wraps C++20's std::construct_at, using placement new instead if it is not available.
  */
-#if EIGEN_COMP_CXXVER >= 20
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_constexpr_dynamic_alloc) && \
+    __cpp_lib_constexpr_dynamic_alloc >= 201907L
 using std::construct_at;
 #else
 template <class T, class... Args>
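Note: std::construct_at is only guaranteed when the standard library advertises __cpp_lib_constexpr_dynamic_alloc, which is why the hunk above tightens the guard beyond the language-version check. A minimal standalone sketch of the same idiom (an illustrative reconstruction, not the verbatim Eigen source):

// Gate on the library feature-test macro, not just the language version: a
// C++20 compiler may be paired with a standard library lacking construct_at.
#include <memory>   // std::construct_at (when available)
#include <new>      // placement new
#include <utility>  // std::forward

#if defined(__cpp_lib_constexpr_dynamic_alloc) && __cpp_lib_constexpr_dynamic_alloc >= 201907L
using std::construct_at;
#else
template <class T, class... Args>
T* construct_at(T* p, Args&&... args) {
  // Fallback: placement new at the given address, mirroring construct_at.
  return ::new (static_cast<void*>(p)) T(std::forward<Args>(args)...);
}
#endif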
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 39a117e..40604f8 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -220,7 +220,7 @@
  *
  * Post C++17: Uses std::void_t
  */
-#if EIGEN_COMP_CXXVER >= 17
+#if EIGEN_COMP_CXXVER >= 17 && defined(__cpp_lib_void_t) && __cpp_lib_void_t >= 201411L
 using std::void_t;
 #else
 template <typename...>
@@ -338,7 +338,16 @@
  *
  * For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function.
  */
-#if EIGEN_COMP_CXXVER < 20 || EIGEN_GNUC_STRICT_LESS_THAN(10, 0, 0)
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_ssize) && __cpp_lib_ssize >= 201902L
+
+template <typename T>
+EIGEN_CONSTEXPR auto index_list_size(T&& x) {
+  using std::ssize;
+  return ssize(std::forward<T>(x));
+}
+
+#else
+
 template <typename T>
 EIGEN_CONSTEXPR auto index_list_size(const T& x) {
   using R = std::common_type_t<std::ptrdiff_t, std::make_signed_t<decltype(x.size())>>;
@@ -349,13 +358,7 @@
 EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) {
   return N;
 }
-#else
-template <typename T>
-EIGEN_CONSTEXPR auto index_list_size(T&& x) {
-  using std::ssize;
-  return ssize(std::forward<T>(x));
-}
-#endif  // EIGEN_COMP_CXXVER
+#endif

 /** \internal
  * Convenient struct to get the result type of a nullary, unary, binary, or
@@ -745,7 +748,7 @@
 inline constexpr bool check_implication(bool a, bool b) { return !a || b; }

 /// \internal Provide fallback for std::is_constant_evaluated for pre-C++20.
-#if EIGEN_COMP_CXXVER >= 20
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_is_constant_evaluated) && __cpp_lib_is_constant_evaluated >= 201811L
 using std::is_constant_evaluated;
 #else
 constexpr bool is_constant_evaluated() { return false; }
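Note: all three Meta.h hunks follow the same rule as the Memory.h change: test the library's feature macro rather than EIGEN_COMP_CXXVER alone. A self-contained sketch of the ssize case with a usage check (a standalone reimplementation for illustration, not the Eigen internal):

// Prefer std::ssize (or an ADL-found ssize) when __cpp_lib_ssize is
// advertised; otherwise compute a signed size by hand with the same semantics.
#include <cstddef>
#include <iterator>
#include <type_traits>
#include <utility>
#include <vector>

#if defined(__cpp_lib_ssize) && __cpp_lib_ssize >= 201902L
template <typename T>
constexpr auto index_list_size(T&& x) {
  using std::ssize;  // also picks up ADL-discoverable ssize overloads
  return ssize(std::forward<T>(x));
}
#else
template <typename T>
constexpr auto index_list_size(const T& x) {
  // Manual fallback with std::ssize semantics: a signed type wide enough for
  // both ptrdiff_t and the container's size type.
  using R = std::common_type_t<std::ptrdiff_t, std::make_signed_t<decltype(x.size())>>;
  return static_cast<R>(x.size());
}
#endif

// Either branch yields a signed result, so reverse loops such as
// `for (auto i = index_list_size(v) - 1; i >= 0; --i)` terminate correctly.
static_assert(std::is_signed_v<decltype(index_list_size(std::declval<std::vector<int>&>()))>);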
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 3a67ab1..813cc53 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -384,7 +384,7 @@
 endif()

 # boost MP unit test
-find_package(Boost 1.53.0)
+find_package(Boost 1.53.0 CONFIG)
 if(Boost_FOUND)
   include_directories(${Boost_INCLUDE_DIRS})
   ei_add_test(boostmultiprec "" "${Boost_LIBRARIES}")
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 288d79f..99e7304 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -932,7 +932,7 @@
       kernel(m, n, k, use_thread_local);
     } else {
       eigen_assert(!use_thread_local);
-      device_.enqueueNoNotification([this, m, n, k, use_thread_local]() {
+      device_.enqueue([this, m, n, k, use_thread_local]() {
         kernel(m, n, k, use_thread_local);
       });
     }
@@ -982,7 +982,7 @@
     } else {
       while (end - start > 1) {
         Index mid = (start + end) / 2;
-        device_.enqueueNoNotification([this, mid, end, k, rhs]() {
+        device_.enqueue([this, mid, end, k, rhs]() {
           enqueue_packing_helper(mid, end, k, rhs);
         });
         end = mid;
@@ -1000,7 +1000,7 @@
         (k > 0 || std::this_thread::get_id() == created_by_thread_id_);

     if (pack_async) {
-      device_.enqueueNoNotification([this, start, end, k, rhs]() {
+      device_.enqueue([this, start, end, k, rhs]() {
         enqueue_packing_helper(start, end, k, rhs);
       });
     } else {
@@ -1264,7 +1264,7 @@
   void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) {
     while (end_block_idx - start_block_idx > 1) {
       Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
-      evaluator->m_device.enqueueNoNotification([this, &barrier, mid_block_idx, end_block_idx]() {
+      evaluator->m_device.enqueue([this, &barrier, mid_block_idx, end_block_idx]() {
         eval<Alignment>(barrier, mid_block_idx, end_block_idx);
       });
       end_block_idx = mid_block_idx;
@@ -1282,7 +1282,7 @@
   void evalAsync(Index start_block_idx, Index end_block_idx) {
     while (end_block_idx - start_block_idx > 1) {
       Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
-      evaluator->m_device.enqueueNoNotification(
+      evaluator->m_device.enqueue(
           [this, mid_block_idx, end_block_idx]() { evalAsync<Alignment>(mid_block_idx, end_block_idx); });
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index c95c8f2..3320990 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -15,35 +15,6 @@

 namespace Eigen {

-// Runs an arbitrary function and then calls Notify() on the passed in
-// Notification.
-template <typename Function, typename... Args>
-struct FunctionWrapperWithNotification {
-  static void run(Notification* n, Function f, Args... args) {
-    f(args...);
-    if (n) {
-      n->Notify();
-    }
-  }
-};
-
-template <typename Function, typename... Args>
-struct FunctionWrapperWithBarrier {
-  static void run(Barrier* b, Function f, Args... args) {
-    f(args...);
-    if (b) {
-      b->Notify();
-    }
-  }
-};
-
-template <typename SyncType>
-static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
-  if (n) {
-    n->Wait();
-  }
-}
-
 // An abstract interface to a device specific memory allocator.
 class Allocator {
  public:
@@ -98,8 +69,9 @@
     Barrier barrier(static_cast<int>(num_threads - 1));
     // Launch the last 3 blocks on worker threads.
     for (size_t i = 1; i < num_threads; ++i) {
-      enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
+      pool_->Schedule([n, i, src_ptr, dst_ptr, blocksize, &barrier] {
         ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize, numext::mini(blocksize, n - (i * blocksize)));
+        barrier.Notify();
       });
     }
     // Launch the first block on the main thread.
@@ -140,24 +112,22 @@
     return 1;
   }

-  template <class Function, class... Args>
-  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
-    Notification* n = new Notification();
-    pool_->Schedule(
-        std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, std::forward<Function>(f), args...));
-    return n;
-  }
-
-  template <class Function, class... Args>
-  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f, Args&&... args) const {
-    pool_->Schedule(
-        std::bind(&FunctionWrapperWithBarrier<Function, Args...>::run, b, std::forward<Function>(f), args...));
-  }
-
+  // TODO(rmlarsen): Remove this deprecated interface when all users have been converted.
   template <class Function, class... Args>
   EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
+    enqueue(std::forward<Function>(f), std::forward<Args>(args)...);
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueue(Function&& f, Args&&... args) const {
+#if EIGEN_COMP_CXXVER >= 20
+    if constexpr (sizeof...(args) > 0) {
+      auto run_f = [f = std::forward<Function>(f), ... args = std::forward<Args>(args)]() { f(args...); };
+#else
     if (sizeof...(args) > 0) {
-      pool_->Schedule(std::bind(std::forward<Function>(f), args...));
+      auto run_f = [f = std::forward<Function>(f), &args...]() { f(args...); };
+#endif
+      pool_->Schedule(std::move(run_f));
     } else {
       pool_->Schedule(std::forward<Function>(f));
     }
@@ -191,27 +161,14 @@
     // Division code rounds mid to block_size, so we are guaranteed to get
     // block_count leaves that do actual computations.
     Barrier barrier(static_cast<unsigned int>(block.count));
-    std::function<void(Index, Index)> handleRange;
-    handleRange = [this, block, &handleRange, &barrier, &f](Index firstIdx, Index lastIdx) {
-      while (lastIdx - firstIdx > block.size) {
-        // Split into halves and schedule the second half on a different thread.
-        const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
-        pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
-        lastIdx = midIdx;
-      }
-      // Single block or less, execute directly.
-      f(firstIdx, lastIdx);
-      barrier.Notify();
-    };
-
     if (block.count <= numThreads()) {
       // Avoid a thread hop by running the root of the tree and one block on the
       // main thread.
-      handleRange(0, n);
+      handleRange(0, n, block.size, &barrier, pool_, f);
     } else {
       // Execute the root in the thread pool to avoid running work on more than
       // numThreads() threads.
-      pool_->Schedule([=, &handleRange]() { handleRange(0, n); });
+      pool_->Schedule([this, n, &block, &barrier, &f]() { handleRange(0, n, block.size, &barrier, pool_, f); });
     }
     barrier.Wait();
@@ -287,6 +244,19 @@
  private:
   typedef TensorCostModel<ThreadPoolDevice> CostModel;

+  static void handleRange(Index firstIdx, Index lastIdx, Index granularity, Barrier* barrier, ThreadPoolInterface* pool,
+                          const std::function<void(Index, Index)>& f) {
+    while (lastIdx - firstIdx > granularity) {
+      // Split into halves and schedule the second half on a different thread.
+      const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, granularity) * granularity;
+      pool->Schedule([=, &f]() { handleRange(midIdx, lastIdx, granularity, barrier, pool, f); });
+      lastIdx = midIdx;
+    }
+    // Single block or less, execute directly.
+    f(firstIdx, lastIdx);
+    barrier->Notify();
+  }
+
   // For parallelForAsync we must keep passed in closures on the heap, and
   // delete them only after `done` callback finished.
   struct ParallelForAsyncContext {
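Note: the #if split in the new enqueue() exists because a lambda cannot init-capture a parameter pack before C++20. A minimal sketch of the C++20 branch, assuming a stand-in queue in place of the real thread pool's Schedule():

// C++20 pack init-capture: each argument is moved/copied into the closure,
// so the task stays valid after the caller's stack frame is gone.
#include <functional>
#include <utility>
#include <vector>

std::vector<std::function<void()>> scheduled;  // stand-in for the pool's queue

template <class Function, class... Args>
void enqueue(Function&& f, Args&&... args) {
  if constexpr (sizeof...(args) > 0) {
    // One init-capture per pack element: [..., ... args = std::forward<Args>(args)]
    scheduled.emplace_back(
        [f = std::forward<Function>(f), ... args = std::forward<Args>(args)]() { f(args...); });
  } else {
    scheduled.emplace_back(std::forward<Function>(f));
  }
}

// Usage: enqueue([](int a, int b) { /* work */ }, 1, 2); then drain `scheduled`.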
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index b1da1a5..da33210 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -352,7 +352,7 @@
         TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
         evaluator.evalBlock(desc, scratch);
       } else {
-        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, eval_block);
+        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, std::move(eval_block));
       }
     }
     evaluator.cleanup();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 62c54e3..9bbf945 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -357,17 +357,6 @@
 };

 #ifdef EIGEN_USE_THREADS
-// Multithreaded full reducers
-template <typename Self, typename Op,
-          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
-struct FullReducerShard {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
-                                                        typename Self::Index numValuesToReduce, Op& reducer,
-                                                        typename Self::CoeffReturnType* output) {
-    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, firstIndex, numValuesToReduce, reducer);
-  }
-};
-
 // Multithreaded full reducer
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
@@ -397,8 +386,11 @@
     Barrier barrier(internal::convert_index<unsigned int>(numblocks));
     MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
     for (Index i = 0; i < numblocks; ++i) {
-      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run, self, i * blocksize,
-                                  blocksize, reducer, &shards[i]);
+      auto run_shard = [i, blocksize, &self, &barrier, &shards, &reducer]() {
+        shards[i] = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, i * blocksize, blocksize, reducer);
+        barrier.Notify();
+      };
+      device.enqueue(std::move(run_shard));
     }
     typename Self::CoeffReturnType finalShard;
     if (numblocks * blocksize < num_coeffs) {
@@ -888,10 +880,6 @@
   friend struct internal::InnerMostDimPreserver;
   template <typename S, typename O, typename D, bool V>
   friend struct internal::FullReducer;
-#ifdef EIGEN_USE_THREADS
-  template <typename S, typename O, bool V>
-  friend struct internal::FullReducerShard;
-#endif
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
   template <int B, int N, typename S, typename R, typename I_>
   KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
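Note: the lambda that replaces FullReducerShard needs no locking because each task writes a distinct output slot and then signals a barrier. A self-contained sketch of the pattern using standard C++20 std::latch and std::jthread in place of Eigen's Barrier and thread pool:

// Shard-per-slot reduction: every task writes its own element of `partial`,
// so the only synchronization is the latch (analogous to Eigen's Barrier).
#include <cassert>
#include <cstddef>
#include <latch>
#include <numeric>
#include <thread>
#include <vector>

double parallel_sum(const std::vector<double>& data, int num_shards) {
  assert(num_shards > 0);
  std::vector<double> partial(num_shards, 0.0);
  std::latch done(num_shards);
  const std::size_t block = data.size() / num_shards;
  {
    std::vector<std::jthread> workers;
    for (int i = 0; i < num_shards; ++i) {
      workers.emplace_back([&, i] {
        const std::size_t begin = static_cast<std::size_t>(i) * block;
        const std::size_t end = (i + 1 == num_shards) ? data.size() : begin + block;
        partial[i] = std::accumulate(data.begin() + begin, data.begin() + end, 0.0);
        done.count_down();  // analogous to barrier.Notify()
      });
    }
    done.wait();  // analogous to barrier.Wait(); jthreads join on scope exit
  }
  // Final pass combines the per-shard results, like finalShard in the hunk above.
  return std::accumulate(partial.begin(), partial.end(), 0.0);
}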
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index ce712e2..cf69fef 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -448,8 +448,11 @@
     eigen_assert(otherIndex == indices[1] / m_fastOtherStride);

     // Find the offset of the element wrt the location of the first element.
-    const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
-                                   (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};
+    Index first_entry = (indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth;
+    Index second_entry = PacketSize == 1 ? first_entry :
+        (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth;
+
+    const Index patchOffsets[2] = {first_entry, second_entry};

     const Index patch3DIndex =
         (NumDims == 5) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
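Note: this hunk skips the second integer division whenever PacketSize == 1, since the packet then covers a single element and indices[0] == indices[1], so the first quotient can be reused. A generic sketch of the pattern (names are illustrative, not Eigen's):

// When the two packet lanes coincide (packet_size == 1), reuse the first
// quotient instead of issuing a second integer division, which is one of the
// more expensive scalar operations on most CPUs.
#include <cstdint>

void compute_patch_offsets(std::int64_t idx0, std::int64_t idx1, std::int64_t depth_stride,
                           int packet_size, std::int64_t offsets[2]) {
  offsets[0] = idx0 / depth_stride;                       // always required
  offsets[1] = (packet_size == 1) ? offsets[0]            // lanes identical: reuse
                                  : idx1 / depth_stride;  // lanes differ: divide
}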