Update Eigen to commit: 464c1d097891a1462ab28bf8bb763c1683883892
CHANGELOG
=========
464c1d097 - Format TensorDeviceThreadPool.h & use if constexpr for c++20.
21223f6bb - Fix addition of different enum types.
350544eb0 - Clean up TensorDeviceThreadPool.h
43810fc1b - Fix extra semicolon in DeviceWrapper
d28041ed5 - refactor AssignmentFunctors.h, unify with existing scalar_op
9a8621403 - Optimize division operations in TensorVolumePatch.h
be5147b09 - Fix STL feature detection for c++20.
179a49684 - Fix CMake BOOST warning
PiperOrigin-RevId: 735405240
Change-Id: Ie55bcb8e50a5323c276230c5a89ac630c547bfb1
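
Note on the AssignmentFunctors.h refactor (d28041ed5): the four hand-written
compound-assignment functors (add/sub/mul/div) are replaced by a single
template parameterized by the corresponding scalar binary functor. Below is a
minimal stand-alone sketch of that pattern; the names are illustrative only
and are not Eigen's actual internals.

    #include <iostream>

    // Sketch: a generic compound-assignment functor that applies a binary Op
    // to (dst, src) and writes the result back into dst. Eigen's real
    // compound_assign_op additionally forwards packet (SIMD) stores and
    // derives functor_traits from the wrapped functor.
    template <typename Scalar, typename Op>
    struct compound_assign {
      void operator()(Scalar& dst, const Scalar& src) const { dst = Op()(dst, src); }
    };

    // Illustrative binary functors standing in for scalar_sum_op / scalar_product_op.
    struct sum_op {
      template <typename T>
      T operator()(const T& a, const T& b) const { return a + b; }
    };
    struct product_op {
      template <typename T>
      T operator()(const T& a, const T& b) const { return a * b; }
    };

    // The concrete compound-assignment functors collapse to thin aliases.
    template <typename Scalar>
    using add_assign = compound_assign<Scalar, sum_op>;
    template <typename Scalar>
    using mul_assign = compound_assign<Scalar, product_op>;

    int main() {
      double x = 3.0;
      add_assign<double>()(x, 2.0);  // x += 2.0 -> 5.0
      mul_assign<double>()(x, 4.0);  // x *= 4.0 -> 20.0
      std::cout << x << "\n";        // prints 20
      return 0;
    }

In the actual change below, add_assign_op, sub_assign_op, mul_assign_op and
div_assign_op become empty structs deriving from compound_assign_op, and their
functor_traits (Cost, PacketAccess) are inherited from the wrapped scalar
functor rather than spelled out per operation.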
diff --git a/Eigen/src/Core/DeviceWrapper.h b/Eigen/src/Core/DeviceWrapper.h
index 3ae8256..75fc8e7 100644
--- a/Eigen/src/Core/DeviceWrapper.h
+++ b/Eigen/src/Core/DeviceWrapper.h
@@ -66,7 +66,7 @@
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src, const Functor& func,
Device&) {
Base::run(dst, src, func);
- };
+ }
};
// specialization for coeffcient-wise assignment
diff --git a/Eigen/src/Core/StlIterators.h b/Eigen/src/Core/StlIterators.h
index 25d4575..4576cc0 100644
--- a/Eigen/src/Core/StlIterators.h
+++ b/Eigen/src/Core/StlIterators.h
@@ -325,7 +325,7 @@
public:
typedef Index difference_type;
typedef typename XprType::Scalar value_type;
-#if EIGEN_CPLUSPLUS >= 202002L
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_concepts) && __cpp_lib_concepts >= 202002L
typedef std::conditional_t<XprType::InnerStrideAtCompileTime == 1, std::contiguous_iterator_tag,
std::random_access_iterator_tag>
iterator_category;
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index 3687bb2..7edcc60 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -27,7 +27,7 @@
template <int Alignment, typename Packet>
EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
- internal::pstoret<DstScalar, Packet, Alignment>(a, b);
+ pstoret<DstScalar, Packet, Alignment>(a, b);
}
};
@@ -36,7 +36,7 @@
struct assign_op<DstScalar, void> {};
template <typename DstScalar, typename SrcScalar>
-struct functor_traits<assign_op<DstScalar, SrcScalar> > {
+struct functor_traits<assign_op<DstScalar, SrcScalar>> {
enum {
Cost = NumTraits<DstScalar>::ReadCost,
PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::Vectorizable &&
@@ -45,88 +45,70 @@
};
/** \internal
- * \brief Template functor for scalar/packet assignment with addition
+ * \brief Template functor for scalar/packet compound assignment
*
*/
-template <typename DstScalar, typename SrcScalar>
-struct add_assign_op {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a += b; }
+template <typename DstScalar, typename SrcScalar, typename Func>
+struct compound_assign_op {
+ using traits = functor_traits<compound_assign_op<DstScalar, SrcScalar, Func>>;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(DstScalar& a, const SrcScalar& b) const {
+ assign_op<DstScalar, DstScalar>().assignCoeff(a, Func().operator()(a, b));
+ }
template <int Alignment, typename Packet>
EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
- internal::pstoret<DstScalar, Packet, Alignment>(a, internal::padd(internal::ploadt<Packet, Alignment>(a), b));
+ assign_op<DstScalar, DstScalar>().template assignPacket<Alignment, Packet>(
+ a, Func().packetOp(ploadt<Packet, Alignment>(a), b));
}
};
-template <typename DstScalar, typename SrcScalar>
-struct functor_traits<add_assign_op<DstScalar, SrcScalar> > {
+
+template <typename DstScalar, typename SrcScalar, typename Func>
+struct functor_traits<compound_assign_op<DstScalar, SrcScalar, Func>> {
enum {
- Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
- PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::HasAdd
+ Cost = int(functor_traits<assign_op<DstScalar, DstScalar>>::Cost) + int(functor_traits<Func>::Cost),
+ PacketAccess = functor_traits<assign_op<DstScalar, DstScalar>>::PacketAccess && functor_traits<Func>::PacketAccess
};
};
/** \internal
+ * \brief Template functor for scalar/packet assignment with addition
+ *
+ */
+template <typename DstScalar, typename SrcScalar = DstScalar>
+struct add_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_sum_op<DstScalar, SrcScalar>> {};
+
+template <typename DstScalar, typename SrcScalar>
+struct functor_traits<add_assign_op<DstScalar, SrcScalar>> : add_assign_op<DstScalar, SrcScalar>::traits {};
+
+/** \internal
* \brief Template functor for scalar/packet assignment with subtraction
*
*/
-template <typename DstScalar, typename SrcScalar>
-struct sub_assign_op {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a -= b; }
+template <typename DstScalar, typename SrcScalar = DstScalar>
+struct sub_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_difference_op<DstScalar, SrcScalar>> {};
- template <int Alignment, typename Packet>
- EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
- internal::pstoret<DstScalar, Packet, Alignment>(a, internal::psub(internal::ploadt<Packet, Alignment>(a), b));
- }
-};
template <typename DstScalar, typename SrcScalar>
-struct functor_traits<sub_assign_op<DstScalar, SrcScalar> > {
- enum {
- Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
- PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::HasSub
- };
-};
+struct functor_traits<sub_assign_op<DstScalar, SrcScalar>> : sub_assign_op<DstScalar, SrcScalar>::traits {};
/** \internal
* \brief Template functor for scalar/packet assignment with multiplication
*
*/
template <typename DstScalar, typename SrcScalar = DstScalar>
-struct mul_assign_op {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a *= b; }
+struct mul_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_product_op<DstScalar, SrcScalar>> {};
- template <int Alignment, typename Packet>
- EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
- internal::pstoret<DstScalar, Packet, Alignment>(a, internal::pmul(internal::ploadt<Packet, Alignment>(a), b));
- }
-};
template <typename DstScalar, typename SrcScalar>
-struct functor_traits<mul_assign_op<DstScalar, SrcScalar> > {
- enum {
- Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
- PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::HasMul
- };
-};
+struct functor_traits<mul_assign_op<DstScalar, SrcScalar>> : mul_assign_op<DstScalar, SrcScalar>::traits {};
/** \internal
- * \brief Template functor for scalar/packet assignment with diviving
+ * \brief Template functor for scalar/packet assignment with dividing
*
*/
template <typename DstScalar, typename SrcScalar = DstScalar>
-struct div_assign_op {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a /= b; }
+struct div_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_quotient_op<DstScalar, SrcScalar>> {};
- template <int Alignment, typename Packet>
- EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
- internal::pstoret<DstScalar, Packet, Alignment>(a, internal::pdiv(internal::ploadt<Packet, Alignment>(a), b));
- }
-};
template <typename DstScalar, typename SrcScalar>
-struct functor_traits<div_assign_op<DstScalar, SrcScalar> > {
- enum {
- Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
- PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::HasDiv
- };
-};
+struct functor_traits<div_assign_op<DstScalar, SrcScalar>> : div_assign_op<DstScalar, SrcScalar>::traits {};
/** \internal
* \brief Template functor for scalar/packet assignment with swapping
@@ -158,7 +140,7 @@
}
};
template <typename Scalar>
-struct functor_traits<swap_assign_op<Scalar> > {
+struct functor_traits<swap_assign_op<Scalar>> {
enum {
Cost = 3 * NumTraits<Scalar>::ReadCost,
PacketAccess =
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 97cf4f3..0facd26 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -1316,7 +1316,8 @@
* This wraps C++20's std::construct_at, using placement new instead if it is not available.
*/
-#if EIGEN_COMP_CXXVER >= 20
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_constexpr_dynamic_alloc) && \
+ __cpp_lib_constexpr_dynamic_alloc >= 201907L
using std::construct_at;
#else
template <class T, class... Args>
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 39a117e..40604f8 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -220,7 +220,7 @@
*
* Post C++17: Uses std::void_t
*/
-#if EIGEN_COMP_CXXVER >= 17
+#if EIGEN_COMP_CXXVER >= 17 && defined(__cpp_lib_void_t) && __cpp_lib_void_t >= 201411L
using std::void_t;
#else
template <typename...>
@@ -338,7 +338,16 @@
*
* For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function.
*/
-#if EIGEN_COMP_CXXVER < 20 || EIGEN_GNUC_STRICT_LESS_THAN(10, 0, 0)
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_ssize) && __cpp_lib_ssize >= 201902L
+
+template <typename T>
+EIGEN_CONSTEXPR auto index_list_size(T&& x) {
+ using std::ssize;
+ return ssize(std::forward<T>(x));
+}
+
+#else
+
template <typename T>
EIGEN_CONSTEXPR auto index_list_size(const T& x) {
using R = std::common_type_t<std::ptrdiff_t, std::make_signed_t<decltype(x.size())>>;
@@ -349,13 +358,7 @@
EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) {
return N;
}
-#else
-template <typename T>
-EIGEN_CONSTEXPR auto index_list_size(T&& x) {
- using std::ssize;
- return ssize(std::forward<T>(x));
-}
-#endif // EIGEN_COMP_CXXVER
+#endif
/** \internal
* Convenient struct to get the result type of a nullary, unary, binary, or
@@ -745,7 +748,7 @@
inline constexpr bool check_implication(bool a, bool b) { return !a || b; }
/// \internal Provide fallback for std::is_constant_evaluated for pre-C++20.
-#if EIGEN_COMP_CXXVER >= 20
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_is_constant_evaluated) && __cpp_lib_is_constant_evaluated >= 201811L
using std::is_constant_evaluated;
#else
constexpr bool is_constant_evaluated() { return false; }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 3a67ab1..813cc53 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -384,7 +384,7 @@
endif()
# boost MP unit test
-find_package(Boost 1.53.0)
+find_package(Boost 1.53.0 CONFIG)
if(Boost_FOUND)
include_directories(${Boost_INCLUDE_DIRS})
ei_add_test(boostmultiprec "" "${Boost_LIBRARIES}")
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 288d79f..99e7304 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -932,7 +932,7 @@
kernel(m, n, k, use_thread_local);
} else {
eigen_assert(!use_thread_local);
- device_.enqueueNoNotification([this, m, n, k, use_thread_local]() {
+ device_.enqueue([this, m, n, k, use_thread_local]() {
kernel(m, n, k, use_thread_local);
});
}
@@ -982,7 +982,7 @@
} else {
while (end - start > 1) {
Index mid = (start + end) / 2;
- device_.enqueueNoNotification([this, mid, end, k, rhs]() {
+ device_.enqueue([this, mid, end, k, rhs]() {
enqueue_packing_helper(mid, end, k, rhs);
});
end = mid;
@@ -1000,7 +1000,7 @@
(k > 0 || std::this_thread::get_id() == created_by_thread_id_);
if (pack_async) {
- device_.enqueueNoNotification([this, start, end, k, rhs]() {
+ device_.enqueue([this, start, end, k, rhs]() {
enqueue_packing_helper(start, end, k, rhs);
});
} else {
@@ -1264,7 +1264,7 @@
void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) {
while (end_block_idx - start_block_idx > 1) {
Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
- evaluator->m_device.enqueueNoNotification([this, &barrier, mid_block_idx, end_block_idx]() {
+ evaluator->m_device.enqueue([this, &barrier, mid_block_idx, end_block_idx]() {
eval<Alignment>(barrier, mid_block_idx, end_block_idx);
});
end_block_idx = mid_block_idx;
@@ -1282,7 +1282,7 @@
void evalAsync(Index start_block_idx, Index end_block_idx) {
while (end_block_idx - start_block_idx > 1) {
Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
- evaluator->m_device.enqueueNoNotification(
+ evaluator->m_device.enqueue(
[this, mid_block_idx, end_block_idx]() {
evalAsync<Alignment>(mid_block_idx, end_block_idx);
});
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index c95c8f2..3320990 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -15,35 +15,6 @@
namespace Eigen {
-// Runs an arbitrary function and then calls Notify() on the passed in
-// Notification.
-template <typename Function, typename... Args>
-struct FunctionWrapperWithNotification {
- static void run(Notification* n, Function f, Args... args) {
- f(args...);
- if (n) {
- n->Notify();
- }
- }
-};
-
-template <typename Function, typename... Args>
-struct FunctionWrapperWithBarrier {
- static void run(Barrier* b, Function f, Args... args) {
- f(args...);
- if (b) {
- b->Notify();
- }
- }
-};
-
-template <typename SyncType>
-static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
- if (n) {
- n->Wait();
- }
-}
-
// An abstract interface to a device specific memory allocator.
class Allocator {
public:
@@ -98,8 +69,9 @@
Barrier barrier(static_cast<int>(num_threads - 1));
// Launch the last 3 blocks on worker threads.
for (size_t i = 1; i < num_threads; ++i) {
- enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
+ pool_->Schedule([n, i, src_ptr, dst_ptr, blocksize, &barrier] {
::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize, numext::mini(blocksize, n - (i * blocksize)));
+ barrier.Notify();
});
}
// Launch the first block on the main thread.
@@ -140,24 +112,22 @@
return 1;
}
- template <class Function, class... Args>
- EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
- Notification* n = new Notification();
- pool_->Schedule(
- std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, std::forward<Function>(f), args...));
- return n;
- }
-
- template <class Function, class... Args>
- EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f, Args&&... args) const {
- pool_->Schedule(
- std::bind(&FunctionWrapperWithBarrier<Function, Args...>::run, b, std::forward<Function>(f), args...));
- }
-
+ // TODO(rmlarsen): Remove this deprecated interface when all users have been converted.
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
+ enqueue(std::forward<Function>(f), std::forward<Args>(args)...);
+ }
+
+ template <class Function, class... Args>
+ EIGEN_STRONG_INLINE void enqueue(Function&& f, Args&&... args) const {
+#if EIGEN_COMP_CXXVER >= 20
+ if constexpr (sizeof...(args) > 0) {
+ auto run_f = [f = std::forward<Function>(f), ... args = std::forward<Args>(args)]() { f(args...); };
+#else
if (sizeof...(args) > 0) {
- pool_->Schedule(std::bind(std::forward<Function>(f), args...));
+ auto run_f = [f = std::forward<Function>(f), &args...]() { f(args...); };
+#endif
+ pool_->Schedule(std::move(run_f));
} else {
pool_->Schedule(std::forward<Function>(f));
}
@@ -191,27 +161,14 @@
// Division code rounds mid to block_size, so we are guaranteed to get
// block_count leaves that do actual computations.
Barrier barrier(static_cast<unsigned int>(block.count));
- std::function<void(Index, Index)> handleRange;
- handleRange = [this, block, &handleRange, &barrier, &f](Index firstIdx, Index lastIdx) {
- while (lastIdx - firstIdx > block.size) {
- // Split into halves and schedule the second half on a different thread.
- const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
- pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
- lastIdx = midIdx;
- }
- // Single block or less, execute directly.
- f(firstIdx, lastIdx);
- barrier.Notify();
- };
-
if (block.count <= numThreads()) {
// Avoid a thread hop by running the root of the tree and one block on the
// main thread.
- handleRange(0, n);
+ handleRange(0, n, block.size, &barrier, pool_, f);
} else {
// Execute the root in the thread pool to avoid running work on more than
// numThreads() threads.
- pool_->Schedule([=, &handleRange]() { handleRange(0, n); });
+ pool_->Schedule([this, n, &block, &barrier, &f]() { handleRange(0, n, block.size, &barrier, pool_, f); });
}
barrier.Wait();
@@ -287,6 +244,19 @@
private:
typedef TensorCostModel<ThreadPoolDevice> CostModel;
+ static void handleRange(Index firstIdx, Index lastIdx, Index granularity, Barrier* barrier, ThreadPoolInterface* pool,
+ const std::function<void(Index, Index)>& f) {
+ while (lastIdx - firstIdx > granularity) {
+ // Split into halves and schedule the second half on a different thread.
+ const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, granularity) * granularity;
+ pool->Schedule([=, &f]() { handleRange(midIdx, lastIdx, granularity, barrier, pool, f); });
+ lastIdx = midIdx;
+ }
+ // Single block or less, execute directly.
+ f(firstIdx, lastIdx);
+ barrier->Notify();
+ }
+
// For parallelForAsync we must keep passed in closures on the heap, and
// delete them only after `done` callback finished.
struct ParallelForAsyncContext {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index b1da1a5..da33210 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -352,7 +352,7 @@
TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
evaluator.evalBlock(desc, scratch);
} else {
- device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, eval_block);
+ device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, std::move(eval_block));
}
}
evaluator.cleanup();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 62c54e3..9bbf945 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -357,17 +357,6 @@
};
#ifdef EIGEN_USE_THREADS
-// Multithreaded full reducers
-template <typename Self, typename Op,
- bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
-struct FullReducerShard {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
- typename Self::Index numValuesToReduce, Op& reducer,
- typename Self::CoeffReturnType* output) {
- *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, firstIndex, numValuesToReduce, reducer);
- }
-};
-
// Multithreaded full reducer
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
@@ -397,8 +386,11 @@
Barrier barrier(internal::convert_index<unsigned int>(numblocks));
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
for (Index i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run, self, i * blocksize,
- blocksize, reducer, &shards[i]);
+ auto run_shard = [i, blocksize, &self, &barrier, &shards, &reducer](){
+ shards[i] = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, i * blocksize, blocksize, reducer);
+ barrier.Notify();
+ };
+ device.enqueue(std::move(run_shard));
}
typename Self::CoeffReturnType finalShard;
if (numblocks * blocksize < num_coeffs) {
@@ -888,10 +880,6 @@
friend struct internal::InnerMostDimPreserver;
template <typename S, typename O, typename D, bool V>
friend struct internal::FullReducer;
-#ifdef EIGEN_USE_THREADS
- template <typename S, typename O, bool V>
- friend struct internal::FullReducerShard;
-#endif
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index ce712e2..cf69fef 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -448,8 +448,11 @@
eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
// Find the offset of the element wrt the location of the first element.
- const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
- (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};
+ Index first_entry = (indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth;
+ Index second_entry = PacketSize == 1 ? first_entry :
+ (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth;
+
+ const Index patchOffsets[2] = {first_entry, second_entry};
const Index patch3DIndex =
(NumDims == 5) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;