| // This file is part of Eigen, a lightweight C++ template library |
| // for linear algebra. |
| // |
| // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> |
| // |
| // This Source Code Form is subject to the terms of the Mozilla |
| // Public License v. 2.0. If a copy of the MPL was not distributed |
| // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| |
| #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H |
| #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H |
| |
| // IWYU pragma: private |
| #include "./InternalHeaderCheck.h" |
| |
| namespace Eigen { |
| |
| /** \class TensorConvolution |
| * \ingroup CXX11_Tensor_Module |
| * |
| * \brief Tensor convolution class. |
| * |
| * Computes the convolution of an input tensor with a kernel tensor along the |
| * dimensions listed in the Indices argument. Each convolved dimension of the |
| * result has size input_dim - kernel_dim + 1; the remaining dimensions are |
| * left unchanged. |
| */ |
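| // A minimal usage sketch (the user-facing entry point is TensorBase::convolve, |
| // which builds the TensorConvolutionOp evaluated by the code in this header): |
| // |
| //   #include <unsupported/Eigen/CXX11/Tensor> |
| //   Eigen::Tensor<float, 4> input(3, 10, 10, 7); |
| //   Eigen::Tensor<float, 2> kernel(3, 3); |
| //   Eigen::Tensor<float, 4> output(3, 8, 8, 7); |
| //   Eigen::array<ptrdiff_t, 2> dims({1, 2});  // convolve along dimensions 1 and 2 |
| //   output = input.convolve(kernel, dims);    // each convolved dim: 10 - 3 + 1 = 8 |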
| namespace internal { |
| |
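| // Helper used by the GPU convolution kernels below: it reorders the tensor |
| // dimensions so that the convolved (kernel) dimensions occupy the innermost |
| // positions (the front for ColMajor, the back for RowMajor) and precomputes |
| // strides both for the original tensors and for the reordered GPU view. A |
| // "plane" index enumerates the remaining, non-convolved dimensions. |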
| template <typename Index, typename InputDims, int NumKernelDims, int Layout> |
| class IndexMapper { |
| public: |
| IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims, |
| const array<Index, NumKernelDims>& indices) { |
| array<Index, NumDims> dimensions = input_dims; |
| for (int i = 0; i < NumKernelDims; ++i) { |
| const Index index = indices[i]; |
| const Index input_dim = input_dims[index]; |
| const Index kernel_dim = kernel_dims[i]; |
| const Index result_dim = input_dim - kernel_dim + 1; |
| dimensions[index] = result_dim; |
| } |
| |
| array<Index, NumDims> inputStrides; |
| array<Index, NumDims> outputStrides; |
| if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { |
| inputStrides[0] = 1; |
| outputStrides[0] = 1; |
| for (int i = 1; i < NumDims; ++i) { |
| inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1]; |
| outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1]; |
| } |
| } else { |
| inputStrides[NumDims - 1] = 1; |
| outputStrides[NumDims - 1] = 1; |
| for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) { |
| inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; |
| outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; |
| } |
| } |
| |
| array<Index, NumDims> gpuInputDimensions; |
| array<Index, NumDims> gpuOutputDimensions; |
| array<Index, NumDims> tmp = dimensions; |
| array<Index, NumDims> ordering; |
| const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims; |
| for (int i = 0; i < NumKernelDims; ++i) { |
| const Index index = i + offset; |
| ordering[index] = indices[i]; |
| tmp[indices[i]] = -1; |
| gpuInputDimensions[index] = input_dims[indices[i]]; |
| gpuOutputDimensions[index] = dimensions[indices[i]]; |
| } |
| |
| int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? NumKernelDims : 0; |
| for (int i = 0; i < NumDims; ++i) { |
| if (tmp[i] >= 0) { |
| ordering[written] = i; |
| gpuInputDimensions[written] = input_dims[i]; |
| gpuOutputDimensions[written] = dimensions[i]; |
| ++written; |
| } |
| } |
| |
| for (int i = 0; i < NumDims; ++i) { |
| m_inputStrides[i] = inputStrides[ordering[i]]; |
| m_outputStrides[i] = outputStrides[ordering[i]]; |
| } |
| |
| if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { |
| for (int i = 0; i < NumDims; ++i) { |
| if (i > NumKernelDims) { |
| m_gpuInputStrides[i] = m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1]; |
| m_gpuOutputStrides[i] = m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1]; |
| } else { |
| m_gpuInputStrides[i] = 1; |
| m_gpuOutputStrides[i] = 1; |
| } |
| } |
| } else { |
| for (int i = NumDims - 1; i >= 0; --i) { |
| if (static_cast<size_t>(i + 1) < offset) { |
| m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1]; |
| m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1]; |
| } else { |
| m_gpuInputStrides[i] = 1; |
| m_gpuOutputStrides[i] = 1; |
| } |
| } |
| } |
| } |
| |
| EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const { |
| Index inputIndex = 0; |
| if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { |
| for (int d = NumDims - 1; d > NumKernelDims; --d) { |
| const Index idx = p / m_gpuInputStrides[d]; |
| inputIndex += idx * m_inputStrides[d]; |
| p -= idx * m_gpuInputStrides[d]; |
| } |
| if (NumKernelDims < NumDims) { |
| inputIndex += p * m_inputStrides[NumKernelDims]; |
| } |
| } else { |
| std::ptrdiff_t limit = 0; |
| if (NumKernelDims < NumDims) { |
| limit = NumDims - NumKernelDims - 1; |
| } |
| for (int d = 0; d < limit; ++d) { |
| const Index idx = p / m_gpuInputStrides[d]; |
| inputIndex += idx * m_inputStrides[d]; |
| p -= idx * m_gpuInputStrides[d]; |
| } |
| inputIndex += p * m_inputStrides[limit]; |
| } |
| return inputIndex; |
| } |
| |
| EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const { |
| Index outputIndex = 0; |
| if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { |
| for (int d = NumDims - 1; d > NumKernelDims; --d) { |
| const Index idx = p / m_gpuOutputStrides[d]; |
| outputIndex += idx * m_outputStrides[d]; |
| p -= idx * m_gpuOutputStrides[d]; |
| } |
| if (NumKernelDims < NumDims) { |
| outputIndex += p * m_outputStrides[NumKernelDims]; |
| } |
| } else { |
| std::ptrdiff_t limit = 0; |
| if (NumKernelDims < NumDims) { |
| limit = NumDims - NumKernelDims - 1; |
| } |
| for (int d = 0; d < limit; ++d) { |
| const Index idx = p / m_gpuOutputStrides[d]; |
| outputIndex += idx * m_outputStrides[d]; |
| p -= idx * m_gpuOutputStrides[d]; |
| } |
| outputIndex += p * m_outputStrides[limit]; |
| } |
| return outputIndex; |
| } |
| |
| EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const { |
| const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims; |
| return i * m_inputStrides[offset]; |
| } |
| |
| EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const { |
| const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims; |
| return i * m_outputStrides[offset]; |
| } |
| |
| EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const { |
| const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims; |
| return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; |
| } |
| |
| EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const { |
| const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims; |
| return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; |
| } |
| |
| EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const { |
| const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims; |
| return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + k * m_inputStrides[offset + 2]; |
| } |
| |
| EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { |
| const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims; |
| return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + k * m_outputStrides[offset + 2]; |
| } |
| |
| private: |
| static constexpr int NumDims = internal::array_size<InputDims>::value; |
| array<Index, NumDims> m_inputStrides; |
| array<Index, NumDims> m_outputStrides; |
| array<Index, NumDims> m_gpuInputStrides; |
| array<Index, NumDims> m_gpuOutputStrides; |
| }; |
| |
| template <typename Dimensions, typename InputXprType, typename KernelXprType> |
| struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > { |
| // Type promotion to handle the case where the types of the lhs and the rhs are different. |
| typedef typename promote_storage_type<typename InputXprType::Scalar, typename KernelXprType::Scalar>::ret Scalar; |
| typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind, |
| typename traits<KernelXprType>::StorageKind>::ret StorageKind; |
| typedef typename promote_index_type<typename traits<InputXprType>::Index, typename traits<KernelXprType>::Index>::type |
| Index; |
| typedef typename InputXprType::Nested LhsNested; |
| typedef typename KernelXprType::Nested RhsNested; |
| typedef std::remove_reference_t<LhsNested> LhsNested_; |
| typedef std::remove_reference_t<RhsNested> RhsNested_; |
| static constexpr int NumDimensions = traits<InputXprType>::NumDimensions; |
| static constexpr int Layout = traits<InputXprType>::Layout; |
| typedef std::conditional_t<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val, |
| typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType> |
| PointerType; |
| |
| enum { Flags = 0 }; |
| }; |
| |
| template <typename Dimensions, typename InputXprType, typename KernelXprType> |
| struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> { |
| typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type; |
| }; |
| |
| template <typename Dimensions, typename InputXprType, typename KernelXprType> |
| struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, |
| typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> { |
| typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type; |
| }; |
| |
| } // end namespace internal |
| |
| template <typename Indices, typename InputXprType, typename KernelXprType> |
| class TensorConvolutionOp |
| : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> { |
| public: |
| typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar; |
| typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; |
| typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType, |
| typename KernelXprType::CoeffReturnType>::ret CoeffReturnType; |
| typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested; |
| typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind; |
| typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index; |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, |
| const Indices& dims) |
| : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; } |
| |
| /** \returns the nested expressions */ |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename InputXprType::Nested>& inputExpression() |
| const { |
| return m_input_xpr; |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename KernelXprType::Nested>& kernelExpression() |
| const { |
| return m_kernel_xpr; |
| } |
| |
| protected: |
| typename InputXprType::Nested m_input_xpr; |
| typename KernelXprType::Nested m_kernel_xpr; |
| const Indices m_indices; |
| }; |
| |
| template <typename Indices, typename InputArgType, typename KernelArgType, typename Device> |
| struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> { |
| typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; |
| |
| static constexpr int NumDims = |
| internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value; |
| static constexpr int NumKernelDims = internal::array_size<Indices>::value; |
| typedef typename XprType::Index Index; |
| typedef DSizes<Index, NumDims> Dimensions; |
| |
| typedef typename XprType::Scalar Scalar; |
| typedef typename XprType::CoeffReturnType CoeffReturnType; |
| typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; |
| static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size; |
| typedef StorageMemory<Scalar, Device> Storage; |
| typedef typename Storage::Type EvaluatorPointerType; |
| |
| static constexpr int Layout = TensorEvaluator<InputArgType, Device>::Layout; |
| enum { |
| IsAligned = |
| int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned), |
| PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & |
| int(TensorEvaluator<KernelArgType, Device>::PacketAccess), |
| BlockAccess = false, |
| PreferBlockAccess = false, |
| CoordAccess = false, // to be implemented |
| RawAccess = false |
| }; |
| |
| //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// |
| typedef internal::TensorBlockNotImplemented TensorBlock; |
| //===--------------------------------------------------------------------===// |
| |
| EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) |
| : m_inputImpl(op.inputExpression(), device), |
| m_kernelImpl(op.kernelExpression(), device), |
| m_kernelArg(op.kernelExpression()), |
| m_kernel(NULL), |
| m_local_kernel(false), |
| m_device(device) { |
| EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == |
| static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), |
| YOU_MADE_A_PROGRAMMING_MISTAKE); |
| |
| const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions(); |
| const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); |
| |
| if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { |
| m_inputStride[0] = 1; |
| for (int i = 1; i < NumDims; ++i) { |
| m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; |
| } |
| } else { |
| m_inputStride[NumDims - 1] = 1; |
| for (int i = NumDims - 2; i >= 0; --i) { |
| m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; |
| } |
| } |
| |
| m_dimensions = m_inputImpl.dimensions(); |
| if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { |
| for (int i = 0; i < NumKernelDims; ++i) { |
| const Index index = op.indices()[i]; |
| const Index input_dim = input_dims[index]; |
| const Index kernel_dim = kernel_dims[i]; |
| const Index result_dim = input_dim - kernel_dim + 1; |
| m_dimensions[index] = result_dim; |
| if (i > 0) { |
| m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; |
| } else { |
| m_kernelStride[0] = 1; |
| } |
| m_indexStride[i] = m_inputStride[index]; |
| } |
| |
| m_outputStride[0] = 1; |
| for (int i = 1; i < NumDims; ++i) { |
| m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; |
| } |
| } else { |
| for (int i = NumKernelDims - 1; i >= 0; --i) { |
| const Index index = op.indices()[i]; |
| const Index input_dim = input_dims[index]; |
| const Index kernel_dim = kernel_dims[i]; |
| const Index result_dim = input_dim - kernel_dim + 1; |
| m_dimensions[index] = result_dim; |
| if (i < NumKernelDims - 1) { |
| m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; |
| } else { |
| m_kernelStride[NumKernelDims - 1] = 1; |
| } |
| m_indexStride[i] = m_inputStride[index]; |
| } |
| |
| m_outputStride[NumDims - 1] = 1; |
| for (int i = NumDims - 2; i >= 0; --i) { |
| m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; |
| } |
| } |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { |
| m_inputImpl.evalSubExprsIfNeeded(NULL); |
| preloadKernel(); |
| return true; |
| } |
| EIGEN_STRONG_INLINE void cleanup() { |
| m_inputImpl.cleanup(); |
| if (m_local_kernel) { |
| m_device.deallocate((void*)m_kernel); |
| m_local_kernel = false; |
| } |
| m_kernel = NULL; |
| } |
| |
| void evalTo(typename XprType::Scalar* buffer) { |
| evalSubExprsIfNeeded(NULL); |
| for (Index i = 0; i < dimensions().TotalSize(); ++i) { |
| buffer[i] += coeff(i); |
| } |
| cleanup(); |
| } |
| |
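| // Computes a single output coefficient by convolving the input patch that |
| // starts at firstInput(index) with the kernel. |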
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { |
| CoeffReturnType result = CoeffReturnType(0); |
| convolve(firstInput(index), 0, NumKernelDims - 1, result); |
| return result; |
| } |
| |
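| // Vectorized evaluation: when the first and last coefficients of the packet |
| // map to contiguous input locations, a single convolvePacket() pass is used; |
| // otherwise each coefficient is convolved separately and the results are packed. |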
| template <int LoadMode> |
| EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const { |
| Index indices[2] = {index, index + PacketSize - 1}; |
| Index startInputs[2] = {0, 0}; |
| if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { |
| for (int i = NumDims - 1; i > 0; --i) { |
| const Index idx0 = indices[0] / m_outputStride[i]; |
| const Index idx1 = indices[1] / m_outputStride[i]; |
| startInputs[0] += idx0 * m_inputStride[i]; |
| startInputs[1] += idx1 * m_inputStride[i]; |
| indices[0] -= idx0 * m_outputStride[i]; |
| indices[1] -= idx1 * m_outputStride[i]; |
| } |
| } else { |
| for (int i = 0; i < NumDims - 1; ++i) { |
| const Index idx0 = indices[0] / m_outputStride[i]; |
| const Index idx1 = indices[1] / m_outputStride[i]; |
| startInputs[0] += idx0 * m_inputStride[i]; |
| startInputs[1] += idx1 * m_inputStride[i]; |
| indices[0] -= idx0 * m_outputStride[i]; |
| indices[1] -= idx1 * m_outputStride[i]; |
| } |
| } |
| startInputs[0] += indices[0]; |
| startInputs[1] += indices[1]; |
| |
| if (startInputs[1] - startInputs[0] == PacketSize - 1) { |
| PacketReturnType result = internal::pset1<PacketReturnType>(0); |
| convolvePacket(startInputs[0], 0, NumKernelDims - 1, result); |
| return result; |
| } else { |
| EIGEN_ALIGN_MAX Scalar data[PacketSize]; |
| data[0] = Scalar(0); |
| convolve(startInputs[0], 0, NumKernelDims - 1, data[0]); |
| for (int i = 1; i < PacketSize - 1; ++i) { |
| data[i] = Scalar(0); |
| convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]); |
| } |
| data[PacketSize - 1] = Scalar(0); |
| convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]); |
| return internal::pload<PacketReturnType>(data); |
| } |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { |
| const double kernel_size = m_kernelImpl.dimensions().TotalSize(); |
| // We ignore the use of fused multiply-add. |
| const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); |
| const double firstIndex_compute_cost = |
| NumDims * |
| (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>()); |
| return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + |
| kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + |
| TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); |
| } |
| |
| EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } |
| |
| private: |
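| // Maps an output coefficient index to the offset of the first input |
| // coefficient that contributes to it. |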
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { |
| Index startInput = 0; |
| if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { |
| for (int i = NumDims - 1; i > 0; --i) { |
| const Index idx = index / m_outputStride[i]; |
| startInput += idx * m_inputStride[i]; |
| index -= idx * m_outputStride[i]; |
| } |
| } else { |
| for (int i = 0; i < NumDims - 1; ++i) { |
| const Index idx = index / m_outputStride[i]; |
| startInput += idx * m_inputStride[i]; |
| index -= idx * m_outputStride[i]; |
| } |
| } |
| startInput += index; |
| return startInput; |
| } |
| |
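| // Recursively walks the kernel dimensions from DimIndex down to 0, |
| // accumulating input * kernel products into accum at the innermost level. |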
| EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { |
| for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { |
| const Index input = firstIndex + j * m_indexStride[DimIndex]; |
| const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; |
| if (DimIndex > 0) { |
| convolve(input, kernel, DimIndex - 1, accum); |
| } else { |
| accum += m_inputImpl.coeff(input) * m_kernel[kernel]; |
| } |
| } |
| } |
| |
| template <typename Packet> |
| EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { |
| for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { |
| const Index input = firstIndex + j * m_indexStride[DimIndex]; |
| const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; |
| if (DimIndex > 0) { |
| convolvePacket(input, kernel, DimIndex - 1, accum); |
| } else { |
| accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), |
| internal::pset1<Packet>(m_kernel[kernel]), accum); |
| } |
| } |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { |
| // Don't make a local copy of the kernel unless we have to (i.e. it's an |
| // expression that needs to be evaluated) |
| const Scalar* in_place = m_kernelImpl.data(); |
| if (in_place) { |
| m_kernel = in_place; |
| m_local_kernel = false; |
| } else { |
| size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); |
| Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz); |
| typedef TensorEvalToOp<const KernelArgType> EvalTo; |
| EvalTo evalToTmp(local, m_kernelArg); |
| const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value; |
| internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device); |
| |
| m_kernel = local; |
| m_local_kernel = true; |
| } |
| } |
| |
| array<Index, NumDims> m_inputStride; |
| array<Index, NumDims> m_outputStride; |
| |
| array<Index, NumKernelDims> m_indexStride; |
| array<Index, NumKernelDims> m_kernelStride; |
| TensorEvaluator<InputArgType, Device> m_inputImpl; |
| TensorEvaluator<KernelArgType, Device> m_kernelImpl; |
| Dimensions m_dimensions; |
| |
| KernelArgType m_kernelArg; |
| const Scalar* m_kernel; |
| bool m_local_kernel; |
| const Device EIGEN_DEVICE_REF m_device; |
| }; |
| |
| // Use an optimized implementation of the evaluation code for GPUs whenever possible. |
| #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) |
| |
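| // Returns the kernel size along one dimension: the compile-time constant when |
| // it is known statically (so the loops below can be fully unrolled), or the |
| // runtime value when the size is Dynamic. |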
| template <int StaticKernelSize> |
| struct GetKernelSize { |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int /*kernelSize*/) const { return StaticKernelSize; } |
| }; |
| template <> |
| struct GetKernelSize<Dynamic> { |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int kernelSize) const { return kernelSize; } |
| }; |
| |
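| // 1D GPU convolution kernel: each block loads a strip of the input (plus a |
| // kernelSize - 1 halo) into shared memory, then each thread computes output |
| // coefficients as dot products of that strip with the kernel. |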
| template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize> |
| __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D( |
| InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper, |
| const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, |
| float* buffer) { |
| #if defined(EIGEN_HIPCC) |
| HIP_DYNAMIC_SHARED(float, s) |
| #else |
| extern __shared__ float s[]; |
| #endif |
| |
| const int first_x = blockIdx.x * maxX; |
| const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; |
| const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize); |
| const int num_x_output = last_x - first_x + 1; |
| |
| const int first_plane = blockIdx.y * blockDim.y; |
| const int plane_stride = blockDim.y * gridDim.y; |
| |
| for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { |
| // Load inputs to shared memory |
| const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); |
| const int plane_kernel_offset = threadIdx.y * num_x_input; |
| #pragma unroll |
| for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { |
| const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x); |
| s[i + plane_kernel_offset] = eval.coeff(tensor_index); |
| } |
| |
| __syncthreads(); |
| |
| // Compute the convolution |
| const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); |
| |
| #pragma unroll |
| for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { |
| const int kernel_offset = plane_kernel_offset + i; |
| float result = 0.0f; |
| #pragma unroll |
| for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) { |
| result += s[k + kernel_offset] * kernel[k]; |
| } |
| const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x); |
| buffer[tensor_index] = result; |
| } |
| __syncthreads(); |
| } |
| } |
| |
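| // 2D GPU convolution kernel: same strategy as the 1D case, using a 2D |
| // shared-memory tile of num_x_input x num_y_input input coefficients per plane. |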
| template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY> |
| __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D( |
| InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper, |
| const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, |
| const int kernelSizeX, const int kernelSizeY, float* buffer) { |
| #if defined(EIGEN_HIPCC) |
| HIP_DYNAMIC_SHARED(float, s) |
| #else |
| extern __shared__ float s[]; |
| #endif |
| |
| const int first_x = blockIdx.x * maxX; |
| const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; |
| const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX); |
| const int num_x_output = last_x - first_x + 1; |
| |
| const int first_y = blockIdx.y * maxY; |
| const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; |
| const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY); |
| const int num_y_output = last_y - first_y + 1; |
| |
| const int first_plane = blockIdx.z * blockDim.z; |
| const int plane_stride = blockDim.z * gridDim.z; |
| |
| for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { |
| const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); |
| const int plane_kernel_offset = threadIdx.z * num_y_input; |
| |
| // Load inputs to shared memory |
| #pragma unroll |
| for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { |
| const int input_offset = num_x_input * (j + plane_kernel_offset); |
| #pragma unroll |
| for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { |
| const int tensor_index = |
| plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y); |
| s[i + input_offset] = eval.coeff(tensor_index); |
| } |
| } |
| |
| __syncthreads(); |
| |
| // Convolution |
| const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); |
| |
| #pragma unroll |
| for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { |
| #pragma unroll |
| for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { |
| float result = 0.0f; |
| #pragma unroll |
| for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) { |
| const int kernel_offset = kernelSizeX * l; |
| const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); |
| #pragma unroll |
| for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) { |
| result += s[k + input_offset] * kernel[k + kernel_offset]; |
| } |
| } |
| const int tensor_index = |
| plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y); |
| buffer[tensor_index] = result; |
| } |
| } |
| |
| __syncthreads(); |
| } |
| } |
| |
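| // 3D GPU convolution kernel: loads a 3D shared-memory tile per plane; planes |
| // are processed sequentially because the z block dimension is used for the |
| // third convolved dimension. |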
| template <typename InputEvaluator, typename Index, typename InputDims> |
| __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D( |
| InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper, |
| const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, |
| const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, |
| const size_t kernelSizeZ, float* buffer) { |
| #if defined(EIGEN_HIPCC) |
| HIP_DYNAMIC_SHARED(float, s) |
| #else |
| extern __shared__ float s[]; |
| #endif |
| |
| // Load inputs to shared memory |
| const int first_x = blockIdx.x * maxX; |
| const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; |
| const int num_x_input = last_x - first_x + kernelSizeX; |
| |
| const int first_y = blockIdx.y * maxY; |
| const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; |
| const int num_y_input = last_y - first_y + kernelSizeY; |
| |
| const int first_z = blockIdx.z * maxZ; |
| const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1; |
| const int num_z_input = last_z - first_z + kernelSizeZ; |
| |
| for (int p = 0; p < numPlanes; ++p) { |
| const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); |
| const int plane_kernel_offset = 0; |
| |
| for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { |
| for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { |
| for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { |
| const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( |
| i + first_x, j + first_y, k + first_z); |
| s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); |
| } |
| } |
| } |
| |
| __syncthreads(); |
| |
| // Convolution |
| const int num_z_output = last_z - first_z + 1; |
| const int num_y_output = last_y - first_y + 1; |
| const int num_x_output = last_x - first_x + 1; |
| const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); |
| |
| for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { |
| for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { |
| for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { |
| float result = 0.0f; |
| for (int n = 0; n < kernelSizeZ; ++n) { |
| for (int m = 0; m < kernelSizeY; ++m) { |
| for (int l = 0; l < kernelSizeX; ++l) { |
| result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * |
| kernel[l + kernelSizeX * (m + kernelSizeY * n)]; |
| } |
| } |
| } |
| const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset( |
| i + first_x, j + first_y, k + first_z); |
| buffer[tensor_index] = result; |
| } |
| } |
| } |
| __syncthreads(); |
| } |
| } |
| |
| template <typename Indices, typename InputArgType, typename KernelArgType> |
| struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> { |
| typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; |
| |
| static constexpr int NumDims = |
| internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value; |
| static constexpr int NumKernelDims = internal::array_size<Indices>::value; |
| typedef typename XprType::Index Index; |
| typedef DSizes<Index, NumDims> Dimensions; |
| typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions; |
| |
| static constexpr int Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout; |
| enum { |
| IsAligned = |
| TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned, |
| PacketAccess = false, |
| BlockAccess = false, |
| PreferBlockAccess = false, |
| CoordAccess = false, // to be implemented |
| RawAccess = false |
| }; |
| |
| //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// |
| typedef internal::TensorBlockNotImplemented TensorBlock; |
| //===--------------------------------------------------------------------===// |
| |
| TensorEvaluator(const XprType& op, const GpuDevice& device) |
| : m_inputImpl(op.inputExpression(), device), |
| m_kernelImpl(op.kernelExpression(), device), |
| m_kernelArg(op.kernelExpression()), |
| m_indices(op.indices()), |
| m_buf(NULL), |
| m_kernel(NULL), |
| m_local_kernel(false), |
| m_device(device) { |
| EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == |
| static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), |
| YOU_MADE_A_PROGRAMMING_MISTAKE); |
| |
| const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions(); |
| const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); |
| |
| m_dimensions = m_inputImpl.dimensions(); |
| for (int i = 0; i < NumKernelDims; ++i) { |
| const Index index = op.indices()[i]; |
| const Index input_dim = input_dims[index]; |
| const Index kernel_dim = kernel_dims[i]; |
| const Index result_dim = input_dim - kernel_dim + 1; |
| m_dimensions[index] = result_dim; |
| } |
| } |
| |
| typedef typename XprType::CoeffReturnType CoeffReturnType; |
| typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType; |
| typedef typename InputArgType::Scalar Scalar; |
| static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size; |
| |
| EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } |
| |
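| // When a destination buffer is provided the convolution is written directly |
| // into it and false is returned; otherwise a temporary buffer is allocated and |
| // true is returned so that coeff()/packet() read from it. |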
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { |
| preloadKernel(); |
| m_inputImpl.evalSubExprsIfNeeded(NULL); |
| if (data) { |
| executeEval(data); |
| return false; |
| } else { |
| m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); |
| executeEval(m_buf); |
| return true; |
| } |
| } |
| |
| EIGEN_STRONG_INLINE void cleanup() { |
| m_inputImpl.cleanup(); |
| if (m_buf) { |
| m_device.deallocate(m_buf); |
| m_buf = NULL; |
| } |
| if (m_local_kernel) { |
| m_device.deallocate((void*)m_kernel); |
| m_local_kernel = false; |
| } |
| m_kernel = NULL; |
| } |
| |
| EIGEN_STRONG_INLINE void preloadKernel() { |
| // Don't make a local copy of the kernel unless we have to (i.e. it's an |
| // expression that needs to be evaluated) |
| const Scalar* in_place = m_kernelImpl.data(); |
| if (in_place) { |
| m_kernel = in_place; |
| m_local_kernel = false; |
| } else { |
| size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); |
| Scalar* local = (Scalar*)m_device.allocate(kernel_sz); |
| typedef TensorEvalToOp<const KernelArgType> EvalTo; |
| EvalTo evalToTmp(local, m_kernelArg); |
| const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value; |
| internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device); |
| |
| m_kernel = local; |
| m_local_kernel = true; |
| } |
| } |
| |
| static unsigned int ceil(unsigned int num, unsigned int denom) { |
| const unsigned int rounded_toward_zero = num / denom; |
| if (num > rounded_toward_zero * denom) { |
| return rounded_toward_zero + 1; |
| } |
| return rounded_toward_zero; |
| } |
| |
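| // Launches the 1D/2D/3D GPU kernel matching NumKernelDims. Block and tile |
| // sizes are chosen so that one shared-memory tile (input patch plus kernel |
| // halo) fits within the device's shared-memory budget. |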
| void executeEval(Scalar* data) const { |
| typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims; |
| |
| const int maxSharedMem = m_device.sharedMemPerBlock(); |
| const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock(); |
| const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock; |
| const int numMultiProcessors = m_device.getNumGpuMultiProcessors(); |
| const int warpSize = 32; |
| |
| switch (NumKernelDims) { |
| case 1: { |
| const int kernel_size = m_kernelImpl.dimensions().TotalSize(); |
| |
| const int numX = dimensions()[m_indices[0]]; |
| const int numP = dimensions().TotalSize() / numX; |
| int maxX; |
| dim3 block_size; |
| |
| const int single_stride_dim = |
| static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : m_inputImpl.dimensions().rank() - 1; |
| if (m_indices[0] == single_stride_dim) { |
| // Maximize the reuse |
| const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; |
| maxX = numext::mini<int>(inner_dim, numX); |
| const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); |
| block_size.x = numext::mini(maxThreadsPerBlock, maxX); |
| block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP); |
| } else { |
| // Read as much as possible along the innermost dimension, that is the plane |
| const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); |
| const int maxP = numext::mini<int>(inner_dim, numP); |
| maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); |
| |
| block_size.x = numext::mini(warpSize, maxX); |
| block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP); |
| } |
| |
| const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); |
| gpu_assert(shared_mem <= maxSharedMem); |
| |
| const int num_x_blocks = ceil(numX, maxX); |
| const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); |
| const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); |
| |
| dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y))); |
| |
| // cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " |
| // num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " |
| // << shared_mem << " in stream " << m_device.stream() << endl; |
| |
| const array<Index, 1> indices{m_indices[0]}; |
| const array<Index, 1> kernel_dims{m_kernelImpl.dimensions()[0]}; |
| internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); |
| switch (kernel_size) { |
| case 4: { |
| LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), |
| num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, |
| numX, maxX, 4, data); |
| break; |
| } |
| case 7: { |
| LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), |
| num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, |
| numX, maxX, 7, data); |
| break; |
| } |
| default: { |
| LAUNCH_GPU_KERNEL( |
| (EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), |
| num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, |
| kernel_size, data); |
| } |
| } |
| break; |
| } |
| |
| case 2: { |
| const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1; |
| const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0; |
| const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; |
| const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; |
| |
| const int numX = dimensions()[m_indices[idxX]]; |
| const int numY = dimensions()[m_indices[idxY]]; |
| const int numP = dimensions().TotalSize() / (numX * numY); |
| |
| const float scaling_factor = |
| sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); |
| |
| // Snap maxX to warp size |
| int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; |
| const int maxX = numext::mini<int>(inner_dim, numX); |
| const int maxY = |
| numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); |
| const int maxP = numext::mini<int>( |
| maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); |
| |
| dim3 block_size; |
| block_size.x = numext::mini(1024, maxX); |
| block_size.y = numext::mini<int>(1024 / block_size.x, maxY); |
| block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP); |
| |
| const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); |
| gpu_assert(shared_mem <= maxSharedMem); |
| |
| const int num_x_blocks = ceil(numX, maxX); |
| const int num_y_blocks = ceil(numY, maxY); |
| const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); |
| const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); |
| |
| dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z))); |
| |
| // cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " |
| // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << |
| // " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " |
| // shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; |
| |
| const array<Index, 2> indices{m_indices[idxX], m_indices[idxY]}; |
| const array<Index, 2> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}; |
| internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); |
| switch (kernel_size_x) { |
| case 4: { |
| switch (kernel_size_y) { |
| case 7: { |
| LAUNCH_GPU_KERNEL( |
| (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), |
| num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, |
| numY, maxY, 4, 7, data); |
| break; |
| } |
| default: { |
| LAUNCH_GPU_KERNEL( |
| (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), |
| num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, |
| numY, maxY, 4, kernel_size_y, data); |
| break; |
| } |
| } |
| break; |
| } |
| case 7: { |
| switch (kernel_size_y) { |
| case 4: { |
| LAUNCH_GPU_KERNEL( |
| (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), |
| num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, |
| numY, maxY, 7, 4, data); |
| break; |
| } |
| default: { |
| LAUNCH_GPU_KERNEL( |
| (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), |
| num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, |
| numY, maxY, 7, kernel_size_y, data); |
| break; |
| } |
| } |
| break; |
| } |
| default: { |
| LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, |
| Dynamic, Dynamic>), |
| num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, |
| numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); |
| break; |
| } |
| } |
| break; |
| } |
| |
| case 3: { |
| const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2; |
| const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1; |
| const int idxZ = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0; |
| |
| const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; |
| const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; |
| const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; |
| |
| const int numX = dimensions()[m_indices[idxX]]; |
| const int numY = dimensions()[m_indices[idxY]]; |
| const int numZ = dimensions()[m_indices[idxZ]]; |
| const int numP = dimensions().TotalSize() / (numX * numY * numZ); |
| |
| const int maxX = numext::mini<int>( |
| 128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, |
| numX)); |
| const int maxY = numext::mini<int>( |
| 128, numext::mini<int>( |
| maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, |
| numY)); |
| const int maxZ = numext::mini<int>( |
| 128, numext::mini<int>( |
| maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - |
| kernel_size_z + 1, |
| numZ)); |
| |
| dim3 block_size; |
| block_size.x = numext::mini(32, maxX); |
| block_size.y = numext::mini(32, maxY); |
| block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ); |
| dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); |
| |
| const int shared_mem = |
| (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); |
| gpu_assert(shared_mem <= maxSharedMem); |
| |
| // cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " |
| // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << |
| // " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << |
| // endl; |
| const array<Index, 3> indices{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}; |
| const array<Index, 3> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY], |
| m_kernelImpl.dimensions()[idxZ]}; |
| internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); |
| |
| LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), |
| num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, |
| maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); |
| break; |
| } |
| |
| default: { |
| EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), |
| THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); |
| } |
| } |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { |
| eigen_assert(m_buf); |
| eigen_assert(index < m_dimensions.TotalSize()); |
| return m_buf[index]; |
| } |
| |
| template <int LoadMode> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const { |
| eigen_assert(m_buf); |
| eigen_assert(index < m_dimensions.TotalSize()); |
| return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index); |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { |
| // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost |
| // model. |
| const double kernel_size = m_kernelImpl.dimensions().TotalSize(); |
| // We ignore the use of fused multiply-add. |
| const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); |
| const double firstIndex_compute_cost = |
| NumDims * |
| (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>()); |
| return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + |
| kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + |
| TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); |
| } |
| |
| private: |
| TensorEvaluator<InputArgType, GpuDevice> m_inputImpl; |
| TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl; |
| KernelArgType m_kernelArg; |
| Indices m_indices; |
| Dimensions m_dimensions; |
| Scalar* m_buf; |
| const Scalar* m_kernel; |
| bool m_local_kernel; |
| |
| const GpuDevice& m_device; |
| }; |
| #endif |
| |
| } // end namespace Eigen |
| |
| #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H |