| /* |
| * This Source Code Form is subject to the terms of the Mozilla Public |
| * License, v. 2.0. If a copy of the MPL was not distributed with this |
| * file, You can obtain one at https://mozilla.org/MPL/2.0/. |
| * |
| * Assign_AOCL.h - AOCL Vectorized Math Dispatch Layer for Eigen |
| * |
| * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. |
| * |
| * Description: |
| * ------------ |
| * This file implements a high-performance dispatch layer that automatically |
| * routes Eigen's element-wise mathematical operations to AMD Optimizing CPU |
| * Libraries (AOCL) Vector Math Library (VML) functions when beneficial for |
| * performance. |
| * |
| * The dispatch system uses C++ template specialization to intercept Eigen's |
| * assignment operations and redirect them to AOCL's VRDA functions, which |
| * provide optimized implementations for AMD Zen architectures. |
| * |
| * Key Features: |
| * ------------- |
| * 1. Automatic Dispatch: Seamlessly routes supported operations to AOCL without |
| * requiring code changes in user applications |
| * |
| * 2. Performance Optimization: Uses AOCL VRDA functions optimized for Zen |
| * family processors with automatic SIMD instruction selection (AVX2, AVX-512) |
| * |
| * 3. Threshold-Based Activation: Only activates for vectors larger than |
| * EIGEN_AOCL_VML_THRESHOLD (default: 128 elements) to avoid overhead on |
| * small vectors |
| * |
| * 4. Precision-Specific Handling: |
| * - Double precision: AOCL VRDA vectorized functions |
| * - Single precision: Scalar fallback (preserves correctness) |
| * |
| * 5. Memory Layout Compatibility: Ensures direct memory access and compatible |
| * storage orders between source and destination for optimal performance |
| * |
| * Supported Operations: |
| * --------------------- |
| * UNARY OPERATIONS (vector → vector): |
| * - Transcendental: exp(), sin(), cos(), sqrt(), log(), log10(), log2() |
| * |
| * BINARY OPERATIONS (vector op vector → vector): |
| * - Arithmetic: +, *, pow() |
| * |
| * Template Specialization Mechanism: |
| * ----------------------------------- |
| * The system works by specializing Eigen's Assignment template for: |
| * 1. CwiseUnaryOp with scalar_*_op functors (unary operations) |
| * 2. CwiseBinaryOp with scalar_*_op functors (binary operations) |
| * 3. Dense2Dense assignment context with AOCL-compatible traits |
| * |
| * Dispatch conditions (all must be true): |
| * - Source and destination have DirectAccessBit (contiguous memory) |
| * - Compatible storage orders (both row-major or both column-major) |
| * - Vector size ≥ EIGEN_AOCL_VML_THRESHOLD or Dynamic size |
| * - Supported data type (currently double precision for VRDA) |
| * |
| * Integration Example: |
| * -------------------- |
| * // Standard Eigen code - no changes required |
| * VectorXd x = VectorXd::Random(10000); |
| * VectorXd y = VectorXd::Random(10000); |
| * VectorXd result; |
| * |
| * // These operations are automatically dispatched to AOCL: |
| * result = x.array().exp(); // → amd_vrda_exp() |
| * result = x.array().sin(); // → amd_vrda_sin() |
| * result = x.array() + y.array(); // → amd_vrda_add() |
| * result = x.array().pow(y.array()); // → amd_vrda_pow() |
| * |
| * Configuration: |
| * -------------- |
| * Required preprocessor definitions: |
| * - EIGEN_USE_AOCL_ALL or EIGEN_USE_AOCL_MT: Enable AOCL integration |
| * - EIGEN_USE_AOCL_VML: Enable Vector Math Library dispatch |
| * |
| * Compilation Requirements: |
| * ------------------------- |
| * Include paths: |
| * - AOCL headers: -I${AOCL_ROOT}/include |
| * - Eigen headers: -I/path/to/eigen |
| * |
| * Link libraries: |
| * - AOCL MathLib: -lamdlibm |
| * - Standard math: -lm |
| * |
| * Compiler flags: |
| * - Optimization: -O3 (required for inlining) |
| * - Architecture: -march=znver5 or -march=native |
| * - Vectorization: -mfma -mavx512f (if supported) |
| * |
| * Platform Support: |
| * ------------------ |
| * - Primary: Linux x86_64 with AMD Zen family processors |
| * - Compilers: GCC 8+, Clang 10+, AOCC (recommended) |
| * - AOCL Version: 4.0+ (with VRDA support) |
| * |
| * Error Handling: |
| * --------------- |
| * - Graceful fallback to scalar operations for unsupported configurations |
| * - Compile-time detection of AOCL availability |
| * - Runtime size and alignment validation with eigen_assert() |
| * |
| * Developer: |
| * ---------- |
| * Name: Sharad Saurabh Bhaskar |
| * Email: shbhaska@amd.com |
| * Organization: Advanced Micro Devices, Inc. |
| */ |
| |
| |
| #ifndef EIGEN_ASSIGN_AOCL_H |
| #define EIGEN_ASSIGN_AOCL_H |
| |
| namespace Eigen { |
| namespace internal { |
| |
| // Traits for unary operations. |
| template <typename Dst, typename Src> class aocl_assign_traits { |
| private: |
| enum { |
| DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit), |
| SrcHasDirectAccess = !!(Src::Flags & DirectAccessBit), |
| StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)), |
| InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime) |
| : (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime) |
| : int(Dst::RowsAtCompileTime), |
| LargeEnough = |
| (InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD) |
| }; |
| |
| public: |
| enum { |
| EnableAoclVML = DstHasDirectAccess && SrcHasDirectAccess && |
| StorageOrdersAgree && LargeEnough, |
| Traversal = LinearTraversal |
| }; |
| }; |
| |
| // Traits for binary operations (e.g., add, pow). |
| template <typename Dst, typename Lhs, typename Rhs> |
| class aocl_assign_binary_traits { |
| private: |
| enum { |
| DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit), |
| LhsHasDirectAccess = !!(Lhs::Flags & DirectAccessBit), |
| RhsHasDirectAccess = !!(Rhs::Flags & DirectAccessBit), |
| StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Lhs::IsRowMajor)) && |
| (int(Dst::IsRowMajor) == int(Rhs::IsRowMajor)), |
| InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime) |
| : (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime) |
| : int(Dst::RowsAtCompileTime), |
| LargeEnough = |
| (InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD) |
| }; |
| |
| public: |
| enum { |
| EnableAoclVML = DstHasDirectAccess && LhsHasDirectAccess && |
| RhsHasDirectAccess && StorageOrdersAgree && LargeEnough |
| }; |
| }; |
| |
| // Unary operation dispatch for float (scalar fallback). |
| #define EIGEN_AOCL_VML_UNARY_CALL_FLOAT(EIGENOP) \ |
| template <typename DstXprType, typename SrcXprNested> \ |
| struct Assignment< \ |
| DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested>, \ |
| assign_op<float, float>, Dense2Dense, \ |
| std::enable_if_t< \ |
| aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> { \ |
| typedef CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested> \ |
| SrcXprType; \ |
| static void run(DstXprType &dst, const SrcXprType &src, \ |
| const assign_op<float, float> &) { \ |
| eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ |
| Eigen::Index n = dst.size(); \ |
| if (n <= 0) \ |
| return; \ |
| const float *input = \ |
| reinterpret_cast<const float *>(src.nestedExpression().data()); \ |
| float *output = reinterpret_cast<float *>(dst.data()); \ |
| for (Eigen::Index i = 0; i < n; ++i) { \ |
| output[i] = std::EIGENOP(input[i]); \ |
| } \ |
| } \ |
| }; |
| |
| // Unary operation dispatch for double (AOCL vectorized). |
| #define EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(EIGENOP, AOCLOP) \ |
| template <typename DstXprType, typename SrcXprNested> \ |
| struct Assignment< \ |
| DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested>, \ |
| assign_op<double, double>, Dense2Dense, \ |
| std::enable_if_t< \ |
| aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> { \ |
| typedef CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested> \ |
| SrcXprType; \ |
| static void run(DstXprType &dst, const SrcXprType &src, \ |
| const assign_op<double, double> &) { \ |
| eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ |
| Eigen::Index n = dst.size(); \ |
| eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \ |
| if (n <= 0) \ |
| return; \ |
| const double *input = \ |
| reinterpret_cast<const double *>(src.nestedExpression().data()); \ |
| double *output = reinterpret_cast<double *>(dst.data()); \ |
| int aocl_n = internal::convert_index<int>(n); \ |
| AOCLOP(aocl_n, const_cast<double *>(input), output); \ |
| } \ |
| }; |
| |
| // Instantiate unary calls for float (scalar). |
| // EIGEN_AOCL_VML_UNARY_CALL_FLOAT(exp) |
| |
| // Instantiate unary calls for double (AOCL vectorized). |
| EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp2, amd_vrda_exp2) |
| EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp, amd_vrda_exp) |
| EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sin, amd_vrda_sin) |
| EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cos, amd_vrda_cos) |
| EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sqrt, amd_vrda_sqrt) |
| EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cbrt, amd_vrda_cbrt) |
| EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(abs, amd_vrda_fabs) |
| EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log, amd_vrda_log) |
| EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log10, amd_vrda_log10) |
| EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log2, amd_vrda_log2) |
| |
| // Binary operation dispatch for float (scalar fallback). |
| #define EIGEN_AOCL_VML_BINARY_CALL_FLOAT(EIGENOP, STDFUNC) \ |
| template <typename DstXprType, typename LhsXprNested, typename RhsXprNested> \ |
| struct Assignment< \ |
| DstXprType, \ |
| CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested, \ |
| RhsXprNested>, \ |
| assign_op<float, float>, Dense2Dense, \ |
| std::enable_if_t<aocl_assign_binary_traits< \ |
| DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> { \ |
| typedef CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested, \ |
| RhsXprNested> \ |
| SrcXprType; \ |
| static void run(DstXprType &dst, const SrcXprType &src, \ |
| const assign_op<float, float> &) { \ |
| eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ |
| Eigen::Index n = dst.size(); \ |
| if (n <= 0) \ |
| return; \ |
| const float *lhs = reinterpret_cast<const float *>(src.lhs().data()); \ |
| const float *rhs = reinterpret_cast<const float *>(src.rhs().data()); \ |
| float *output = reinterpret_cast<float *>(dst.data()); \ |
| for (Eigen::Index i = 0; i < n; ++i) { \ |
| output[i] = STDFUNC(lhs[i], rhs[i]); \ |
| } \ |
| } \ |
| }; |
| |
| // Binary operation dispatch for double (AOCL vectorized). |
| #define EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(EIGENOP, AOCLOP) \ |
| template <typename DstXprType, typename LhsXprNested, typename RhsXprNested> \ |
| struct Assignment< \ |
| DstXprType, \ |
| CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested, \ |
| RhsXprNested>, \ |
| assign_op<double, double>, Dense2Dense, \ |
| std::enable_if_t<aocl_assign_binary_traits< \ |
| DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> { \ |
| typedef CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested, \ |
| RhsXprNested> \ |
| SrcXprType; \ |
| static void run(DstXprType &dst, const SrcXprType &src, \ |
| const assign_op<double, double> &) { \ |
| eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ |
| Eigen::Index n = dst.size(); \ |
| eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \ |
| if (n <= 0) \ |
| return; \ |
| const double *lhs = reinterpret_cast<const double *>(src.lhs().data()); \ |
| const double *rhs = reinterpret_cast<const double *>(src.rhs().data()); \ |
| double *output = reinterpret_cast<double *>(dst.data()); \ |
| int aocl_n = internal::convert_index<int>(n); \ |
| AOCLOP(aocl_n, const_cast<double *>(lhs), const_cast<double *>(rhs), output); \ |
| } \ |
| }; |
| |
| // Instantiate binary calls for float (scalar). |
| // EIGEN_AOCL_VML_BINARY_CALL_FLOAT(sum, std::plus<float>) // Using |
| // scalar_sum_op for addition EIGEN_AOCL_VML_BINARY_CALL_FLOAT(pow, std::pow) |
| |
| // Instantiate binary calls for double (AOCL vectorized). |
| EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(sum, amd_vrda_add) // Using scalar_sum_op for addition |
| EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(pow, amd_vrda_pow) |
| EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(max, amd_vrda_fmax) |
| EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(min, amd_vrda_fmin) |
| |
| } // namespace internal |
| } // namespace Eigen |
| |
| #endif // EIGEN_ASSIGN_AOCL_H |