Update Eigen to commit:75bcd155c40cb48e647c87c3f29052360255bc9e
CHANGELOG
=========
75bcd155c - Vectorize tan(x)
01a919d13 - Fix AOCL cmake issues.
a73501cc7 - Added versioning for shared libraries.
db90c4939 - Add a ptanh_float implementation that is accurate to 1 ULP
48eb5227c - Add BLAS function axpby.
a1eeb0220 - Expand CMake compatibility range for single-version specifications.
8a1083e9b - Aocl integration updated
a6630c53c - Fix bug introduced in !2030
PiperOrigin-RevId: 839836117
Change-Id: I1dd1114e3a79200e9506a83c6c4db964d48deac2
diff --git a/Eigen/Core b/Eigen/Core
index 94fd6ec..9f81658 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -53,6 +53,8 @@
// this include file manages BLAS and MKL related macros
// and inclusion of their respective header files
#include "src/Core/util/MKL_support.h"
+#include "src/Core/util/AOCL_Support.h"
+
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
#define EIGEN_HAS_GPU_FP16
@@ -463,6 +465,10 @@
#include "src/Core/Assign_MKL.h"
#endif
+#ifdef EIGEN_USE_AOCL_VML
+#include "src/Core/Assign_AOCL.h"
+#endif
+
#include "src/Core/GlobalFunctions.h"
// IWYU pragma: end_exports
diff --git a/Eigen/src/Core/Assign_AOCL.h b/Eigen/src/Core/Assign_AOCL.h
new file mode 100644
index 0000000..da3ef7c
--- /dev/null
+++ b/Eigen/src/Core/Assign_AOCL.h
@@ -0,0 +1,301 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ *
+ * Assign_AOCL.h - AOCL Vectorized Math Dispatch Layer for Eigen
+ *
+ * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Description:
+ * ------------
+ * This file implements a high-performance dispatch layer that automatically
+ * routes Eigen's element-wise mathematical operations to AMD Optimizing CPU
+ * Libraries (AOCL) Vector Math Library (VML) functions when beneficial for
+ * performance.
+ *
+ * The dispatch system uses C++ template specialization to intercept Eigen's
+ * assignment operations and redirect them to AOCL's VRDA functions, which
+ * provide optimized implementations for AMD Zen architectures.
+ *
+ * Key Features:
+ * -------------
+ * 1. Automatic Dispatch: Seamlessly routes supported operations to AOCL without
+ * requiring code changes in user applications
+ *
+ * 2. Performance Optimization: Uses AOCL VRDA functions optimized for Zen
+ * family processors with automatic SIMD instruction selection (AVX2, AVX-512)
+ *
+ * 3. Threshold-Based Activation: Only activates for vectors larger than
+ * EIGEN_AOCL_VML_THRESHOLD (default: 128 elements) to avoid overhead on
+ * small vectors
+ *
+ * 4. Precision-Specific Handling:
+ * - Double precision: AOCL VRDA vectorized functions
+ * - Single precision: Scalar fallback (preserves correctness)
+ *
+ * 5. Memory Layout Compatibility: Ensures direct memory access and compatible
+ * storage orders between source and destination for optimal performance
+ *
+ * Supported Operations:
+ * ---------------------
+ * UNARY OPERATIONS (vector → vector):
+ * - Transcendental: exp(), sin(), cos(), sqrt(), log(), log10(), log2()
+ *
+ * BINARY OPERATIONS (vector op vector → vector):
+ * - Arithmetic: +, *, pow()
+ *
+ * Template Specialization Mechanism:
+ * -----------------------------------
+ * The system works by specializing Eigen's Assignment template for:
+ * 1. CwiseUnaryOp with scalar_*_op functors (unary operations)
+ * 2. CwiseBinaryOp with scalar_*_op functors (binary operations)
+ * 3. Dense2Dense assignment context with AOCL-compatible traits
+ *
+ * Dispatch conditions (all must be true):
+ * - Source and destination have DirectAccessBit (contiguous memory)
+ * - Compatible storage orders (both row-major or both column-major)
+ * - Vector size ≥ EIGEN_AOCL_VML_THRESHOLD or Dynamic size
+ * - Supported data type (currently double precision for VRDA)
+ *
+ * Integration Example:
+ * --------------------
+ * // Standard Eigen code - no changes required
+ * VectorXd x = VectorXd::Random(10000);
+ * VectorXd y = VectorXd::Random(10000);
+ * VectorXd result;
+ *
+ * // These operations are automatically dispatched to AOCL:
+ * result = x.array().exp(); // → amd_vrda_exp()
+ * result = x.array().sin(); // → amd_vrda_sin()
+ * result = x.array() + y.array(); // → amd_vrda_add()
+ * result = x.array().pow(y.array()); // → amd_vrda_pow()
+ *
+ * Configuration:
+ * --------------
+ * Required preprocessor definitions:
+ * - EIGEN_USE_AOCL_ALL or EIGEN_USE_AOCL_MT: Enable AOCL integration
+ * - EIGEN_USE_AOCL_VML: Enable Vector Math Library dispatch
+ *
+ * Compilation Requirements:
+ * -------------------------
+ * Include paths:
+ * - AOCL headers: -I${AOCL_ROOT}/include
+ * - Eigen headers: -I/path/to/eigen
+ *
+ * Link libraries:
+ * - AOCL MathLib: -lamdlibm
+ * - Standard math: -lm
+ *
+ * Compiler flags:
+ * - Optimization: -O3 (required for inlining)
+ * - Architecture: -march=znver5 or -march=native
+ * - Vectorization: -mfma -mavx512f (if supported)
+ *
+ * Platform Support:
+ * ------------------
+ * - Primary: Linux x86_64 with AMD Zen family processors
+ * - Compilers: GCC 8+, Clang 10+, AOCC (recommended)
+ * - AOCL Version: 4.0+ (with VRDA support)
+ *
+ * Error Handling:
+ * ---------------
+ * - Graceful fallback to scalar operations for unsupported configurations
+ * - Compile-time detection of AOCL availability
+ * - Runtime size and alignment validation with eigen_assert()
+ *
+ * Developer:
+ * ----------
+ * Name: Sharad Saurabh Bhaskar
+ * Email: shbhaska@amd.com
+ * Organization: Advanced Micro Devices, Inc.
+ */
+
+
+#ifndef EIGEN_ASSIGN_AOCL_H
+#define EIGEN_ASSIGN_AOCL_H
+
+namespace Eigen {
+namespace internal {
+
+// Traits for unary operations.
+template <typename Dst, typename Src> class aocl_assign_traits {
+private:
+ enum {
+ DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit),
+ SrcHasDirectAccess = !!(Src::Flags & DirectAccessBit),
+ StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
+ InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime)
+ : (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime)
+ : int(Dst::RowsAtCompileTime),
+ LargeEnough =
+ (InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD)
+ };
+
+public:
+ enum {
+ EnableAoclVML = DstHasDirectAccess && SrcHasDirectAccess &&
+ StorageOrdersAgree && LargeEnough,
+ Traversal = LinearTraversal
+ };
+};
+
+// Traits for binary operations (e.g., add, pow).
+template <typename Dst, typename Lhs, typename Rhs>
+class aocl_assign_binary_traits {
+private:
+ enum {
+ DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit),
+ LhsHasDirectAccess = !!(Lhs::Flags & DirectAccessBit),
+ RhsHasDirectAccess = !!(Rhs::Flags & DirectAccessBit),
+ StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Lhs::IsRowMajor)) &&
+ (int(Dst::IsRowMajor) == int(Rhs::IsRowMajor)),
+ InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime)
+ : (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime)
+ : int(Dst::RowsAtCompileTime),
+ LargeEnough =
+ (InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD)
+ };
+
+public:
+ enum {
+ EnableAoclVML = DstHasDirectAccess && LhsHasDirectAccess &&
+ RhsHasDirectAccess && StorageOrdersAgree && LargeEnough
+ };
+};
+
+// Unary operation dispatch for float (scalar fallback).
+#define EIGEN_AOCL_VML_UNARY_CALL_FLOAT(EIGENOP) \
+ template <typename DstXprType, typename SrcXprNested> \
+ struct Assignment< \
+ DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested>, \
+ assign_op<float, float>, Dense2Dense, \
+ std::enable_if_t< \
+ aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> { \
+ typedef CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested> \
+ SrcXprType; \
+ static void run(DstXprType &dst, const SrcXprType &src, \
+ const assign_op<float, float> &) { \
+ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
+ Eigen::Index n = dst.size(); \
+ if (n <= 0) \
+ return; \
+ const float *input = \
+ reinterpret_cast<const float *>(src.nestedExpression().data()); \
+ float *output = reinterpret_cast<float *>(dst.data()); \
+ for (Eigen::Index i = 0; i < n; ++i) { \
+ output[i] = std::EIGENOP(input[i]); \
+ } \
+ } \
+ };
+
+// Unary operation dispatch for double (AOCL vectorized).
+#define EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(EIGENOP, AOCLOP) \
+ template <typename DstXprType, typename SrcXprNested> \
+ struct Assignment< \
+ DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested>, \
+ assign_op<double, double>, Dense2Dense, \
+ std::enable_if_t< \
+ aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> { \
+ typedef CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested> \
+ SrcXprType; \
+ static void run(DstXprType &dst, const SrcXprType &src, \
+ const assign_op<double, double> &) { \
+ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
+ Eigen::Index n = dst.size(); \
+ eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \
+ if (n <= 0) \
+ return; \
+ const double *input = \
+ reinterpret_cast<const double *>(src.nestedExpression().data()); \
+ double *output = reinterpret_cast<double *>(dst.data()); \
+ int aocl_n = internal::convert_index<int>(n); \
+ AOCLOP(aocl_n, const_cast<double *>(input), output); \
+ } \
+ };
+
+// Instantiate unary calls for float (scalar).
+// EIGEN_AOCL_VML_UNARY_CALL_FLOAT(exp)
+
+// Instantiate unary calls for double (AOCL vectorized).
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp2, amd_vrda_exp2)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp, amd_vrda_exp)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sin, amd_vrda_sin)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cos, amd_vrda_cos)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sqrt, amd_vrda_sqrt)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cbrt, amd_vrda_cbrt)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(abs, amd_vrda_fabs)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log, amd_vrda_log)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log10, amd_vrda_log10)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log2, amd_vrda_log2)
+
+// Binary operation dispatch for float (scalar fallback).
+#define EIGEN_AOCL_VML_BINARY_CALL_FLOAT(EIGENOP, STDFUNC) \
+ template <typename DstXprType, typename LhsXprNested, typename RhsXprNested> \
+ struct Assignment< \
+ DstXprType, \
+ CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested, \
+ RhsXprNested>, \
+ assign_op<float, float>, Dense2Dense, \
+ std::enable_if_t<aocl_assign_binary_traits< \
+ DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> { \
+ typedef CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested, \
+ RhsXprNested> \
+ SrcXprType; \
+ static void run(DstXprType &dst, const SrcXprType &src, \
+ const assign_op<float, float> &) { \
+ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
+ Eigen::Index n = dst.size(); \
+ if (n <= 0) \
+ return; \
+ const float *lhs = reinterpret_cast<const float *>(src.lhs().data()); \
+ const float *rhs = reinterpret_cast<const float *>(src.rhs().data()); \
+ float *output = reinterpret_cast<float *>(dst.data()); \
+ for (Eigen::Index i = 0; i < n; ++i) { \
+ output[i] = STDFUNC(lhs[i], rhs[i]); \
+ } \
+ } \
+ };
+
+// Binary operation dispatch for double (AOCL vectorized).
+#define EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(EIGENOP, AOCLOP) \
+ template <typename DstXprType, typename LhsXprNested, typename RhsXprNested> \
+ struct Assignment< \
+ DstXprType, \
+ CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested, \
+ RhsXprNested>, \
+ assign_op<double, double>, Dense2Dense, \
+ std::enable_if_t<aocl_assign_binary_traits< \
+ DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> { \
+ typedef CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested, \
+ RhsXprNested> \
+ SrcXprType; \
+ static void run(DstXprType &dst, const SrcXprType &src, \
+ const assign_op<double, double> &) { \
+ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
+ Eigen::Index n = dst.size(); \
+ eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \
+ if (n <= 0) \
+ return; \
+ const double *lhs = reinterpret_cast<const double *>(src.lhs().data()); \
+ const double *rhs = reinterpret_cast<const double *>(src.rhs().data()); \
+ double *output = reinterpret_cast<double *>(dst.data()); \
+ int aocl_n = internal::convert_index<int>(n); \
+ AOCLOP(aocl_n, const_cast<double *>(lhs), const_cast<double *>(rhs), output); \
+ } \
+ };
+
+// Instantiate binary calls for float (scalar).
+// EIGEN_AOCL_VML_BINARY_CALL_FLOAT(sum, std::plus<float>) // Using scalar_sum_op for addition
+// EIGEN_AOCL_VML_BINARY_CALL_FLOAT(pow, std::pow)
+
+// Instantiate binary calls for double (AOCL vectorized).
+EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(sum, amd_vrda_add) // Using scalar_sum_op for addition
+EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(pow, amd_vrda_pow)
+EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(max, amd_vrda_fmax)
+EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(min, amd_vrda_fmin)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_ASSIGN_AOCL_H
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 5ee67a5..f4f6794 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -32,6 +32,7 @@
#ifdef EIGEN_VECTORIZE_AVX2
EIGEN_DOUBLE_PACKET_FUNCTION(sin, Packet4d)
EIGEN_DOUBLE_PACKET_FUNCTION(cos, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(tan, Packet4d)
#endif
EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4d)
EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4d)
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 318b375..eafff3d 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -110,6 +110,7 @@
HasReciprocal = EIGEN_FAST_MATH,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -143,6 +144,7 @@
#ifdef EIGEN_VECTORIZE_AVX2
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
#endif
HasTanh = EIGEN_FAST_MATH,
HasErf = 1,
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index ddc766b..c69ba15 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -119,6 +119,7 @@
HasConj = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -154,6 +155,7 @@
HasCbrt = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasLog1p = 1,
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index c98f217..acc2048 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -178,6 +178,7 @@
HasAbs = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -3098,6 +3099,7 @@
HasAbs = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 96d2783..13cdba7 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -4,6 +4,7 @@
// Copyright (C) 2007 Julien Pommier
// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2018-2025 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -641,7 +642,7 @@
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
// exp(r) is computed using a 6th order minimax polynomial approximation.
-template <typename Packet>
+template <typename Packet, bool IsFinite>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x) {
const Packet cst_zero = pset1<Packet>(0.0f);
const Packet cst_one = pset1<Packet>(1.0f);
@@ -656,10 +657,15 @@
const Packet cst_p4 = pset1<Packet>(4.166965186595916748046875e-2f);
const Packet cst_p5 = pset1<Packet>(8.36894474923610687255859375e-3f);
const Packet cst_p6 = pset1<Packet>(1.37449637986719608306884765625e-3f);
-
- // Clamp x.
- Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
- Packet x = pmin(_x, cst_exp_hi);
+ Packet zero_mask;
+ Packet x;
+ if (!IsFinite) {
+ // Clamp x.
+ zero_mask = pcmp_lt(_x, cst_exp_lo);
+ x = pmin(_x, cst_exp_hi);
+ } else {
+ x = _x;
+ }
// Express exp(x) as exp(m*ln(2) + r), start by extracting
// m = floor(x/ln(2) + 0.5).
@@ -682,7 +688,9 @@
const Packet p_low = padd(r, cst_one);
Packet y = pmadd(r, p_odd, p_even);
y = pmadd(r2, y, p_low);
-
+ if (IsFinite) {
+ return pldexp_fast(y, m);
+ }
// Return 2^m * exp(r).
const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(x));
if (!predux_any(fast_pldexp_unsafe)) {
@@ -765,6 +773,11 @@
return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
}
+// Enum for selecting which function to compute. SinCos is intended to compute
+// pairs of Sin and Cos of the even entries in the packet, e.g.
+// SinCos([a, *, b, *]) = [sin(a), cos(a), sin(b), cos(b)].
+enum class TrigFunction : uint8_t { Sin, Cos, Tan, SinCos };
+
// The following code is inspired by the following stack-overflow answer:
// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
// It has been largely optimized:
@@ -821,7 +834,7 @@
return float(double(int64_t(p)) * pio2_62);
}
-template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
+template <TrigFunction Func, typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
#if EIGEN_COMP_GNUC_STRICT
__attribute__((optimize("-fno-unsafe-math-optimizations")))
@@ -851,7 +864,7 @@
#if defined(EIGEN_VECTORIZE_FMA)
// This version requires true FMA for high accuracy.
// It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
- const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
+ constexpr float huge_th = (Func == TrigFunction::Sin) ? 117435.992f : 71476.0625f;
x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
@@ -862,7 +875,7 @@
// The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
// and 2 ULP up to:
- const float huge_th = ComputeSine ? 25966.f : 18838.f;
+ constexpr float huge_th = (Func == TrigFunction::Sin) ? 25966.f : 18838.f;
x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
EIGEN_OPTIMIZATION_BARRIER(x)
x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
@@ -900,13 +913,6 @@
y_int = ploadu<PacketI>(y_int2);
}
- // Compute the sign to apply to the polynomial.
- // sin: sign = second_bit(y_int) xor signbit(_x)
- // cos: sign = second_bit(y_int+1)
- Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
- : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
- sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
-
// Get the polynomial selection mask from the second bit of y_int
// We'll calculate both (sin and cos) polynomials and then select from the two.
Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
@@ -935,7 +941,15 @@
y2 = pmadd(y2, x, x);
// Select the correct result from the two polynomials.
- if (ComputeBoth) {
+ // Compute the sign to apply to the polynomial.
+ // sin: sign = second_bit(y_int) xor signbit(_x)
+ // cos: sign = second_bit(y_int+1)
+ Packet sign_bit = (Func == TrigFunction::Sin) ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
+ : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
+ sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
+
+ if ((Func == TrigFunction::SinCos) || (Func == TrigFunction::Tan)) {
+ // TODO(rmlarsen): Add single polynomial for tan(x) instead of paying for sin+cos+div.
Packet peven = peven_mask(x);
Packet ysin = pselect(poly_mask, y2, y1);
Packet ycos = pselect(poly_mask, y1, y2);
@@ -943,23 +957,28 @@
Packet sign_bit_cos = preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
sign_bit_sin = pand(sign_bit_sin, cst_sign_mask); // clear all but left most bit
sign_bit_cos = pand(sign_bit_cos, cst_sign_mask); // clear all but left most bit
- y = pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
+ y = (Func == TrigFunction::SinCos) ? pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos))
+ : pdiv(pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
} else {
- y = ComputeSine ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
+ y = (Func == TrigFunction::Sin) ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
y = pxor(y, sign_bit);
}
- // Update the sign and filter huge inputs
return y;
}
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x) {
- return psincos_float<true>(x);
+ return psincos_float<TrigFunction::Sin>(x);
}
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x) {
- return psincos_float<false>(x);
+ return psincos_float<TrigFunction::Cos>(x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Packet& x) {
+ return psincos_float<TrigFunction::Tan>(x);
}
// Trigonometric argument reduction for double for inputs smaller than 15.
@@ -999,7 +1018,7 @@
return t;
}
-template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
+template <TrigFunction Func, typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
#if EIGEN_COMP_GNUC_STRICT
__attribute__((optimize("-fno-unsafe-math-optimizations")))
@@ -1086,19 +1105,26 @@
Packet sign_sin = pxor(x, preinterpret<Packet>(plogical_shift_left<62>(q_int)));
Packet sign_cos = preinterpret<Packet>(plogical_shift_left<62>(padd(q_int, cst_one)));
Packet sign_bit, sFinalRes;
- if (ComputeBoth) {
+ if (Func == TrigFunction::Sin) {
+ sign_bit = sign_sin;
+ sFinalRes = pselect(poly_mask, ssin, scos);
+ } else if (Func == TrigFunction::Cos) {
+ sign_bit = sign_cos;
+ sFinalRes = pselect(poly_mask, scos, ssin);
+ } else if (Func == TrigFunction::Tan) {
+ // TODO(rmlarsen): Add single polynomial for tan(x) instead of paying for sin+cos+div.
+ sign_bit = pxor(sign_sin, sign_cos);
+ sFinalRes = pdiv(pselect(poly_mask, ssin, scos), pselect(poly_mask, scos, ssin));
+ } else if (Func == TrigFunction::SinCos) {
Packet peven = peven_mask(x);
sign_bit = pselect((s), sign_sin, sign_cos);
sFinalRes = pselect(pxor(peven, poly_mask), ssin, scos);
- } else {
- sign_bit = ComputeSine ? sign_sin : sign_cos;
- sFinalRes = ComputeSine ? pselect(poly_mask, ssin, scos) : pselect(poly_mask, scos, ssin);
}
sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
sFinalRes = pxor(sFinalRes, sign_bit);
// If the inputs values are higher than that a value that the argument reduction can currently address, compute them
- // using std::sin and std::cos
+ // using the C++ standard library.
// TODO Remove it when huge angle argument reduction is implemented
if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) {
const int PacketSize = unpacket_traits<Packet>::size;
@@ -1109,10 +1135,15 @@
for (int k = 0; k < PacketSize; ++k) {
double val = x_cpy[k];
if (std::abs(val) > huge_th && (numext::isfinite)(val)) {
- if (ComputeBoth)
+ if (Func == TrigFunction::Sin) {
+ sincos_vals[k] = std::sin(val);
+ } else if (Func == TrigFunction::Cos) {
+ sincos_vals[k] = std::cos(val);
+ } else if (Func == TrigFunction::Tan) {
+ sincos_vals[k] = std::tan(val);
+ } else if (Func == TrigFunction::SinCos) {
sincos_vals[k] = k % 2 == 0 ? std::sin(val) : std::cos(val);
- else
- sincos_vals[k] = ComputeSine ? std::sin(val) : std::cos(val);
+ }
}
}
sFinalRes = ploadu<Packet>(sincos_vals);
@@ -1122,26 +1153,31 @@
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x) {
- return psincos_double<true>(x);
+ return psincos_double<TrigFunction::Sin>(x);
}
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x) {
- return psincos_double<false>(x);
+ return psincos_double<TrigFunction::Cos>(x);
}
-template <bool ComputeSin, typename Packet>
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_double(const Packet& x) {
+ return psincos_double<TrigFunction::Tan>(x);
+}
+
+template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
std::enable_if_t<std::is_same<typename unpacket_traits<Packet>::type, float>::value, Packet>
psincos_selector(const Packet& x) {
- return psincos_float<ComputeSin, Packet, true>(x);
+ return psincos_float<TrigFunction::SinCos, Packet>(x);
}
-template <bool ComputeSin, typename Packet>
+template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
std::enable_if_t<std::is_same<typename unpacket_traits<Packet>::type, double>::value, Packet>
psincos_selector(const Packet& x) {
- return psincos_double<ComputeSin, Packet, true>(x);
+ return psincos_double<TrigFunction::SinCos, Packet>(x);
}
// Generic implementation of acos(x).
@@ -1292,6 +1328,8 @@
return pxor(result, x_signmask);
}
+#ifdef EIGEN_FAST_MATH
+
/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
Doesn't do anything fancy, just a 9/8-degree rational interpolant which
is accurate up to a couple of ulps in the (approximate) range [-8, 8],
@@ -1343,6 +1381,55 @@
return pdiv(p, q);
}
+#else
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise).
+ On the domain [-1.25:1.25] we use an approximation of the form
+ tanh(x) ~= x^3 * (P(x) / Q(x)) + x, where P and Q are polynomials in x^2.
+ For |x| > 1.25, tanh is implemented as tanh(x) = 1 - (2 / (1 + exp(2*x))).
+
+ This implementation has a maximum error of 1 ULP (measured with AVX2+FMA).
+
+ This implementation works on both scalars and packets.
+*/
+template <typename T>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& x) {
+ // The polynomial coefficients were computed using Rminimax:
+ // % ./ratapprox --function="tanh(x)-x" --dom='[-1.25,1.25]' --num="[x^3,x^5]" --den="even"
+ // --type="[3,4]" --numF="[SG]" --denF="[SG]" --log --dispCoeff="dec" --output=tanhf.solly
+ constexpr float alpha[] = {-1.46725140511989593505859375e-02f, -3.333333432674407958984375e-01f};
+  constexpr float beta[] = {1.570280082523822784423828125e-02f, 4.4401752948760986328125e-01f, 1.0f};
+ const T x2 = pmul(x, x);
+ const T x3 = pmul(x2, x);
+ const T p = ppolevl<T, 1>::run(x2, alpha);
+ const T q = ppolevl<T, 2>::run(x2, beta);
+ const T small_tanh = pmadd(x3, pdiv(p, q), x);
+
+ const T sign_mask = pset1<T>(-0.0f);
+ const T abs_x = pandnot(x, sign_mask);
+ constexpr float kSmallThreshold = 1.25f;
+ const T large_mask = pcmp_lt(pset1<T>(kSmallThreshold), abs_x);
+ // Fast exit if all elements are small.
+ if (!predux_any(large_mask)) {
+ return small_tanh;
+ }
+
+ // Compute as 1 - (2 / (1 + exp(2*x)))
+ const T one = pset1<T>(1.0f);
+ const T two = pset1<T>(2.0f);
+ const T s = pexp_float<T, true>(pmul(two, abs_x));
+ const T abs_tanh = psub(one, pdiv(two, padd(s, one)));
+
+ // Handle infinite inputs and set sign bit.
+ constexpr float kHugeThreshold = 16.0f;
+ const T huge_mask = pcmp_lt(pset1<T>(kHugeThreshold), abs_x);
+ const T x_sign = pand(sign_mask, x);
+ const T large_tanh = por(x_sign, pselect(huge_mask, one, abs_tanh));
+ return pselect(large_mask, large_tanh, small_tanh);
+}
+
+#endif // EIGEN_FAST_MATH
+
/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
This uses a 19/18-degree rational interpolant which
is accurate up to a couple of ulps in the (approximate) range [-18.7, 18.7],
@@ -1540,7 +1627,7 @@
// cis(y):
RealPacket y = pand(odd_mask, a.v);
y = por(y, pcplxflip(Packet(y)).v);
- RealPacket cisy = psincos_selector<false, RealPacket>(y);
+ RealPacket cisy = psincos_selector<RealPacket>(y);
cisy = pcplxflip(Packet(cisy)).v; // cos(y) + i * sin(y)
const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 44ccb74..942ae12 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -95,7 +95,7 @@
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& x);
/** \internal \returns exp(x) for single precision float */
-template <typename Packet>
+template <typename Packet, bool IsFinite = false>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x);
/** \internal \returns exp(x) for double precision real numbers */
@@ -110,6 +110,10 @@
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x);
+/** \internal \returns tan(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Packet& x);
+
/** \internal \returns sin(x) for double precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x);
@@ -118,6 +122,10 @@
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x);
+/** \internal \returns tan(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_double(const Packet& x);
+
/** \internal \returns asin(x) for single precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x);
@@ -200,6 +208,7 @@
#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(sin, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(cos, PACKET) \
+ EIGEN_FLOAT_PACKET_FUNCTION(tan, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(asin, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(acos, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(tanh, PACKET) \
@@ -216,6 +225,7 @@
#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(sin, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(cos, PACKET) \
+ EIGEN_DOUBLE_PACKET_FUNCTION(tan, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(log, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(log2, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET) \
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 6f93b15..bf50697 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -197,6 +197,7 @@
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -5017,6 +5018,7 @@
#endif
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasSqrt = 1,
HasRsqrt = 1,
HasCbrt = 1,
diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h
index 54db626..e0e0be4 100644
--- a/Eigen/src/Core/arch/RVV10/PacketMath.h
+++ b/Eigen/src/Core/arch/RVV10/PacketMath.h
@@ -507,6 +507,7 @@
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 1ea23b0..7d53fa2 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -183,6 +183,7 @@
HasReciprocal = EIGEN_FAST_MATH,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -216,6 +217,7 @@
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
diff --git a/Eigen/src/Core/arch/SVE/MathFunctions.h b/Eigen/src/Core/arch/SVE/MathFunctions.h
index 8c8ed84..5967433 100644
--- a/Eigen/src/Core/arch/SVE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SVE/MathFunctions.h
@@ -36,6 +36,11 @@
return pcos_float(x);
}
+template <>
+EIGEN_STRONG_INLINE PacketXf ptan<PacketXf>(const PacketXf& x) {
+ return ptan_float(x);
+}
+
// Hyperbolic Tangent function.
template <>
EIGEN_STRONG_INLINE PacketXf ptanh<PacketXf>(const PacketXf& x) {
diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
index 28fc62b..39b29fa 100644
--- a/Eigen/src/Core/arch/SVE/PacketMath.h
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -353,6 +353,7 @@
HasCmp = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasTan = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasPow = 1,
diff --git a/Eigen/src/Core/arch/clang/PacketMath.h b/Eigen/src/Core/arch/clang/PacketMath.h
index e142264..19e5e8f 100644
--- a/Eigen/src/Core/arch/clang/PacketMath.h
+++ b/Eigen/src/Core/arch/clang/PacketMath.h
@@ -56,6 +56,7 @@
HasReciprocal = 1,
HasSin = 1,
HasCos = 1,
+ HasTan = 1,
HasACos = 1,
HasASin = 1,
HasATan = 1,
diff --git a/Eigen/src/Core/util/AOCL_Support.h b/Eigen/src/Core/util/AOCL_Support.h
new file mode 100644
index 0000000..434ccfd
--- /dev/null
+++ b/Eigen/src/Core/util/AOCL_Support.h
@@ -0,0 +1,175 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ *
+ * AOCL_Support.h - AMD Optimizing CPU Libraries Integration Header for Eigen
+ *
+ * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Description:
+ * ------------
+ * This header file serves as the central configuration and integration point
+ * for AMD Optimizing CPU Libraries (AOCL) with the Eigen C++ template library.
+ * It orchestrates the integration of multiple AOCL components to provide
+ * optimal mathematical computing performance on AMD Zen family processors.
+ *
+ * AOCL Component Integration:
+ * ---------------------------
+ * 1. AOCL Vector Math Library (VML):
+ * - Provides VRDA (Vector Rapid Double-precision Arithmetic) functions
+ * - Optimized transcendental functions: exp, sin, cos, sqrt, log, pow, etc.
+ * - SIMD vectorization for AMD architectures (AVX2, AVX-512)
+ * - Headers: amdlibm.h, amdlibm_vec.h
+ *
+ * 2. AOCL BLAS (BLIS - BLAS-like Library Instantiation Software):
+ * - High-performance Basic Linear Algebra Subprograms
+ * - Supports single-threaded (libblis) and multithreaded (libblis-mt)
+ * variants
+ * - Optimized matrix operations: GEMM, GEMV, TRSM, etc.
+ * - Headers: cblas.h, blis.h
+ *
+ * 3. AOCL LAPACK (libFLAME - Formal Linear Algebra Methods Environment):
+ * - Dense linear algebra operations: factorizations, eigenvalue solvers
+ * - Matrix decompositions: LU, Cholesky, QR, SVD
+ * - Eigenvalue/eigenvector computations optimized for AMD hardware
+ * - Headers: LAPACKE interface
+ *
+ * Configuration Options:
+ * ------------------------------
+ * EIGEN_AOCL_VML_THRESHOLD (default: 128):
+ * - Minimum vector size for AOCL VML dispatch
+ * - Smaller vectors use standard Eigen to avoid function call overhead
+ * - Optimal values: 64-512 depending on operation and data characteristics
+ *
+ *
+ *
+ * Architecture Support:
+ * ---------------------
+ * Optimized for AMD processor families:
+ * - Zen Architecture (Naples, Rome): AVX2 optimization
+ * - Zen 2 Architecture (Rome, Matisse): Enhanced AVX2
+ * - Zen 3 Architecture (Milan, Vermeer): Improved IPC and cache
+ * - Zen 4 Architecture (Genoa, Raphael): AVX-512 support
+ * - Zen 5 Architecture (Turin, Granite Ridge): Enhanced AVX-512
+ *
+ *
+ * Dependencies:
+ * -------------
+ * Required AOCL components:
+ * - libamdlibm: Core math library with VRDA functions
+ * - libblis or libblis-mt: BLAS implementation
+ * - libflame: LAPACK implementation
+ *
+ * System requirements:
+ * - AMD x86_64 processor (optimal performance)
+ * - Linux, Windows, or compatible POSIX system
+ * - C++11 or later standard
+ * - CMake 3.5+ for build system integration
+ *
+ * Developer:
+ * ----------
+ * Name: Sharad Saurabh Bhaskar
+ * Email: shbhaska@amd.com
+ * Organization: Advanced Micro Devices, Inc.
+ */
+
+#ifndef EIGEN_AOCL_SUPPORT_H
+#define EIGEN_AOCL_SUPPORT_H
+
+#if defined(EIGEN_USE_AOCL_ALL) || defined(EIGEN_USE_AOCL_MT)
+
+#include <complex>
+
+// Define AOCL component flags based on main flags
+#ifdef EIGEN_USE_AOCL_ALL
+#define EIGEN_USE_AOCL_VML // Enable AOCL Vector Math Library
+#define EIGEN_USE_AOCL_BLAS // Enable AOCL BLAS (BLIS)
+
+// Enable Eigen BLAS backend only if BLIS provides compatible interface
+#if defined(EIGEN_AOCL_BLIS_COMPATIBLE)
+#define EIGEN_USE_BLAS // Enable Eigen BLAS backend
+#endif
+
+#define EIGEN_USE_LAPACKE // Enable LAPACK backend (FLAME)
+#endif
+
+#ifdef EIGEN_USE_AOCL_MT
+#define EIGEN_USE_AOCL_VML // Enable AOCL Vector Math Library
+#define EIGEN_USE_AOCL_BLAS // Enable AOCL BLAS (BLIS)
+
+// For multithreaded: disable EIGEN_USE_BLAS to avoid signature conflicts
+// Use direct BLIS calls instead through EIGEN_USE_AOCL_BLAS
+// #define EIGEN_USE_BLAS // Commented out - causes conflicts with BLIS
+// interface
+
+// Note: LAPACKE disabled in MT mode to avoid header conflicts
+// #define EIGEN_USE_LAPACKE // Disabled - causes header conflicts with BLIS LAPACKE
+#define EIGEN_AOCL_USE_BLIS_MT 1 // Enable multithreaded BLIS
+#endif
+
+// Handle standalone EIGEN_USE_AOCL_VML flag
+#ifndef EIGEN_USE_AOCL_VML
+#ifdef EIGEN_USE_AOCL_ALL
+#define EIGEN_USE_AOCL_VML
+#endif
+#ifdef EIGEN_USE_AOCL_MT
+#define EIGEN_USE_AOCL_VML
+#endif
+#endif
+
+// Configuration constants - define these for any AOCL usage
+#ifndef EIGEN_AOCL_VML_THRESHOLD
+#define EIGEN_AOCL_VML_THRESHOLD 128 // Threshold for VML dispatch
+#endif
+
+#ifndef AOCL_SIMD_WIDTH
+#define AOCL_SIMD_WIDTH 8 // AVX-512: 512 bits / 64 bits per double
+#endif
+
+// Include AOCL Math Library headers for VML
+#if defined(EIGEN_USE_AOCL_VML) || defined(EIGEN_USE_AOCL_ALL) || \
+ defined(EIGEN_USE_AOCL_MT)
+#if defined(__has_include)
+#if __has_include("amdlibm.h")
+#include "amdlibm.h"
+#ifndef AMD_LIBM_VEC_EXPERIMENTAL
+#define AMD_LIBM_VEC_EXPERIMENTAL
+#endif
+#if __has_include("amdlibm_vec.h")
+#include "amdlibm_vec.h"
+#endif
+#endif
+#else
+// Fallback for compilers without __has_include
+#include "amdlibm.h"
+#ifndef AMD_LIBM_VEC_EXPERIMENTAL
+#define AMD_LIBM_VEC_EXPERIMENTAL
+#endif
+#include "amdlibm_vec.h"
+#endif
+#endif
+
+// Include CBLAS headers when BLAS is enabled
+#ifdef EIGEN_USE_AOCL_BLAS
+#if defined(__has_include)
+#if __has_include("cblas.h")
+#include "cblas.h"
+#elif __has_include("blis.h")
+#include "blis.h"
+#endif
+#else
+// Fallback
+#include "cblas.h"
+#endif
+#endif
+
+namespace Eigen {
+// AOCL-specific type definitions
+typedef std::complex<double> dcomplex;
+typedef std::complex<float> scomplex;
+typedef int BlasIndex; // Standard BLAS index type
+} // namespace Eigen
+
+#endif // EIGEN_USE_AOCL_ALL || EIGEN_USE_AOCL_MT
+
+#endif // EIGEN_AOCL_SUPPORT_H
diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt
index 45488d7..c8a2885 100644
--- a/blas/CMakeLists.txt
+++ b/blas/CMakeLists.txt
@@ -21,7 +21,9 @@
if (EIGEN_BUILD_SHARED_LIBS)
add_library(eigen_blas SHARED ${EigenBlas_SRCS} "eigen_blas.def")
target_compile_definitions(eigen_blas PUBLIC "EIGEN_BLAS_BUILD_DLL")
- set_target_properties(eigen_blas PROPERTIES CXX_VISIBILITY_PRESET hidden)
+ set_target_properties(eigen_blas PROPERTIES CXX_VISIBILITY_PRESET hidden
+ VERSION ${EIGEN_VERSION_NUMBER}
+ SOVERSION ${EIGEN_MAJOR_VERSION})
list(APPEND EIGEN_BLAS_TARGETS eigen_blas)
endif()
diff --git a/blas/blas.h b/blas/blas.h
index 2f218a8..220e956 100644
--- a/blas/blas.h
+++ b/blas/blas.h
@@ -59,6 +59,19 @@
EIGEN_BLAS_API void BLASFUNC(zaxpyc)(const int *, const double *, const double *, const int *, double *, const int *);
EIGEN_BLAS_API void BLASFUNC(xaxpyc)(const int *, const double *, const double *, const int *, double *, const int *);
+EIGEN_BLAS_API void BLASFUNC(saxpby)(const int *, const float *, const float *, const int *, const float *, float *,
+ const int *);
+EIGEN_BLAS_API void BLASFUNC(daxpby)(const int *, const double *, const double *, const int *, const double *, double *,
+ const int *);
+EIGEN_BLAS_API void BLASFUNC(qaxpby)(const int *, const double *, const double *, const int *, const double *, double *,
+ const int *);
+EIGEN_BLAS_API void BLASFUNC(caxpby)(const int *, const float *, const float *, const int *, const float *, float *,
+ const int *);
+EIGEN_BLAS_API void BLASFUNC(zaxpby)(const int *, const double *, const double *, const int *, const double *, double *,
+ const int *);
+EIGEN_BLAS_API void BLASFUNC(xaxpby)(const int *, const double *, const double *, const int *, const double *, double *,
+ const int *);
+
EIGEN_BLAS_API void BLASFUNC(scopy)(int *, float *, int *, float *, int *);
EIGEN_BLAS_API void BLASFUNC(dcopy)(int *, double *, int *, double *, int *);
EIGEN_BLAS_API void BLASFUNC(qcopy)(int *, double *, int *, double *, int *);
diff --git a/blas/eigen_blas.def b/blas/eigen_blas.def
index fb18d44..20f26dc 100644
--- a/blas/eigen_blas.def
+++ b/blas/eigen_blas.def
@@ -13,6 +13,10 @@
zaxpy_
; caxpyc_
; zaxpyc_
+ saxpby_
+ daxpby_
+ caxpby_
+ zaxpby_
scopy_
dcopy_
ccopy_
@@ -91,8 +95,8 @@
; dmin_
; cmin_
; zmin_
-
-
+
+
; Level 2
sgemv_
dgemv_
diff --git a/blas/level1_impl.h b/blas/level1_impl.h
index a65af92..085b356 100644
--- a/blas/level1_impl.h
+++ b/blas/level1_impl.h
@@ -22,11 +22,42 @@
else if (*incx > 0 && *incy > 0)
make_vector(y, *n, *incy) += alpha * make_vector(x, *n, *incx);
else if (*incx > 0 && *incy < 0)
- make_vector(y, *n, -*incy).reverse() += alpha * make_vector(x, *n, *incx);
+ make_vector(y, *n, -*incy) += alpha * make_vector(x, *n, *incx).reverse();
else if (*incx < 0 && *incy > 0)
make_vector(y, *n, *incy) += alpha * make_vector(x, *n, -*incx).reverse();
else if (*incx < 0 && *incy < 0)
- make_vector(y, *n, -*incy).reverse() += alpha * make_vector(x, *n, -*incx).reverse();
+ make_vector(y, *n, -*incy) += alpha * make_vector(x, *n, -*incx);
+}
+
+EIGEN_BLAS_FUNC(axpby)
+(const int *pn, const RealScalar *palpha, const RealScalar *px, const int *pincx, const RealScalar *pbeta,
+ RealScalar *py, const int *pincy) {
+ const Scalar *x = reinterpret_cast<const Scalar *>(px);
+ Scalar *y = reinterpret_cast<Scalar *>(py);
+ const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
+ const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
+ const int n = *pn;
+
+ if (n <= 0) return;
+
+ if (Eigen::numext::equal_strict(beta, Scalar(1))) {
+ EIGEN_BLAS_FUNC_NAME(axpy)(pn, palpha, px, pincx, py, pincy);
+ return;
+ }
+
+ const int incx = *pincx;
+ const int incy = *pincy;
+
+ if (incx == 1 && incy == 1)
+ make_vector(y, n) = alpha * make_vector(x, n) + beta * make_vector(y, n);
+ else if (incx > 0 && incy > 0)
+ make_vector(y, n, incy) = alpha * make_vector(x, n, incx) + beta * make_vector(y, n, incy);
+ else if (incx > 0 && incy < 0)
+ make_vector(y, n, -incy) = alpha * make_vector(x, n, incx).reverse() + beta * make_vector(y, n, -incy);
+ else if (incx < 0 && incy > 0)
+ make_vector(y, n, incy) = alpha * make_vector(x, n, -incx).reverse() + beta * make_vector(y, n, incy);
+ else if (incx < 0 && incy < 0)
+ make_vector(y, n, -incy) = alpha * make_vector(x, n, -incx) + beta * make_vector(y, n, -incy);
}
EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy) {
diff --git a/cmake/Eigen3ConfigVersion.cmake.in b/cmake/Eigen3ConfigVersion.cmake.in
index b680c63..dbbb5b5 100644
--- a/cmake/Eigen3ConfigVersion.cmake.in
+++ b/cmake/Eigen3ConfigVersion.cmake.in
@@ -1,5 +1,5 @@
# This is a CMake version file for the Config-mode of find_package().
-#
+#
# The version constraint is compatible with the current package under the
# following conditions:
# - If a version range is specified, the package version falls within the
@@ -12,7 +12,8 @@
# - 3...<5 matches 3.0.0.0 to <5.0.0.0
# - 3...<5.1 matches 3.0.0.0 to <5.1.0.0
# - 3 matches 3.0.0.0 to <4.0.0.0
-# - 3.4 matches 3.4.0.0 to <3.5.0.0
+# - 3.4 matches 3.4.0.0 to <4.0.0.0 due to semantic versioning
+# - 3.4 EXACT matches 3.4.0.0 to <3.5.0.0
set(PACKAGE_VERSION "@CVF_VERSION@")
@@ -65,26 +66,30 @@
set(PACKAGE_VERSION_COMPATIBLE TRUE)
endif()
else()
- # Create exclusive upper bound.
+
+ # Semantic versioning upper bound.
+ math(EXPR _PACKAGE_FIND_VERSION_MAJOR "${PACKAGE_FIND_VERSION_MAJOR}+1")
+ set(_PACKAGE_FIND_VERSION_SEMVER_UPPER "${_PACKAGE_FIND_VERSION_MAJOR}.0.0.0")
+
+ # Create exclusive upper bound for exact match.
if (PACKAGE_FIND_VERSION_COUNT EQUAL 1)
- math(EXPR _PACKAGE_FIND_VERSION_MAJOR "${PACKAGE_FIND_VERSION_MAJOR}+1")
- set(_PACKAGE_FIND_VERSION_UPPER "${_PACKAGE_FIND_VERSION_MAJOR}.0.0.0")
+ set(_PACKAGE_FIND_VERSION_EXACT_UPPER "${_PACKAGE_FIND_VERSION_SEMVER_UPPER}")
elseif (PACKAGE_FIND_VERSION_COUNT EQUAL 2)
math(EXPR _PACKAGE_FIND_VERSION_MINOR "${PACKAGE_FIND_VERSION_MINOR}+1")
- set(_PACKAGE_FIND_VERSION_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${_PACKAGE_FIND_VERSION_MINOR}.0.0")
+ set(_PACKAGE_FIND_VERSION_EXACT_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${_PACKAGE_FIND_VERSION_MINOR}.0.0")
elseif (PACKAGE_FIND_VERSION_COUNT EQUAL 3)
math(EXPR _PACKAGE_FIND_VERSION_PATCH "${PACKAGE_FIND_VERSION_PATCH}+1")
- set(_PACKAGE_FIND_VERSION_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${_PACKAGE_FIND_VERSION_PATCH}.0")
+ set(_PACKAGE_FIND_VERSION_EXACT_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${_PACKAGE_FIND_VERSION_PATCH}.0")
elseif (PACKAGE_FIND_VERSION_COUNT EQUAL 4)
math(EXPR _PACKAGE_FIND_VERSION_TWEAK "${PACKAGE_FIND_VERSION_TWEAK}+1")
- set(_PACKAGE_FIND_VERSION_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${PACKAGE_FIND_VERSION_PATCH}.${_PACKAGE_FIND_VERSION_TWEAK}")
+ set(_PACKAGE_FIND_VERSION_EXACT_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${PACKAGE_FIND_VERSION_PATCH}.${_PACKAGE_FIND_VERSION_TWEAK}")
endif()
- if((_PACKAGE_VERSION_FULL VERSION_LESS PACKAGE_FIND_VERSION) OR (_PACKAGE_VERSION_FULL VERSION_GREATER_EQUAL _PACKAGE_FIND_VERSION_UPPER))
+ if((_PACKAGE_VERSION_FULL VERSION_LESS PACKAGE_FIND_VERSION) OR (_PACKAGE_VERSION_FULL VERSION_GREATER_EQUAL _PACKAGE_FIND_VERSION_SEMVER_UPPER))
set(PACKAGE_VERSION_COMPATIBLE FALSE)
else()
set(PACKAGE_VERSION_COMPATIBLE TRUE)
- if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
+ if(_PACKAGE_VERSION_FULL VERSION_LESS _PACKAGE_FIND_VERSION_EXACT_UPPER)
set(PACKAGE_VERSION_EXACT TRUE)
endif()
endif()
diff --git a/cmake/FindAOCL.cmake b/cmake/FindAOCL.cmake
new file mode 100644
index 0000000..5e49835
--- /dev/null
+++ b/cmake/FindAOCL.cmake
@@ -0,0 +1,292 @@
+
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+#
+# FindAOCL.cmake - CMake Module for AMD Optimizing CPU Libraries (AOCL)
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Description:
+# ------------
+# This CMake module locates and configures AMD Optimizing CPU Libraries (AOCL)
+# for high-performance mathematical computing on AMD processors. It searches for
+# and sets up the following AOCL components:
+#
+# 1. AOCL MathLib (libamdlibm): Vector Math Library providing optimized
+# transcendental functions (exp, sin, cos, sqrt, log, etc.) with VRDA
+# (Vector Rapid Double-precision Arithmetic) support for SIMD acceleration
+#
+# 2. AOCL BLAS (BLIS): Basic Linear Algebra Subprograms optimized for AMD
+# architectures, supporting both single-threaded (libblis) and multithreaded
+# (libblis-mt) execution with OpenMP parallelization
+#
+# 3. AOCL LAPACK (libflame): Linear Algebra PACKage providing dense matrix
+# factorizations, eigenvalue/eigenvector computations, and linear system
+# solvers optimized for AMD processors
+#
+# The module automatically detects the appropriate library variants based on
+# configuration flags and provides proper linking setup for optimal performance
+# on Zen, Zen2, Zen3, Zen4, and Zen5 architectures.
+#
+# Variables Set:
+# --------------
+# AOCL_FOUND - True if AOCL libraries are found
+# AOCL_LIBRARIES - List of AOCL libraries to link against
+# AOCL_INCLUDE_DIRS - Include directories for AOCL headers
+# AOCL_BLAS_TYPE - Type of BLIS library found ("multithreaded" or "single-threaded")
+# AOCL_CORE_LIB - Path to core AOCL math library
+# AOCL_BLAS_LIB - Path to AOCL BLAS library
+# AOCL_LAPACK_LIB - Path to AOCL LAPACK library
+#
+# Configuration Options:
+# ----------------------
+# EIGEN_AOCL_BENCH_USE_MT - When ON, searches for multithreaded BLIS first
+# When OFF, searches for single-threaded BLIS only
+#
+# # For multithreaded BLIS:
+# cmake .. -DEIGEN_AOCL_BENCH_USE_MT=ON
+#
+# # For single-threaded BLIS:
+# cmake .. -DEIGEN_AOCL_BENCH_USE_MT=OFF
+#
+# Library Search Paths:
+# ---------------------
+# The module searches for AOCL libraries in the following order:
+# 1. ${AOCL_ROOT}/lib (or ${AOCL_ROOT}/lib32 for 32-bit)
+# 2. /opt/amd/aocl/lib64 (or /opt/amd/aocl/lib32 for 32-bit)
+# 3. ${LIB_INSTALL_DIR}
+#
+# Expected Library Names:
+# -----------------------
+# Core MathLib: amdlibm, alm, almfast
+# BLAS Single: blis
+# BLAS Multi: blis-mt
+# LAPACK: flame
+#
+# Dependencies:
+# -------------
+# The module automatically links the following system libraries:
+# - libm (standard math library)
+# - libpthread (POSIX threads)
+# - librt (real-time extensions)
+#
+# Architecture Support:
+# ---------------------
+# Optimized for AMD Zen family processors (Zen, Zen2, Zen3, Zen4, Zen5)
+# with automatic architecture detection and SIMD instruction selection.
+#
+# Developer:
+# ----------
+# Name: Sharad Saurabh Bhaskar
+# Email: shbhaska@amd.com
+#
+
+if(NOT DEFINED AOCL_ROOT)
+ if(DEFINED ENV{AOCL_ROOT})
+ set(AOCL_ROOT $ENV{AOCL_ROOT})
+ if (NOT AOCL_FIND_QUIETLY)
+ message(STATUS "AOCL_ROOT set from environment: ${AOCL_ROOT}")
+ endif()
+ else()
+ if (NOT AOCL_FIND_QUIETLY)
+ message(WARNING "AOCL_ROOT is not set. AOCL support will be disabled.")
+ endif()
+ set(AOCL_LIBRARIES "")
+ endif()
+endif()
+
+if(AOCL_LIBRARIES)
+ set(AOCL_FIND_QUIETLY TRUE)
+endif()
+
+# Determine default include directories
+set(AOCL_INCLUDE_DIRS "")
+if(AOCL_ROOT AND EXISTS "${AOCL_ROOT}/include")
+ list(APPEND AOCL_INCLUDE_DIRS "${AOCL_ROOT}/include")
+endif()
+if(EXISTS "/opt/amd/aocl/include")
+ list(APPEND AOCL_INCLUDE_DIRS "/opt/amd/aocl/include")
+endif()
+
+ if(${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64")
+ # Search for the core AOCL math library.
+ find_library(AOCL_CORE_LIB
+ NAMES amdlibm alm almfast
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib64
+ ${LIB_INSTALL_DIR}
+ )
+ if (NOT AOCL_FIND_QUIETLY)
+ if(AOCL_CORE_LIB)
+ message(STATUS "Found AOCL core library: ${AOCL_CORE_LIB}")
+ else()
+ message(WARNING "AOCL core library not found in ${AOCL_ROOT}/lib or default locations.")
+ endif()
+ endif()
+
+ # Conditional BLIS library search based on MT requirement
+ if(EIGEN_AOCL_BENCH_USE_MT)
+ # Search for multithreaded BLIS first
+ find_library(AOCL_BLAS_LIB
+ NAMES blis-mt
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib64
+ ${LIB_INSTALL_DIR}
+ )
+ if(AOCL_BLAS_LIB)
+ if (NOT AOCL_FIND_QUIETLY)
+ message(STATUS "Found AOCL BLAS (MT) library: ${AOCL_BLAS_LIB}")
+ endif()
+ set(AOCL_BLAS_TYPE "multithreaded")
+ else()
+ if (NOT AOCL_FIND_QUIETLY)
+ message(WARNING "AOCL multithreaded BLAS library not found, falling back to single-threaded.")
+ endif()
+ find_library(AOCL_BLAS_LIB
+ NAMES blis
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib64
+ ${LIB_INSTALL_DIR}
+ )
+ set(AOCL_BLAS_TYPE "single-threaded")
+ endif()
+ else()
+ # Search for single-threaded BLIS
+ find_library(AOCL_BLAS_LIB
+ NAMES blis
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib64
+ ${LIB_INSTALL_DIR}
+ )
+ if(AOCL_BLAS_LIB)
+ if (NOT AOCL_FIND_QUIETLY)
+ message(STATUS "Found AOCL BLAS (ST) library: ${AOCL_BLAS_LIB}")
+ endif()
+ set(AOCL_BLAS_TYPE "single-threaded")
+ else()
+ if (NOT AOCL_FIND_QUIETLY)
+ message(WARNING "AOCL single-threaded BLAS library not found.")
+ endif()
+ endif()
+ endif()
+
+ # Now search for AOCL LAPACK library.
+ find_library(AOCL_LAPACK_LIB
+ NAMES flame
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib64
+ ${LIB_INSTALL_DIR}
+ )
+ if (NOT AOCL_FIND_QUIETLY)
+ if(AOCL_LAPACK_LIB)
+ message(STATUS "Found AOCL LAPACK library: ${AOCL_LAPACK_LIB}")
+ else()
+ message(WARNING "AOCL LAPACK library not found in ${AOCL_ROOT}/lib or default locations.")
+ endif()
+ endif()
+
+ else()
+ # For 32-bit systems, similar search paths.
+ find_library(AOCL_CORE_LIB
+ NAMES amdlibm alm almfast
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib32
+ ${LIB_INSTALL_DIR}
+ )
+ if (NOT AOCL_FIND_QUIETLY)
+ if(AOCL_CORE_LIB)
+ message(STATUS "Found AOCL core library: ${AOCL_CORE_LIB}")
+ else()
+ message(WARNING "AOCL core library not found in ${AOCL_ROOT}/lib or default locations.")
+ endif()
+ endif()
+
+ # Conditional BLIS library search for 32-bit
+ if(EIGEN_AOCL_BENCH_USE_MT)
+ find_library(AOCL_BLAS_LIB
+ NAMES blis-mt
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib32
+ ${LIB_INSTALL_DIR}
+ )
+ if(AOCL_BLAS_LIB)
+ if (NOT AOCL_FIND_QUIETLY)
+ message(STATUS "Found AOCL BLAS (MT) library: ${AOCL_BLAS_LIB}")
+ endif()
+ set(AOCL_BLAS_TYPE "multithreaded")
+ else()
+ if (NOT AOCL_FIND_QUIETLY)
+ message(WARNING "AOCL multithreaded BLAS library not found, falling back to single-threaded.")
+ endif()
+ find_library(AOCL_BLAS_LIB
+ NAMES blis
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib32
+ ${LIB_INSTALL_DIR}
+ )
+ set(AOCL_BLAS_TYPE "single-threaded")
+ endif()
+ else()
+ find_library(AOCL_BLAS_LIB
+ NAMES blis
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib32
+ ${LIB_INSTALL_DIR}
+ )
+ if(AOCL_BLAS_LIB)
+ if (NOT AOCL_FIND_QUIETLY)
+ message(STATUS "Found AOCL BLAS (ST) library: ${AOCL_BLAS_LIB}")
+ endif()
+ set(AOCL_BLAS_TYPE "single-threaded")
+ else()
+ if (NOT AOCL_FIND_QUIETLY)
+ message(WARNING "AOCL single-threaded BLAS library not found.")
+ endif()
+ endif()
+ endif()
+
+ find_library(AOCL_LAPACK_LIB
+ NAMES flame
+ PATHS
+ ${AOCL_ROOT}/lib
+ /opt/amd/aocl/lib32
+ ${LIB_INSTALL_DIR}
+ )
+ if (NOT AOCL_FIND_QUIETLY)
+ if(AOCL_LAPACK_LIB)
+ message(STATUS "Found AOCL LAPACK library: ${AOCL_LAPACK_LIB}")
+ else()
+ message(WARNING "AOCL LAPACK library not found in ${AOCL_ROOT}/lib or default locations.")
+ endif()
+ endif()
+endif()
+
+# Combine the found libraries into one variable.
+if(AOCL_CORE_LIB)
+ set(AOCL_LIBRARIES ${AOCL_CORE_LIB})
+endif()
+if(AOCL_BLAS_LIB)
+ list(APPEND AOCL_LIBRARIES ${AOCL_BLAS_LIB})
+endif()
+if(AOCL_LAPACK_LIB)
+ list(APPEND AOCL_LIBRARIES ${AOCL_LAPACK_LIB})
+endif()
+if(AOCL_LIBRARIES)
+ # Link against the standard math and pthread libraries as well as librt
+ list(APPEND AOCL_LIBRARIES m pthread rt)
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(AOCL DEFAULT_MSG AOCL_LIBRARIES AOCL_INCLUDE_DIRS)
+mark_as_advanced(AOCL_LIBRARIES AOCL_INCLUDE_DIRS)
diff --git a/doc/UsingAOCL.dox b/doc/UsingAOCL.dox
new file mode 100644
index 0000000..24ce698
--- /dev/null
+++ b/doc/UsingAOCL.dox
@@ -0,0 +1,289 @@
+/*
+ Copyright (c) 2025, AMD Inc. All rights reserved.
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of AMD nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ********************************************************************************
+ * Content : Documentation on the use of AMD AOCL through Eigen
+ ********************************************************************************
+*/
+
+namespace Eigen {
+
+/** \page TopicUsingAOCL Using AMD® AOCL from %Eigen
+
+In %Eigen version 3.4 and later, users can benefit from built-in AMD® Optimizing CPU Libraries (AOCL) optimizations with an installed copy of AOCL 5.0 (or later).
+
+<a href="https://www.amd.com/en/developer/aocl.html"> AMD AOCL </a> provides highly optimized, multi-threaded mathematical routines for x86-64 processors with a focus on AMD "Zen"-based architectures. AOCL is available on Linux and Windows for x86-64 architectures.
+
+\note
+AMD® AOCL is freely available software, but it is the responsibility of users to download, install, and ensure their product's license allows linking to the AOCL libraries. AOCL is distributed under a permissive license that allows commercial use.
+
+Using AMD AOCL through %Eigen is straightforward:
+-# export \c AOCL_ROOT into your environment
+-# define one of the AOCL macros before including any %Eigen headers (see table below)
+-# link your program to AOCL libraries (BLIS, FLAME, LibM)
+-# ensure your system supports the target architecture optimizations
+
+When doing so, a number of %Eigen's algorithms are silently substituted with calls to AMD AOCL routines.
+These substitutions apply only for \b Dynamic \b or \b large \b enough objects with one of the following standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
+Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.
+
+The AOCL integration targets three core components:
+- **BLIS**: High-performance BLAS implementation optimized for modern cache hierarchies
+- **FLAME**: Dense linear algebra algorithms providing LAPACK functionality
+- **LibM**: Optimized standard math routines with vectorized implementations
+
+\section TopicUsingAOCL_Macros Configuration Macros
+
+You can choose which parts will be substituted by defining one or multiple of the following macros:
+
+<table class="manual">
+<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (AOCL-BLIS)</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external LAPACK routines via the LAPACKE C interface (AOCL-FLAME)</td></tr>
+<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithms of lower robustness are disabled. \n This currently concerns only JacobiSVD which would be replaced by \c gesvd.</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_AOCL_VML </td><td>Enables the use of AOCL LibM vector math operations for coefficient-wise functions</td></tr>
+<tr><td>\c EIGEN_USE_AOCL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_AOCL_VML</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_AOCL_MT </td><td>Equivalent to \c EIGEN_USE_AOCL_ALL, but ensures multi-threaded BLIS (\c libblis-mt) is used. \n \b Recommended for most applications.</td></tr>
+</table>
+
+\note The AOCL integration automatically enables optimizations when the matrix/vector size exceeds \c EIGEN_AOCL_VML_THRESHOLD (default: 128 elements). For smaller operations, Eigen's built-in vectorization may be faster due to function call overhead.
+
+\section TopicUsingAOCL_Performance Performance Considerations
+
+The \c EIGEN_USE_BLAS and \c EIGEN_USE_LAPACKE macros can be combined with AOCL-specific optimizations:
+
+- **Multi-threading**: Use \c EIGEN_USE_AOCL_MT to automatically select the multi-threaded BLIS library
+- **Architecture targeting**: AOCL libraries are optimized for AMD Zen architectures (Zen, Zen2, Zen3, Zen4, Zen5)
+- **Vector Math Library**: AOCL LibM provides vectorized implementations that can operate on entire arrays simultaneously
+- **Memory layout**: Eigen's column-major storage directly matches AOCL's expected data layout for zero-copy operation
+
+\section TopicUsingAOCL_Types Supported Data Types and Sizes
+
+AOCL acceleration is applied to:
+- **Scalar types**: \c float, \c double, \c complex<float>, \c complex<double>
+- **Matrix/Vector sizes**: Dynamic size or compile-time size ≥ \c EIGEN_AOCL_VML_THRESHOLD
+- **Storage order**: Both column-major (default) and row-major layouts
+- **Memory alignment**: Eigen's data pointers are directly compatible with AOCL function signatures
+
+The current AOCL Vector Math Library integration is specialized for \c double precision, with automatic fallback to scalar implementations for \c float.
+
+\section TopicUsingAOCL_Functions Vector Math Functions
+
+The following table summarizes coefficient-wise operations accelerated by \c EIGEN_USE_AOCL_VML:
+
+<table class="manual">
+<tr><th>Code example</th><th>AOCL routines</th></tr>
+<tr><td>\code
+v2 = v1.array().exp();
+v2 = v1.array().sin();
+v2 = v1.array().cos();
+v2 = v1.array().tan();
+v2 = v1.array().log();
+v2 = v1.array().log10();
+v2 = v1.array().log2();
+v2 = v1.array().sqrt();
+v2 = v1.array().pow(1.5);
+v2 = v1.array() + v2.array();
+\endcode</td><td>\code
+amd_vrda_exp
+amd_vrda_sin
+amd_vrda_cos
+amd_vrda_tan
+amd_vrda_log
+amd_vrda_log10
+amd_vrda_log2
+amd_vrda_sqrt
+amd_vrda_pow
+amd_vrda_add
+\endcode</td></tr>
+</table>
+
+In the examples, v1 and v2 are dense vectors of type \c VectorXd with size ≥ \c EIGEN_AOCL_VML_THRESHOLD.
+
+\section TopicUsingAOCL_Example Complete Example
+
+\code
+#define EIGEN_USE_AOCL_MT
+#include <iostream>
+#include <Eigen/Dense>
+
+int main() {
+ const int n = 2048;
+
+ // Large matrices automatically use AOCL-BLIS for multiplication
+ Eigen::MatrixXd A = Eigen::MatrixXd::Random(n, n);
+ Eigen::MatrixXd B = Eigen::MatrixXd::Random(n, n);
+ Eigen::MatrixXd C = A * B; // Dispatched to dgemm
+
+ // Large vectors automatically use AOCL LibM for math functions
+ Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(10000, 0, 10);
+ Eigen::VectorXd result = v.array().sin(); // Dispatched to amd_vrda_sin
+
+ // LAPACK decompositions use AOCL-FLAME
+ Eigen::LLT<Eigen::MatrixXd> llt(A); // Dispatched to dpotrf
+
+ std::cout << "Matrix norm: " << C.norm() << std::endl;
+ std::cout << "Vector result norm: " << result.norm() << std::endl;
+
+ return 0;
+}
+\endcode
+
+\section TopicUsingAOCL_Building Building and Linking
+
+To compile with AOCL support, set the \c AOCL_ROOT environment variable and link against the required libraries:
+
+\code
+export AOCL_ROOT=/path/to/aocl
+clang++ -O3 -g -DEIGEN_USE_AOCL_ALL \
+ -I./install/include -I${AOCL_ROOT}/include \
+ -Wno-parentheses my_app.cpp \
+ -L${AOCL_ROOT}/lib -lamdlibm -lflame -lblis \
+ -lpthread -lrt -lm -lomp \
+ -o eigen_aocl_example
+\endcode
+
+For multi-threaded performance, use the multi-threaded BLIS library:
+\code
+clang++ -O3 -g -DEIGEN_USE_AOCL_MT \
+ -I./install/include -I${AOCL_ROOT}/include \
+ -Wno-parentheses my_app.cpp \
+        -L${AOCL_ROOT}/lib -lamdlibm -lflame -lblis-mt \
+ -lpthread -lrt -lm -lomp \
+ -o eigen_aocl_example
+\endcode
+
+Key compiler and linker flags:
+- \c -DEIGEN_USE_AOCL_ALL: Enable all AOCL accelerations (BLAS, LAPACK, VML)
+- \c -DEIGEN_USE_AOCL_MT: Enable multi-threaded version (uses \c -lblis-mt)
+- \c -lblis: Single-threaded BLIS library
+- \c -lblis-mt: Multi-threaded BLIS library (recommended for performance)
+- \c -lflame: FLAME LAPACK implementation
+- \c -lamdlibm: AMD LibM vector math library
+- \c -lomp: OpenMP runtime for multi-threading support
+- \c -lpthread -lrt: System threading and real-time libraries
+- \c -Wno-parentheses: Suppress common warnings when using AOCL headers
+
+\subsection TopicUsingAOCL_EigenBuild Building Eigen with AOCL Support
+
+To build Eigen with AOCL support, use the following CMake configuration:
+
+\code
+cmake .. -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_C_COMPILER=clang \
+ -DCMAKE_CXX_COMPILER=clang++ \
+ -DCMAKE_INSTALL_PREFIX=$PWD/install \
+ -DINCLUDE_INSTALL_DIR=$PWD/install/include \
+ && make install -j$(nproc)
+\endcode
+
+
+To build Eigen with AOCL integration and benchmarking capabilities, use the following CMake configuration:
+
+\code
+cmake .. -DEIGEN_BUILD_AOCL_BENCH=ON \
+ -DEIGEN_AOCL_BENCH_FLAGS="-O3 -mavx512f -fveclib=AMDLIBM" \
+ -DEIGEN_AOCL_BENCH_USE_MT=OFF \
+ -DEIGEN_AOCL_BENCH_ARCH=znver5 \
+ -DCMAKE_BUILD_TYPE=Debug \
+ -DCMAKE_C_COMPILER=clang \
+ -DCMAKE_CXX_COMPILER=clang++ \
+ -DCMAKE_INSTALL_PREFIX=$PWD/install \
+ -DINCLUDE_INSTALL_DIR=$PWD/install/include \
+ && make install -j$(nproc)
+\endcode
+
+**CMake Configuration Parameters:**
+
+<table class="manual">
+<tr><th>Parameter</th><th>Expected Values</th><th>Description</th></tr>
+<tr><td>\c EIGEN_BUILD_AOCL_BENCH</td><td>\c ON, \c OFF</td><td>Enable/disable AOCL benchmark compilation</td></tr>
+<tr class="alt"><td>\c EIGEN_AOCL_BENCH_FLAGS</td><td>Compiler flags string</td><td>Additional compiler optimizations: \c "-O3 -mavx512f -fveclib=AMDLIBM"</td></tr>
+<tr><td>\c EIGEN_AOCL_BENCH_USE_MT</td><td>\c ON, \c OFF</td><td>Use multi-threaded AOCL libraries (\c ON recommended for performance)</td></tr>
+<tr class="alt"><td>\c EIGEN_AOCL_BENCH_ARCH</td><td>\c znver3, \c znver4, \c znver5, \c native, \c generic</td><td>Target AMD architecture (match your CPU generation)</td></tr>
+<tr><td>\c CMAKE_BUILD_TYPE</td><td>\c Release, \c Debug, \c RelWithDebInfo</td><td>Build configuration (\c Release recommended for benchmarks)</td></tr>
+<tr class="alt"><td>\c CMAKE_C_COMPILER</td><td>\c clang, \c gcc</td><td>C compiler (clang recommended for AOCL)</td></tr>
+<tr><td>\c CMAKE_CXX_COMPILER</td><td>\c clang++, \c g++</td><td>C++ compiler (clang++ recommended for AOCL)</td></tr>
+<tr class="alt"><td>\c CMAKE_INSTALL_PREFIX</td><td>Installation path</td><td>Where to install Eigen headers</td></tr>
+<tr><td>\c INCLUDE_INSTALL_DIR</td><td>Header path</td><td>Specific path for Eigen headers</td></tr>
+</table>
+
+**Architecture Selection Guide:**
+- \c znver3: AMD Zen 3 (EPYC 7003, Ryzen 5000 series)
+- \c znver4: AMD Zen 4 (EPYC 9004, Ryzen 7000 series)
+- \c znver5: AMD Zen 5 (EPYC 9005, Ryzen 9000 series)
+- \c native: Auto-detect current CPU architecture
+- \c generic: Generic x86-64 without specific optimizations
+
+**Custom Compiler Flags Explanation:**
+- \c -O3: Maximum optimization level
+- \c -mavx512f: Enable AVX-512 instruction set (if supported)
+- \c -fveclib=AMDLIBM: Use AMD LibM for vectorized math functions
+
+\subsection TopicUsingAOCL_Benchmark Building the AOCL Benchmark
+
+After configuring Eigen, build the AOCL benchmark executable:
+
+\code
+cmake --build . --target benchmark_aocl -j$(nproc)
+\endcode
+
+This creates the \c benchmark_aocl executable that demonstrates AOCL acceleration with various matrix sizes and operations.
+
+**Running the Benchmark:**
+\code
+./benchmark_aocl
+\endcode
+
+The benchmark will automatically compare:
+- Eigen's native performance vs AOCL-accelerated operations
+- Matrix multiplication performance (BLIS vs Eigen)
+- Vector math functions performance (LibM vs Eigen)
+- Memory bandwidth utilization and cache efficiency
+
+\section TopicUsingAOCL_CMake CMake Integration
+
+When using CMake, a \c FindAOCL module can locate the AOCL libraries and define imported targets:
+
+\code
+find_package(AOCL REQUIRED)
+target_compile_definitions(my_target PRIVATE EIGEN_USE_AOCL_MT)
+target_link_libraries(my_target PRIVATE AOCL::BLIS_MT AOCL::FLAME AOCL::LIBM)
+\endcode
+
+\section TopicUsingAOCL_Troubleshooting Troubleshooting
+
+Common issues and solutions:
+
+- **Link errors**: Ensure \c AOCL_ROOT is set and libraries are in \c LD_LIBRARY_PATH
+- **Performance not improved**: Verify you're using matrices/vectors larger than the threshold
+- **Thread contention**: Set \c OMP_NUM_THREADS to match your CPU core count
+- **Architecture mismatch**: Use appropriate \c -march flag for your AMD processor
+
+\section TopicUsingAOCL_Links Links
+
+- AMD AOCL can be downloaded for free <a href="https://www.amd.com/en/developer/aocl.html">here</a>
+- The AOCL User Guide and documentation are available on the AMD Developer Portal
+- AOCL is also available through package managers and containerized environments
+
+*/
+
+}
diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt
index d837fff..51bb455 100644
--- a/lapack/CMakeLists.txt
+++ b/lapack/CMakeLists.txt
@@ -110,7 +110,9 @@
# Build LAPACK but link BLAS.
target_compile_definitions(eigen_lapack PUBLIC "EIGEN_BLAS_LINK_DLL" "EIGEN_LAPACK_BUILD_DLL")
target_link_libraries(eigen_lapack eigen_blas)
- set_target_properties(eigen_lapack PROPERTIES CXX_VISIBILITY_PRESET hidden)
+ set_target_properties(eigen_lapack PROPERTIES CXX_VISIBILITY_PRESET hidden
+ VERSION ${EIGEN_VERSION_NUMBER}
+ SOVERSION ${EIGEN_MAJOR_VERSION})
list(APPEND EIGEN_LAPACK_TARGETS eigen_lapack)
endif()