// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set>
#include "./InternalHeaderCheck.h"
namespace Eigen {
namespace TensorSycl {
namespace internal {
/// Cache all the device information needed
struct SyclDeviceInfo {
SyclDeviceInfo(cl::sycl::queue queue)
: local_mem_type(
queue.get_device()
.template get_info<cl::sycl::info::device::local_mem_type>()),
max_work_item_sizes(
queue.get_device()
.template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
max_mem_alloc_size(
queue.get_device()
.template get_info<
cl::sycl::info::device::max_mem_alloc_size>()),
max_compute_units(queue.get_device()
.template get_info<
cl::sycl::info::device::max_compute_units>()),
max_work_group_size(
queue.get_device()
.template get_info<
cl::sycl::info::device::max_work_group_size>()),
local_mem_size(
queue.get_device()
.template get_info<cl::sycl::info::device::local_mem_size>()),
platform_name(queue.get_device()
.get_platform()
.template get_info<cl::sycl::info::platform::name>()),
device_name(queue.get_device()
.template get_info<cl::sycl::info::device::name>()),
device_vendor(
queue.get_device()
.template get_info<cl::sycl::info::device::vendor>()) {}
cl::sycl::info::local_mem_type local_mem_type;
cl::sycl::id<3> max_work_item_sizes;
unsigned long max_mem_alloc_size;
unsigned long max_compute_units;
unsigned long max_work_group_size;
size_t local_mem_size;
std::string platform_name;
std::string device_name;
std::string device_vendor;
};
} // end namespace internal
} // end namespace TensorSycl
typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
// All devices (even an AMD CPU with the Intel OpenCL runtime) that support
// OpenCL and can consume SPIR or SPIR-V can use the Eigen SYCL backend, and
// consequently TensorFlow via the Eigen SYCL backend.
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
-> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
return {cl::sycl::device(cl::sycl::default_selector())};
#else
std::vector<cl::sycl::device> supported_devices;
auto platform_list = cl::sycl::platform::get_platforms();
for (const auto &platform : platform_list) {
auto device_list = platform.get_devices();
auto platform_name =
platform.template get_info<cl::sycl::info::platform::name>();
std::transform(platform_name.begin(), platform_name.end(),
platform_name.begin(), ::tolower);
for (const auto &device : device_list) {
auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
bool unsupported_condition =
(device.is_cpu() && platform_name.find("amd") != std::string::npos &&
vendor.find("apu") == std::string::npos) ||
(platform_name.find("experimental") != std::string::npos) ||
device.is_host();
if (!unsupported_condition) {
supported_devices.push_back(device);
}
}
}
return supported_devices;
#endif
}
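// Example (illustrative sketch, not part of the API): enumerate the supported
// devices and build a QueueInterface (defined below) from the first one.
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   if (!devices.empty()) {
//     Eigen::QueueInterface queue_interface(devices[0]);
//   }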
class QueueInterface {
public:
/// Create the device from a cl::sycl::selector or a cl::sycl::device.
template <typename DeviceOrSelector>
explicit QueueInterface(
const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
unsigned num_threads = std::thread::hardware_concurrency())
: m_queue(dev_or_sel, handler),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
m_thread_pool(num_threads),
m_device_info(m_queue) {
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
m_prog.build_with_kernel_type<DeviceOrSelector>();
auto f = [&](cl::sycl::handler &cgh) {
cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
[=]() {});
};
EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
#endif
}
template <typename DeviceOrSelector>
explicit QueueInterface(
const DeviceOrSelector &dev_or_sel,
unsigned num_threads = std::thread::hardware_concurrency())
: QueueInterface(dev_or_sel,
[this](cl::sycl::exception_list l) {
this->exception_caught_ = this->sycl_async_handler(l);
},
num_threads) {}
explicit QueueInterface(
const cl::sycl::queue& q, unsigned num_threads = std::thread::hardware_concurrency())
: m_queue(q),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
m_thread_pool(num_threads),
m_device_info(m_queue) {}
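// Example (sketch): a QueueInterface can also wrap a SYCL queue that was
// created elsewhere in the application.
//
//   cl::sycl::queue q(cl::sycl::default_selector());
//   Eigen::QueueInterface queue_interface(q);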
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
#endif
/// Attach an existing buffer to the pointer map; Eigen will not reuse it.
EIGEN_STRONG_INLINE void *attach_buffer(
cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return static_cast<void *>(pMapper.add_pointer(buf));
}
/// Detach previously attached buffer
EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
TensorSycl::internal::SYCLfree<false>(p, pMapper);
}
/// Allocate a device pointer. The returned pointer is actually an 8-byte host
/// pointer used as a key to access the SYCL device buffer: a device buffer
/// cannot be used directly as m_data in Eigen leaf-node expressions, so we
/// create a key pointer to be used during Eigen expression construction.
/// When the Eigen expression is converted into its SYCL counterpart, this
/// pointer is used as a key into our buffer_map, and we make sure that exactly
/// one buffer is dedicated to it. The device pointer is released by calling
/// the deallocate function.
EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
if (align > 0) {
num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
}
#endif
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
}
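// Example (sketch, hypothetical names): the returned pointer is only a key
// into the buffer map, so it must not be dereferenced on the host; it is
// passed back to deallocate() (or converted via get()) instead.
//
//   void* vptr = queue_interface.allocate(1024 * sizeof(float));
//   // ... use vptr as the data pointer of a device-side expression ...
//   queue_interface.deallocate(vptr);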
EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
if (align > 0) {
num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
}
#endif
std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
if (scratch_buffers.empty()) {
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
} else {
for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
auto buff = pMapper.get_buffer(*it);
if (buff.get_size() >= num_bytes) {
auto ptr = *it;
scratch_buffers.erase(it);
return ptr;
} else {
++it;
}
}
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
}
#else
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
#endif
}
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write, data_t>
get(data_t *data) const {
return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
}
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
data_t>
data) const {
return static_cast<data_t *>(data.get_virtual_pointer());
}
EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
scratch_buffers.insert(p);
#else
TensorSycl::internal::SYCLfree(p, pMapper);
#endif
}
template <cl::sycl::access::mode AcMd, typename T>
EIGEN_STRONG_INLINE void deallocate_temp(
const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
deallocate_temp(p.get_virtual_pointer());
}
/// This is used to deallocate the device pointer. p is used as a key inside
/// the map to find the device buffer and delete it.
EIGEN_STRONG_INLINE void deallocate(void *p) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
TensorSycl::internal::SYCLfree(p, pMapper);
}
EIGEN_STRONG_INLINE void deallocate_all() const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
TensorSycl::internal::SYCLfreeAll(pMapper);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
scratch_buffers.clear();
#endif
}
/// memcpyHostToDevice copies data from the host to the device.
/// The destination pointer could be deleted before the copy happens, which is
/// why a callback function is needed. If no callback is provided, the function
/// blocks until the copy has completed.
EIGEN_STRONG_INLINE void memcpyHostToDevice(
void *dst, const void *src, size_t n,
std::function<void()> callback) const {
static const auto write_mode = cl::sycl::access::mode::discard_write;
static const auto global_access = cl::sycl::access::target::global_buffer;
typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
write_accessor;
if (n == 0) {
if (callback) callback();
return;
}
n /= sizeof(buffer_scalar_t);
auto f = [&](cl::sycl::handler &cgh) {
write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
auto non_deleter = [](buffer_scalar_t const *) {};
std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
cgh.copy(s_ptr, dst_acc);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
synchronize_and_callback(e, callback);
}
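// Example (sketch, hypothetical names): asynchronously copy host data to a
// device pointer obtained from allocate(); the callback fires once the copy
// has completed on the queue.
//
//   std::vector<float> host_data(n, 1.0f);
//   queue_interface.memcpyHostToDevice(dst, host_data.data(),
//                                      n * sizeof(float),
//                                      []() { /* copy finished */ });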
/// memcpyDeviceToHost copies data from the device to the host.
/// The source pointer could be deleted before the copy happens, which is
/// why a callback function is needed. If no callback is provided, the function
/// blocks until the copy has completed.
EIGEN_STRONG_INLINE void memcpyDeviceToHost(
void *dst, const void *src, size_t n,
std::function<void()> callback) const {
static const auto read_mode = cl::sycl::access::mode::read;
static const auto global_access = cl::sycl::access::target::global_buffer;
typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
read_accessor;
if (n == 0) {
if (callback) callback();
return;
}
n /= sizeof(buffer_scalar_t);
auto f = [&](cl::sycl::handler &cgh) {
read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
auto non_deleter = [](buffer_scalar_t *) {};
std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
cgh.copy(src_acc, s_ptr);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
synchronize_and_callback(e, callback);
}
/// The memcpy function.
/// No callback is required here as both arguments are on the device
/// and SYCL can handle the dependency.
EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
static const auto read_mode = cl::sycl::access::mode::read;
static const auto write_mode = cl::sycl::access::mode::discard_write;
if (n == 0) {
return;
}
n /= sizeof(buffer_scalar_t);
auto f = [&](cl::sycl::handler &cgh) {
auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
cgh.copy(src_acc, dst_acc);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
async_synchronize(e);
}
/// the memset function.
/// No callback is required here as both arguments are on the device
/// and SYCL can handle the dependency.
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
static const auto write_mode = cl::sycl::access::mode::discard_write;
if (n == 0) {
return;
}
auto f = [&](cl::sycl::handler &cgh) {
// Get a typed range accessor to ensure we fill each byte, in case
// `buffer_scalar_t` is not (u)int8_t.
auto dst_acc = get_typed_range_accessor<write_mode, uint8_t>(cgh, data, n);
cgh.fill(dst_acc, static_cast<uint8_t>(c));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
async_synchronize(e);
}
template<typename T>
EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
static const auto write_mode = cl::sycl::access::mode::discard_write;
if (begin == end) {
return;
}
const ptrdiff_t count = end - begin;
auto f = [&](cl::sycl::handler &cgh) {
auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
cgh.fill(dst_acc, value);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
async_synchronize(e);
}
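// Example (sketch, hypothetical names): fill 100 floats of a device
// allocation with 1.0f. gpu_ptr is a virtual device pointer obtained from
// allocate() and cast to float*; pointer arithmetic on it only adjusts the
// key used to look up the underlying buffer and offset.
//
//   float* gpu_ptr = static_cast<float*>(allocate(100 * sizeof(float)));
//   fill(gpu_ptr, gpu_ptr + 100, 1.0f);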
/// Get a range accessor to the virtual pointer's device memory. This range
/// accessor will allow access to the memory from the pointer to the end of
/// the buffer.
///
/// NOTE: Inside a kernel the range accessor will always be indexed from the
/// start of the buffer, so the offset in the accessor is only used by
/// methods like handler::copy and will not be available inside a kernel.
template <cl::sycl::access::mode AcMd, typename T>
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
get_range_accessor(const void *ptr) const {
static const auto global_access = cl::sycl::access::target::global_buffer;
static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
std::lock_guard<std::mutex> lock(pmapper_mutex_);
auto original_buffer = pMapper.get_buffer(ptr);
const ptrdiff_t offset = pMapper.get_offset(ptr);
eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
eigen_assert(original_buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
const ptrdiff_t typed_offset = offset / sizeof(T);
eigen_assert(typed_offset >= 0);
const auto typed_size = original_buffer.get_size() / sizeof(T);
auto buffer = original_buffer.template reinterpret<
std::remove_const_t<T>>(
cl::sycl::range<1>(typed_size));
const ptrdiff_t size = buffer.get_count() - typed_offset;
eigen_assert(size >= 0);
typedef cl::sycl::accessor<std::remove_const_t<T>,
1, AcMd, global_access, is_place_holder>
placeholder_accessor_t;
const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
cl::sycl::id<1>(typed_offset)),
static_cast<size_t>(typed_offset),
reinterpret_cast<std::intptr_t>(start_ptr));
}
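// Illustration (assuming sizeof(T) == 4, e.g. T = float): for a virtual
// pointer that sits 16 bytes into a 64-byte buffer, offset = 16, so
// typed_offset = 4 and the accessor spans the remaining 12 elements of the
// 16-element reinterpreted buffer. The offset is honoured by operations such
// as handler::copy, but inside a kernel the accessor is still indexed from
// element 0 of the buffer, as noted above.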
/// Get a range accessor to the virtual pointer's device memory with a
/// specified size.
template <cl::sycl::access::mode AcMd, typename Index>
EIGEN_STRONG_INLINE cl::sycl::accessor<
buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
const Index n_bytes) const {
static const auto global_access = cl::sycl::access::target::global_buffer;
eigen_assert(n_bytes >= 0);
std::lock_guard<std::mutex> lock(pmapper_mutex_);
auto buffer = pMapper.get_buffer(ptr);
const ptrdiff_t offset = pMapper.get_offset(ptr);
eigen_assert(offset >= 0);
eigen_assert(offset + n_bytes <= buffer.get_size());
return buffer.template get_access<AcMd, global_access>(
cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
}
/// Get a range accessor to the virtual pointer's device memory with a
/// specified type and count.
template <cl::sycl::access::mode AcMd, typename T, typename Index>
EIGEN_STRONG_INLINE cl::sycl::accessor<
T, 1, AcMd, cl::sycl::access::target::global_buffer>
get_typed_range_accessor(cl::sycl::handler &cgh, const void *ptr,
const Index count) const {
static const auto global_access = cl::sycl::access::target::global_buffer;
eigen_assert(count >= 0);
std::lock_guard<std::mutex> lock(pmapper_mutex_);
auto buffer = pMapper.get_buffer(ptr);
const ptrdiff_t offset = pMapper.get_offset(ptr);
eigen_assert(offset >= 0);
// Technically we should create a subbuffer for the desired range,
// then reinterpret that. However, I was not able to get changes to reflect
// in the original buffer (only the subbuffer and reinterpreted buffer).
// This current implementation now has the restriction that the buffer
// offset and original buffer size must be a multiple of sizeof(T).
// Note that get_range_accessor(void*) currently has the same restriction.
//
// auto subbuffer = cl::sycl::buffer<buffer_scalar_t, 1>(buffer,
// cl::sycl::id<1>(offset), cl::sycl::range<1>(n_bytes));
eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
eigen_assert(buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
const ptrdiff_t typed_offset = offset / sizeof(T);
const size_t typed_size = buffer.get_size() / sizeof(T);
auto reint = buffer.template reinterpret<
std::remove_const_t<T>>(
cl::sycl::range<1>(typed_size));
return reint.template get_access<AcMd, global_access>(
cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(typed_offset));
}
/// Creation of a SYCL accessor for a buffer. This function first tries to find
/// the buffer in the buffer_map. If found, it gets the accessor from it;
/// otherwise, the function adds an entry by creating a SYCL buffer for that
/// particular pointer.
template <cl::sycl::access::mode AcMd>
EIGEN_STRONG_INLINE cl::sycl::accessor<
buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return pMapper.get_buffer(ptr)
.template get_access<AcMd, cl::sycl::access::target::global_buffer>(
cgh);
}
EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
const void *ptr) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return pMapper.get_buffer(ptr);
}
EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return pMapper.get_offset(ptr);
}
template <typename OutScalar, typename sycl_kernel, typename Lhs,
typename Rhs, typename OutPtr, typename Range, typename Index,
typename... T>
EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
const Rhs &rhs, OutPtr outptr,
Range thread_range,
Index scratchSize,
T... var) const {
auto kernel_functor = [=](cl::sycl::handler &cgh) {
// binding the placeholder accessors to a command group handler
lhs.bind(cgh);
rhs.bind(cgh);
outptr.bind(cgh);
typedef cl::sycl::accessor<OutScalar, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
program().template get_kernel<sycl_kernel>(),
#endif
thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
async_synchronize(e);
}
template <typename OutScalar, typename sycl_kernel, typename InPtr,
typename OutPtr, typename Range, typename Index, typename... T>
EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
OutPtr &outptr,
Range thread_range,
Index scratchSize,
T... var) const {
auto kernel_functor = [=](cl::sycl::handler &cgh) {
// binding the placeholder accessors to a command group handler
inptr.bind(cgh);
outptr.bind(cgh);
typedef cl::sycl::accessor<OutScalar, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
program().template get_kernel<sycl_kernel>(),
#endif
thread_range, sycl_kernel(scratch, inptr, outptr, var...));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
async_synchronize(e);
}
template <typename OutScalar, typename sycl_kernel, typename InPtr,
typename Range, typename Index, typename... T>
EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
Range thread_range,
Index scratchSize,
T... var) const {
auto kernel_functor = [=](cl::sycl::handler &cgh) {
// binding the placeholder accessors to a command group handler
inptr.bind(cgh);
typedef cl::sycl::accessor<OutScalar, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
program().template get_kernel<sycl_kernel>(),
#endif
thread_range, sycl_kernel(scratch, inptr, var...));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
async_synchronize(e);
}
EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
m_queue.wait_and_throw();
#else
m_queue.wait();
#endif
}
EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
synchronize();
#endif
}
template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
Index &rng, Index &GRange) const {
tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
EIGEN_SYCL_LOCAL_THREAD_DIM1),
static_cast<Index>(tileSize));
rng = n;
if (rng == 0) rng = static_cast<Index>(1);
GRange = rng;
if (tileSize > GRange)
tileSize = GRange;
else if (GRange > tileSize) {
Index xMode = static_cast<Index>(GRange % tileSize);
if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
}
}
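// Example (illustrative numbers, assuming the default values of
// EIGEN_SYCL_LOCAL_THREAD_DIM0/1): with a nearest power-of-two work-group
// size of 256 and n = 1000, this yields tileSize = 256, rng = 1000 and
// GRange = 1024, i.e. the global range is rounded up to the next multiple of
// the tile size.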
/// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels
template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(
const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
cl::sycl::range<2> &local_range) const {
std::array<Index, 2> input_range = input_dim;
Index max_workgroup_Size =
static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
max_workgroup_Size =
std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
EIGEN_SYCL_LOCAL_THREAD_DIM1),
static_cast<Index>(max_workgroup_Size));
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
local_range[1] =
static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
input_range[1] = input_dim[1];
if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
global_range[1] = input_range[1];
if (local_range[1] > global_range[1])
local_range[1] = global_range[1];
else if (global_range[1] > local_range[1]) {
Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
if (xMode != 0)
global_range[1] += static_cast<Index>(local_range[1] - xMode);
}
local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
input_range[0] = input_dim[0];
if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
global_range[0] = input_range[0];
if (local_range[0] > global_range[0])
local_range[0] = global_range[0];
else if (global_range[0] > local_range[0]) {
Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
if (xMode != 0)
global_range[0] += static_cast<Index>(local_range[0] - xMode);
}
}
/// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels
template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(
const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
cl::sycl::range<3> &local_range) const {
std::array<Index, 3> input_range = input_dim;
Index max_workgroup_Size =
static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
max_workgroup_Size =
std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
EIGEN_SYCL_LOCAL_THREAD_DIM1),
static_cast<Index>(max_workgroup_Size));
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
local_range[2] =
static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
input_range[2] = input_dim[2];
if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
global_range[2] = input_range[2];
if (local_range[2] > global_range[2])
local_range[2] = global_range[2];
else if (global_range[2] > local_range[2]) {
Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
if (xMode != 0)
global_range[2] += static_cast<Index>(local_range[2] - xMode);
}
pow_of_2 = static_cast<Index>(
std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
local_range[1] =
static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
input_range[1] = input_dim[1];
if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
global_range[1] = input_range[1];
if (local_range[1] > global_range[1])
local_range[1] = global_range[1];
else if (global_range[1] > local_range[1]) {
Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
if (xMode != 0)
global_range[1] += static_cast<Index>(local_range[1] - xMode);
}
local_range[0] = static_cast<Index>(max_workgroup_Size /
(local_range[1] * local_range[2]));
input_range[0] = input_dim[0];
if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
global_range[0] = input_range[0];
if (local_range[0] > global_range[0])
local_range[0] = global_range[0];
else if (global_range[0] > local_range[0]) {
Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
if (xMode != 0)
global_range[0] += static_cast<Index>(local_range[0] - xMode);
}
}
EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
return true;
#else
return m_device_info.local_mem_type ==
cl::sycl::info::local_mem_type::local;
#endif
}
EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
return m_device_info.max_mem_alloc_size;
}
EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
return m_device_info.max_compute_units;
}
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
return m_device_info.max_work_group_size;
}
EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
return m_device_info.max_work_item_sizes;
}
/// Not needed for SYCL; it should behave the same as the CPU version.
EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
// OpenCL does not have such a concept
return 2;
}
EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
return m_device_info.local_mem_size;
}
// This function returns the nearest power-of-two work-group size that is <=
// the maximum device work-group size.
EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
return getPowerOfTwo(m_device_info.max_work_group_size, false);
}
EIGEN_STRONG_INLINE std::string getPlatformName() const {
return m_device_info.platform_name;
}
EIGEN_STRONG_INLINE std::string getDeviceName() const {
return m_device_info.device_name;
}
EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
return m_device_info.device_vendor;
}
// This function returns the nearest power of 2:
// if roundUp is true, the result is >= wGSize,
// otherwise the result is <= wGSize.
EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
if (roundUp) --wGSize;
wGSize |= (wGSize >> 1);
wGSize |= (wGSize >> 2);
wGSize |= (wGSize >> 4);
wGSize |= (wGSize >> 8);
wGSize |= (wGSize >> 16);
#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
wGSize |= (wGSize >> 32);
#endif
return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
}
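// For illustration: getPowerOfTwo(1000, false) returns 512 (the largest power
// of two <= 1000), while getPowerOfTwo(1000, true) returns 1024 (the smallest
// power of two >= 1000).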
EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
// This function checks if the runtime recorded an error for the
// underlying stream device.
EIGEN_STRONG_INLINE bool ok() const {
if (!exception_caught_) {
synchronize();
}
return !exception_caught_;
}
EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
std::lock_guard<std::mutex> lock(event_mutex_);
return latest_events_[std::this_thread::get_id()];
#else
eigen_assert(false);
return cl::sycl::event();
#endif
}
// destructor
~QueueInterface() {
pMapper.clear();
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
scratch_buffers.clear();
#endif
}
protected:
EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
std::lock_guard<std::mutex> lock(event_mutex_);
latest_events_[std::this_thread::get_id()] = e;
#else
EIGEN_UNUSED_VARIABLE(e);
#endif
}
void synchronize_and_callback(cl::sycl::event e,
const std::function<void()> &callback) const {
set_latest_event(e);
if (callback) {
auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
cl::sycl::event(e).wait_and_throw();
#else
cl::sycl::event(e).wait();
#endif
callback();
};
m_thread_pool.Schedule(std::move(callback_));
} else {
#ifdef EIGEN_EXCEPTIONS
m_queue.wait_and_throw();
#else
m_queue.wait();
#endif
}
}
bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
bool exception_caught = false;
for (const auto &e : exceptions) {
if (e) {
exception_caught = true;
EIGEN_THROW_X(e);
}
}
return exception_caught;
}
/// class members:
bool exception_caught_ = false;
mutable std::mutex pmapper_mutex_;
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
mutable std::mutex event_mutex_;
mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
#endif
/// A std::map is used to make sure that we create only one buffer per
/// pointer. The lifespan of the buffer now depends on the lifespan of
/// SyclDevice. If a non-read-only pointer needs to be accessed on the host,
/// we should manually deallocate it.
mutable TensorSycl::internal::PointerMapper pMapper;
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
mutable std::unordered_set<void *> scratch_buffers;
#endif
/// sycl queue
mutable cl::sycl::queue m_queue;
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
mutable cl::sycl::program m_prog;
#endif
/// The thread pool is used to wait on events and call callbacks
/// asynchronously
mutable Eigen::ThreadPool m_thread_pool;
const TensorSycl::internal::SyclDeviceInfo m_device_info;
};
struct SyclDeviceBase {
/// QueueInterface is not owned. It is the caller's responsibility to destroy
/// it.
const QueueInterface *m_queue_stream;
explicit SyclDeviceBase(const QueueInterface *queue_stream)
: m_queue_stream(queue_stream) {}
EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
return m_queue_stream;
}
};
// Here is the SYCL device struct, which accepts the SYCL queue interface
// as an input.
struct SyclDevice : public SyclDeviceBase {
explicit SyclDevice(const QueueInterface *queue_stream)
: SyclDeviceBase(queue_stream) {}
// this is the accessor used to construct the evaluator
template <cl::sycl::access::mode AcMd, typename T>
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
get_range_accessor(const void *ptr) const {
return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
}
// get sycl accessor
template <cl::sycl::access::mode AcMd>
EIGEN_STRONG_INLINE cl::sycl::accessor<
buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
}
/// Accessing the created sycl device buffer for the device pointer
EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
const void *ptr) const {
return queue_stream()->get_sycl_buffer(ptr);
}
/// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels
template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
Index &rng, Index &GRange) const {
queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
}
/// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels
template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(
const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
cl::sycl::range<2> &local_range) const {
queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
}
/// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels
template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(
const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
cl::sycl::range<3> &local_range) const {
queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
}
/// allocate device memory
EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
return queue_stream()->allocate(num_bytes);
}
EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
return queue_stream()->allocate_temp(num_bytes);
}
/// deallocate device memory
EIGEN_STRONG_INLINE void deallocate(void *p) const {
queue_stream()->deallocate(p);
}
EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
queue_stream()->deallocate_temp(buffer);
}
template <cl::sycl::access::mode AcMd, typename T>
EIGEN_STRONG_INLINE void deallocate_temp(
const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
queue_stream()->deallocate_temp(buffer);
}
EIGEN_STRONG_INLINE void deallocate_all() const {
queue_stream()->deallocate_all();
}
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write, data_t>
get(data_t *data) const {
return queue_stream()->get(data);
}
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
data_t>
data) const {
return queue_stream()->get(data);
}
/// attach existing buffer
EIGEN_STRONG_INLINE void *attach_buffer(
cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
return queue_stream()->attach_buffer(buf);
}
/// detach buffer
EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
queue_stream()->detach_buffer(p);
}
EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
return queue_stream()->get_offset(ptr);
}
// Some runtime conditions could be checked here; currently all devices are
// considered suitable.
EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
/// memcpyHostToDevice
template <typename Index>
EIGEN_STRONG_INLINE void memcpyHostToDevice(
Index *dst, const Index *src, size_t n,
std::function<void()> callback = {}) const {
queue_stream()->memcpyHostToDevice(dst, src, n, callback);
}
/// memcpyDeviceToHost
template <typename Index>
EIGEN_STRONG_INLINE void memcpyDeviceToHost(
void *dst, const Index *src, size_t n,
std::function<void()> callback = {}) const {
queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
}
/// the memcpy function
template <typename Index>
EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
queue_stream()->memcpy(dst, src, n);
}
/// the memset function
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
queue_stream()->memset(data, c, n);
}
/// the fill function
template<typename T>
EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
queue_stream()->fill(begin, end, value);
}
/// returning the sycl queue
EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
return queue_stream()->sycl_queue();
}
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
EIGEN_STRONG_INLINE cl::sycl::program &program() const {
return queue_stream()->program();
}
#endif
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// We won't try to take advantage of the l2 cache for the time being, and
// there is no l3 cache on sycl devices.
return firstLevelCacheSize();
}
EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
return queue_stream()->getNumSyclMultiProcessors();
}
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
return queue_stream()->maxSyclThreadsPerBlock();
}
EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
return queue_stream()->maxWorkItemSizes();
}
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
// OpenCL does not have such a concept
return queue_stream()->maxSyclThreadsPerMultiProcessor();
}
EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
return queue_stream()->sharedMemPerBlock();
}
EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
}
EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
return queue_stream()->getPowerOfTwo(val, roundUp);
}
/// Not needed for SYCL; it should behave the same as the CPU version.
EIGEN_STRONG_INLINE int majorDeviceVersion() const {
return queue_stream()->majorDeviceVersion();
}
EIGEN_STRONG_INLINE void synchronize() const {
queue_stream()->synchronize();
}
EIGEN_STRONG_INLINE void async_synchronize(
cl::sycl::event e = cl::sycl::event()) const {
queue_stream()->async_synchronize(e);
}
EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
return queue_stream()->get_latest_event();
}
// This function checks if the runtime recorded an error for the
// underlying stream device.
EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
EIGEN_STRONG_INLINE bool has_local_memory() const {
return queue_stream()->has_local_memory();
}
EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
return queue_stream()->max_buffer_size();
}
EIGEN_STRONG_INLINE std::string getPlatformName() const {
return queue_stream()->getPlatformName();
}
EIGEN_STRONG_INLINE std::string getDeviceName() const {
return queue_stream()->getDeviceName();
}
EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
return queue_stream()->getDeviceVendor();
}
template <typename OutScalar, typename KernelType, typename... T>
EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
var...);
}
template <typename OutScalar, typename KernelType, typename... T>
EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
var...);
}
template <typename OutScalar, typename KernelType, typename... T>
EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
var...);
}
};
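// Example (sketch of typical usage with the Tensor module; the tensor and
// pointer names below are hypothetical):
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   Eigen::QueueInterface queue_interface(devices[0]);
//   Eigen::SyclDevice sycl_device(&queue_interface);
//
//   Eigen::Tensor<float, 1> in1(100), in2(100), out(100);
//   in1.setRandom();
//   in2.setRandom();
//
//   float* gpu_in1 = static_cast<float*>(
//       sycl_device.allocate(in1.size() * sizeof(float)));
//   float* gpu_in2 = static_cast<float*>(
//       sycl_device.allocate(in2.size() * sizeof(float)));
//   float* gpu_out = static_cast<float*>(
//       sycl_device.allocate(out.size() * sizeof(float)));
//
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> t_in1(gpu_in1, 100);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> t_in2(gpu_in2, 100);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> t_out(gpu_out, 100);
//
//   sycl_device.memcpyHostToDevice(gpu_in1, in1.data(),
//                                  in1.size() * sizeof(float));
//   sycl_device.memcpyHostToDevice(gpu_in2, in2.data(),
//                                  in2.size() * sizeof(float));
//   t_out.device(sycl_device) = t_in1 + t_in2;
//   sycl_device.memcpyDeviceToHost(out.data(), gpu_out,
//                                  out.size() * sizeof(float));
//
//   sycl_device.deallocate(gpu_in1);
//   sycl_device.deallocate(gpu_in2);
//   sycl_device.deallocate(gpu_out);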
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H