| #ifndef GPU_TEST_HELPER_H |
| #define GPU_TEST_HELPER_H |
| |
| #include <Eigen/Core> |
| |
| // Allow gpu** macros for generic tests. |
| #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> |
| |
| // std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that |
| // doesn't allow std::tuple to compile for host code either. In these cases, |
| // use our custom implementation. |
| #if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000) |
| #define EIGEN_USE_CUSTOM_TUPLE 1 |
| #else |
| #define EIGEN_USE_CUSTOM_TUPLE 0 |
| #endif |
| |
| #if EIGEN_USE_CUSTOM_TUPLE |
| #include "../Eigen/src/Core/arch/GPU/Tuple.h" |
| #else |
| #include <tuple> |
| #endif |
| namespace Eigen { |
| |
| namespace internal { |
| |
| // Note: cannot re-use tuple_impl, since that will cause havoc for |
| // tuple_test. |
| namespace test_detail { |
| // Use std::tuple on CPU, otherwise use the GPU-specific versions. |
| #if !EIGEN_USE_CUSTOM_TUPLE |
| using std::tuple; |
| using std::get; |
| using std::make_tuple; |
| using std::tie; |
| #else |
| using tuple_impl::tuple; |
| using tuple_impl::get; |
| using tuple_impl::make_tuple; |
| using tuple_impl::tie; |
| #endif |
| #undef EIGEN_USE_CUSTOM_TUPLE |
| } // namespace test_detail |
| |
| template<size_t N, size_t Idx, typename OutputIndexSequence, typename... Ts> |
| struct extract_output_indices_helper; |
| |
| /** |
| * Extracts a set of indices corresponding to non-const l-value reference |
| * output types. |
| * |
| * \internal |
| * \tparam N the number of types {T1, Ts...}. |
| * \tparam Idx the "index" to append if T1 is an output type. |
| * \tparam OutputIndices the current set of output indices. |
| * \tparam T1 the next type to consider, with index Idx. |
| * \tparam Ts the remaining types. |
| */ |
| template<size_t N, size_t Idx, size_t... OutputIndices, typename T1, typename... Ts> |
| struct extract_output_indices_helper<N, Idx, std::index_sequence<OutputIndices...>, T1, Ts...> { |
| using type = typename |
| extract_output_indices_helper< |
| N - 1, Idx + 1, |
| typename std::conditional< |
| // If is a non-const l-value reference, append index. |
| std::is_lvalue_reference<T1>::value |
| && !std::is_const<std::remove_reference_t<T1>>::value, |
| std::index_sequence<OutputIndices..., Idx>, |
| std::index_sequence<OutputIndices...> >::type, |
| Ts...>::type; |
| }; |
| |
| // Base case. |
| template<size_t Idx, size_t... OutputIndices> |
| struct extract_output_indices_helper<0, Idx, std::index_sequence<OutputIndices...> > { |
| using type = std::index_sequence<OutputIndices...>; |
| }; |
| |
| // Extracts a set of indices into Types... that correspond to non-const |
| // l-value references. |
| template<typename... Types> |
| using extract_output_indices = typename extract_output_indices_helper<sizeof...(Types), 0, std::index_sequence<>, Types...>::type; |
| |
| // Helper struct for dealing with Generic functors that may return void. |
| struct void_helper { |
| struct Void {}; |
| |
| // Converts void -> Void, T otherwise. |
| template<typename T> |
| using ReturnType = typename std::conditional<std::is_same<T, void>::value, Void, T>::type; |
| |
| // Non-void return value. |
| template<typename Func, typename... Args> |
| static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC |
| auto call(Func&& func, Args&&... args) -> |
| std::enable_if_t<!std::is_same<decltype(func(args...)), void>::value, |
| decltype(func(args...))> { |
| return func(std::forward<Args>(args)...); |
| } |
| |
| // Void return value. |
| template<typename Func, typename... Args> |
| static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC |
| auto call(Func&& func, Args&&... args) -> |
| std::enable_if_t<std::is_same<decltype(func(args...)), void>::value, |
| Void> { |
| func(std::forward<Args>(args)...); |
| return Void{}; |
| } |
| |
| // Restores the original return type, Void -> void, T otherwise. |
| template<typename T> |
| static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC |
| std::enable_if_t<!std::is_same<typename std::decay<T>::type, Void>::value, T> |
| restore(T&& val) { |
| return val; |
| } |
| |
| // Void case. |
| template<typename T = void> |
| static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC |
| void restore(const Void&) {} |
| }; |
| |
| // Runs a kernel via serialized buffer. Does this by deserializing the buffer |
| // to construct the arguments, calling the kernel, then re-serialing the outputs. |
| // The buffer contains |
| // [ input_buffer_size, args ] |
| // After the kernel call, it is then populated with |
| // [ output_buffer_size, output_parameters, return_value ] |
| // If the output_buffer_size exceeds the buffer's capacity, then only the |
| // output_buffer_size is populated. |
| template<typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices> |
| EIGEN_DEVICE_FUNC |
| void run_serialized(std::index_sequence<Indices...>, std::index_sequence<OutputIndices...>, |
| Kernel kernel, uint8_t* buffer, size_t capacity) { |
| using test_detail::get; |
| using test_detail::make_tuple; |
| using test_detail::tuple; |
| // Deserialize input size and inputs. |
| size_t input_size; |
| const uint8_t* read_ptr = buffer; |
| const uint8_t* read_end = buffer + capacity; |
| read_ptr = Eigen::deserialize(read_ptr, read_end, input_size); |
| // Create value-type instances to populate. |
| auto args = make_tuple(typename std::decay<Args>::type{}...); |
| EIGEN_UNUSED_VARIABLE(args) // Avoid NVCC compile warning. |
| // NVCC 9.1 requires us to spell out the template parameters explicitly. |
| read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, typename std::decay<Args>::type...>(args)...); |
| |
| // Call function, with void->Void conversion so we are guaranteed a complete |
| // output type. |
| auto result = void_helper::call(kernel, get<Indices, typename std::decay<Args>::type...>(args)...); |
| |
| // Determine required output size. |
| size_t output_size = Eigen::serialize_size(capacity); |
| output_size += Eigen::serialize_size(get<OutputIndices, typename std::decay<Args>::type...>(args)...); |
| output_size += Eigen::serialize_size(result); |
| |
| // Always serialize required buffer size. |
| uint8_t* write_ptr = buffer; |
| uint8_t* write_end = buffer + capacity; |
| write_ptr = Eigen::serialize(write_ptr, write_end, output_size); |
| // Null `write_ptr` can be safely passed along. |
| // Serialize outputs if they fit in the buffer. |
| if (output_size <= capacity) { |
| // Collect outputs and result. |
| write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, typename std::decay<Args>::type...>(args)...); |
| write_ptr = Eigen::serialize(write_ptr, write_end, result); |
| } |
| } |
| |
| template<typename Kernel, typename... Args> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE |
| void run_serialized(Kernel kernel, uint8_t* buffer, size_t capacity) { |
| run_serialized<Kernel, Args...> (std::make_index_sequence<sizeof...(Args)>{}, |
| extract_output_indices<Args...>{}, |
| kernel, buffer, capacity); |
| } |
| |
| #ifdef EIGEN_GPUCC |
| |
| // Checks for GPU errors and asserts / prints the error message. |
| #define GPU_CHECK(expr) \ |
| do { \ |
| gpuError_t err = expr; \ |
| if (err != gpuSuccess) { \ |
| printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err)); \ |
| gpu_assert(false); \ |
| } \ |
| } while(0) |
| |
| // Calls run_serialized on the GPU. |
| template<typename Kernel, typename... Args> |
| __global__ |
| EIGEN_HIP_LAUNCH_BOUNDS_1024 |
| void run_serialized_on_gpu_meta_kernel(const Kernel kernel, uint8_t* buffer, size_t capacity) { |
| run_serialized<Kernel, Args...>(kernel, buffer, capacity); |
| } |
| |
| // Runs kernel(args...) on the GPU via the serialization mechanism. |
| // |
| // Note: this may end up calling the kernel multiple times if the initial output |
| // buffer is not large enough to hold the outputs. |
| template<typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices> |
| auto run_serialized_on_gpu(size_t buffer_capacity_hint, |
| std::index_sequence<Indices...>, |
| std::index_sequence<OutputIndices...>, |
| Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { |
| // Compute the required serialization buffer capacity. |
| // Round up input size to next power of two to give a little extra room |
| // for outputs. |
| size_t input_data_size = sizeof(size_t) + Eigen::serialize_size(args...); |
| |
| size_t capacity; |
| if (buffer_capacity_hint == 0) { |
| // Estimate as the power of two larger than the total input size. |
| capacity = sizeof(size_t); |
| while (capacity <= input_data_size) { |
| capacity *= 2; |
| } |
| } else { |
| // Use the larger of the hint and the total input size. |
| // Add sizeof(size_t) to the hint to account for storing the buffer capacity |
| // itself so the user doesn't need to think about this. |
| capacity = std::max<size_t>(buffer_capacity_hint + sizeof(size_t), |
| input_data_size); |
| } |
| std::vector<uint8_t> buffer(capacity); |
| |
| uint8_t* host_data = nullptr; |
| uint8_t* host_data_end = nullptr; |
| uint8_t* host_ptr = nullptr; |
| uint8_t* device_data = nullptr; |
| size_t output_data_size = 0; |
| |
| // Allocate buffers and copy input data. |
| capacity = std::max<size_t>(capacity, output_data_size); |
| buffer.resize(capacity); |
| host_data = buffer.data(); |
| host_data_end = buffer.data() + capacity; |
| host_ptr = Eigen::serialize(host_data, host_data_end, input_data_size); |
| host_ptr = Eigen::serialize(host_ptr, host_data_end, args...); |
| |
| // Copy inputs to host. |
| gpuMalloc((void**)(&device_data), capacity); |
| gpuMemcpy(device_data, buffer.data(), input_data_size, gpuMemcpyHostToDevice); |
| GPU_CHECK(gpuDeviceSynchronize()); |
| |
| // Run kernel. |
| #ifdef EIGEN_USE_HIP |
| hipLaunchKernelGGL( |
| HIP_KERNEL_NAME(run_serialized_on_gpu_meta_kernel<Kernel, Args...>), |
| 1, 1, 0, 0, kernel, device_data, capacity); |
| #else |
| run_serialized_on_gpu_meta_kernel<Kernel, Args...><<<1,1>>>( |
| kernel, device_data, capacity); |
| #endif |
| // Check pre-launch and kernel execution errors. |
| GPU_CHECK(gpuGetLastError()); |
| GPU_CHECK(gpuDeviceSynchronize()); |
| // Copy back new output to host. |
| gpuMemcpy(host_data, device_data, capacity, gpuMemcpyDeviceToHost); |
| gpuFree(device_data); |
| GPU_CHECK(gpuDeviceSynchronize()); |
| |
| // Determine output buffer size. |
| const uint8_t* c_host_ptr = Eigen::deserialize(host_data, host_data_end, output_data_size); |
| // If the output doesn't fit in the buffer, spit out warning and fail. |
| if (output_data_size > capacity) { |
| std::cerr << "The serialized output does not fit in the output buffer, " |
| << output_data_size << " vs capacity " << capacity << "." |
| << std::endl |
| << "Try specifying a minimum buffer capacity: " << std::endl |
| << " run_with_hint(" << output_data_size << ", ...)" |
| << std::endl; |
| VERIFY(false); |
| } |
| |
| // Deserialize outputs. |
| auto args_tuple = test_detail::tie(args...); |
| EIGEN_UNUSED_VARIABLE(args_tuple) // Avoid NVCC compile warning. |
| c_host_ptr = Eigen::deserialize(c_host_ptr, host_data_end, test_detail::get<OutputIndices, Args&...>(args_tuple)...); |
| |
| // Maybe deserialize return value, properly handling void. |
| typename void_helper::ReturnType<decltype(kernel(args...))> result; |
| c_host_ptr = Eigen::deserialize(c_host_ptr, host_data_end, result); |
| return void_helper::restore(result); |
| } |
| |
| #endif // EIGEN_GPUCC |
| |
| } // namespace internal |
| |
| /** |
| * Runs a kernel on the CPU, returning the results. |
| * \param kernel kernel to run. |
| * \param args ... input arguments. |
| * \return kernel(args...). |
| */ |
| template<typename Kernel, typename... Args> |
| auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ |
| return kernel(std::forward<Args>(args)...); |
| } |
| |
| #ifdef EIGEN_GPUCC |
| |
| /** |
| * Runs a kernel on the GPU, returning the results. |
| * |
| * The kernel must be able to be passed directly as an input to a global |
| * function (i.e. empty or POD). Its inputs must be "Serializable" so we |
| * can transfer them to the device, and the output must be a Serializable value |
| * type so it can be transferred back from the device. |
| * |
| * \param kernel kernel to run. |
| * \param args ... input arguments, must be "Serializable". |
| * \return kernel(args...). |
| */ |
| template<typename Kernel, typename... Args> |
| auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ |
| return internal::run_serialized_on_gpu<Kernel, Args...>( |
| /*buffer_capacity_hint=*/ 0, |
| std::make_index_sequence<sizeof...(Args)>{}, |
| internal::extract_output_indices<Args...>{}, |
| kernel, std::forward<Args>(args)...); |
| } |
| |
| /** |
| * Runs a kernel on the GPU, returning the results. |
| * |
| * This version allows specifying a minimum buffer capacity size required for |
| * serializing the puts to transfer results from device to host. Use this when |
| * `run_on_gpu(...)` fails to determine an appropriate capacity by default. |
| * |
| * \param buffer_capacity_hint minimum required buffer size for serializing |
| * outputs. |
| * \param kernel kernel to run. |
| * \param args ... input arguments, must be "Serializable". |
| * \return kernel(args...). |
| * \sa run_on_gpu |
| */ |
| template<typename Kernel, typename... Args> |
| auto run_on_gpu_with_hint(size_t buffer_capacity_hint, |
| Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ |
| return internal::run_serialized_on_gpu<Kernel, Args...>( |
| buffer_capacity_hint, |
| std::make_index_sequence<sizeof...(Args)>{}, |
| internal::extract_output_indices<Args...>{}, |
| kernel, std::forward<Args>(args)...); |
| } |
| |
| /** |
| * Kernel for determining basic Eigen compile-time information |
| * (i.e. the cuda/hip arch) |
| */ |
| struct CompileTimeDeviceInfoKernel { |
| struct Info { |
| int cuda; |
| int hip; |
| }; |
| |
| EIGEN_DEVICE_FUNC |
| Info operator()() const |
| { |
| Info info = {-1, -1}; |
| #if defined(__CUDA_ARCH__) |
| info.cuda = static_cast<int>(__CUDA_ARCH__ +0); |
| #endif |
| #if defined(EIGEN_HIP_DEVICE_COMPILE) |
| info.hip = static_cast<int>(EIGEN_HIP_DEVICE_COMPILE +0); |
| #endif |
| return info; |
| } |
| }; |
| |
| /** |
| * Queries and prints the compile-time and runtime GPU info. |
| */ |
| void print_gpu_device_info() |
| { |
| int device = 0; |
| gpuDeviceProp_t deviceProp; |
| gpuGetDeviceProperties(&deviceProp, device); |
| |
| auto info = run_on_gpu(CompileTimeDeviceInfoKernel()); |
| |
| std::cout << "GPU compile-time info:\n"; |
| |
| #ifdef EIGEN_CUDACC |
| std::cout << " EIGEN_CUDACC: " << int(EIGEN_CUDACC) << std::endl; |
| #endif |
| |
| #ifdef EIGEN_CUDA_SDK_VER |
| std::cout << " EIGEN_CUDA_SDK_VER: " << int(EIGEN_CUDA_SDK_VER) << std::endl; |
| #endif |
| |
| #if EIGEN_COMP_NVCC |
| std::cout << " EIGEN_COMP_NVCC: " << int(EIGEN_COMP_NVCC) << std::endl; |
| #endif |
| |
| #ifdef EIGEN_HIPCC |
| std::cout << " EIGEN_HIPCC: " << int(EIGEN_HIPCC) << std::endl; |
| #endif |
| |
| std::cout << " EIGEN_CUDA_ARCH: " << info.cuda << std::endl; |
| std::cout << " EIGEN_HIP_DEVICE_COMPILE: " << info.hip << std::endl; |
| |
| std::cout << "GPU device info:\n"; |
| std::cout << " name: " << deviceProp.name << std::endl; |
| std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << std::endl; |
| std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << std::endl; |
| std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl; |
| std::cout << " warpSize: " << deviceProp.warpSize << std::endl; |
| std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << std::endl; |
| std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << std::endl; |
| std::cout << " clockRate: " << deviceProp.clockRate << std::endl; |
| std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << std::endl; |
| std::cout << " computeMode: " << deviceProp.computeMode << std::endl; |
| } |
| |
| #endif // EIGEN_GPUCC |
| |
| /** |
| * Runs a kernel on the GPU (if EIGEN_GPUCC), or CPU otherwise. |
| * |
| * This is to better support creating generic tests. |
| * |
| * The kernel must be able to be passed directly as an input to a global |
| * function (i.e. empty or POD). Its inputs must be "Serializable" so we |
| * can transfer them to the device, and the output must be a Serializable value |
| * type so it can be transferred back from the device. |
| * |
| * \param kernel kernel to run. |
| * \param args ... input arguments, must be "Serializable". |
| * \return kernel(args...). |
| */ |
| template<typename Kernel, typename... Args> |
| auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ |
| #ifdef EIGEN_GPUCC |
| return run_on_gpu(kernel, std::forward<Args>(args)...); |
| #else |
| return run_on_cpu(kernel, std::forward<Args>(args)...); |
| #endif |
| } |
| |
| /** |
| * Runs a kernel on the GPU (if EIGEN_GPUCC), or CPU otherwise. |
| * |
| * This version allows specifying a minimum buffer capacity size required for |
| * serializing the puts to transfer results from device to host. Use this when |
| * `run(...)` fails to determine an appropriate capacity by default. |
| * |
| * \param buffer_capacity_hint minimum required buffer size for serializing |
| * outputs. |
| * \param kernel kernel to run. |
| * \param args ... input arguments, must be "Serializable". |
| * \return kernel(args...). |
| * \sa run |
| */ |
| template<typename Kernel, typename... Args> |
| auto run_with_hint(size_t buffer_capacity_hint, |
| Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ |
| #ifdef EIGEN_GPUCC |
| return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(args)...); |
| #else |
| EIGEN_UNUSED_VARIABLE(buffer_capacity_hint) |
| return run_on_cpu(kernel, std::forward<Args>(args)...); |
| #endif |
| } |
| |
| } // namespace Eigen |
| |
| #endif // GPU_TEST_HELPER_H |