#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_

typedef int TensorIndex;
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
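// Note: 32-bit indices are forced here instead of Eigen's default ptrdiff_t,
// presumably to cheapen index arithmetic in the benchmarked kernels
// (assumption: the original code does not state the rationale).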

#include "unsupported/Eigen/CXX11/Tensor"
#include "benchmark.h"

#define BENCHMARK_RANGE(bench, lo, hi) \
  BENCHMARK(bench)->Range(lo, hi)
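
// Illustrative sketch of how this suite is typically driven from a per-device
// benchmark source file (the function name and exact benchmark.h registration
// style below are assumptions, not part of this header):
//
//   static void BM_memcpy(int iters, int N) {
//     Eigen::DefaultDevice device;
//     BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, N);
//     suite.memcpy(iters);
//   }
//   BENCHMARK_RANGE(BM_memcpy, 10, 5000);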

using Eigen::Tensor;
using Eigen::TensorMap;

// TODO(bsteiner): also templatize on the input type since we have users
// for int8 as well as floats.
template <typename Device, typename T> class BenchmarkSuite {
 public:
  BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
      : m_(m), k_(k), n_(n), device_(device) {
    initialize();
  }

  BenchmarkSuite(const Device& device, size_t m)
      : m_(m), k_(m), n_(m), device_(device) {
    initialize();
  }

  BenchmarkSuite(const Device& device, size_t m, size_t k)
      : m_(1), k_(k), n_(m), device_(device) {
    initialize();
  }

  ~BenchmarkSuite() {
    device_.deallocate(a_);
    device_.deallocate(b_);
    device_.deallocate(c_);
  }

  void memcpy(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
    }
    // Record the number of values copied per second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void typeCasting(int num_iters) {
    eigen_assert(m_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
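    // When T is narrower than int, the same byte buffers hold more T elements
    // than ints, so shrink the map dimensions to keep the int-typed source
    // view (and the T-typed destination) within the m_*k_*sizeof(T)-byte
    // allocations.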
    if (sizeof(T) >= sizeof(int)) {
      sizes[0] = m_;
      sizes[1] = k_;
    } else {
      sizes[0] = m_ * sizeof(T) / sizeof(int);
      sizes[1] = k_ * sizeof(T) / sizeof(int);
    }
    const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
    TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      B.device(device_) = A.template cast<T>();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.template cast<T>();
    }
    // Record the number of values cast per second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void random(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = C.random();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = C.random();
    }
    // Record the number of random numbers generated per second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void slicing(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
    const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
    const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
    const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
    const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.slice(first_quadrant, quarter_sizes).device(device_) =
          A.slice(first_quadrant, quarter_sizes);
      C.slice(second_quadrant, quarter_sizes).device(device_) =
          B.slice(second_quadrant, quarter_sizes);
      C.slice(third_quadrant, quarter_sizes).device(device_) =
          A.slice(third_quadrant, quarter_sizes);
      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
          B.slice(fourth_quadrant, quarter_sizes);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.slice(first_quadrant, quarter_sizes).device(device_) =
          A.slice(first_quadrant, quarter_sizes);
      C.slice(second_quadrant, quarter_sizes).device(device_) =
          B.slice(second_quadrant, quarter_sizes);
      C.slice(third_quadrant, quarter_sizes).device(device_) =
          A.slice(third_quadrant, quarter_sizes);
      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
          B.slice(fourth_quadrant, quarter_sizes);
    }
    // Record the number of values copied from the rhs slice to the lhs slice
    // each second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void rowChip(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = B.chip(iter % k_, 0);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % k_, 0);
    }
    // Record the number of values copied from the rhs chip to the lhs.
    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
  }

  void colChip(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = B.chip(iter % n_, 1);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % n_, 1);
    }
    // Record the number of values copied from the rhs chip to the lhs.
    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
  }

  void shuffling(int num_iters) {
    eigen_assert(m_ == n_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = k_;
    size_b[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

    Eigen::array<int, 2> shuffle;
    shuffle[0] = 1;
    shuffle[1] = 0;
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      B.device(device_) = A.shuffle(shuffle);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.shuffle(shuffle);
    }
    // Record the number of values shuffled from A and copied to B each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void padding(int num_iters) {
    eigen_assert(m_ == k_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_-3;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = k_;
    size_b[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

#if defined(EIGEN_HAS_INDEX_LIST)
    Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
                         Eigen::type2indexpair<2, 1> > paddings;
#else
    Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
    paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
    paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
#endif
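    // With m_ == k_, padding dimension 1 with 2 values before and 1 after
    // grows A from (m_, k_ - 3) to an (m_, k_) result.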
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      B.device(device_) = A.pad(paddings);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.pad(paddings);
    }
    // Record the number of values copied from the padded tensor A each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void striding(int num_iters) {
    eigen_assert(m_ == k_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = m_;
    size_b[1] = k_/2;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 2> strides;
    strides[0] = 1;
    strides[1] = 2;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
#endif
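    // Strides (1, 2) keep every row but only every second column, so the
    // (m_, k_) input maps onto the (m_, k_ / 2) output B.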

#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      B.device(device_) = A.stride(strides);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.stride(strides);
    }
    // Record the number of values copied from the strided tensor A each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void broadcasting(int num_iters) {
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = 1;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_c;
    size_c[0] = m_;
    size_c[1] = n_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<int, 2> broadcast;
    broadcast[0] = 1;
    broadcast[1] = n_;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
    broadcast.set(1, n_);
#endif
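    // A is a single (m_, 1) column; broadcasting by (1, n_) replicates that
    // column n_ times to fill C's (m_, n_) shape.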

#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.broadcast(broadcast);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.broadcast(broadcast);
    }
    // Record the number of values broadcast from A and copied to C each second
    finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
  }

  void coeffWiseOp(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
    }
    // Record the number of FLOP executed per second (2 multiplications and
    // 1 addition per value)
    finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
  }

  void algebraicFunc(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
    }
    // Record the number of FLOP executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void transcendentalFunc(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.exp() + B.log();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.exp() + B.log();
    }
    // Record the number of FLOP executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  // Row reduction
  void rowReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 1> sum_along_dim;
    sum_along_dim[0] = 0;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
    }
    // Record the number of FLOP executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // Column reduction
  void colReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
        b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = k_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> A(
        a_, output_size);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 1> sum_along_dim;
    sum_along_dim[0] = 1;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      A.device(device_) = B.sum(sum_along_dim);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      A.device(device_) = B.sum(sum_along_dim);
    }
    // Record the number of FLOP executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // Full reduction
  void fullReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
        b_, input_size);
    Eigen::array<TensorIndex, 0> output_size;
    TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
        c_, output_size);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = B.sum();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum();
    }
    // Record the number of FLOP executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // do a contraction which is equivalent to a matrix multiplication
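  // (for the non-transposed case, C(i, j) = sum over k of A(i, k) * B(k, j),
  // i.e. 2 * m_ * n_ * k_ FLOP per evaluation)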
  void contraction(int num_iters) {
    contraction<static_cast<int>(Eigen::ColMajor)>(num_iters, false, false);
  }

  void contractionRowMajor(int num_iters) {
    contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, false);
  }

  void contractionRowMajorAT(int num_iters) {
    contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, false);
  }

  void contractionRowMajorBT(int num_iters) {
    contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, true);
  }

  void contractionRowMajorABT(int num_iters) {
    contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, true);
  }

  void convolution(int num_iters, int kernel_x, int kernel_y) {
    Eigen::array<TensorIndex, 2> input_sizes;
    input_sizes[0] = m_;
    input_sizes[1] = n_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
    Eigen::array<TensorIndex, 2> kernel_sizes;
    kernel_sizes[0] = kernel_x;
    kernel_sizes[1] = kernel_y;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
    Eigen::array<TensorIndex, 2> result_sizes;
    result_sizes[0] = m_ - kernel_x + 1;
    result_sizes[1] = n_ - kernel_y + 1;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
    Eigen::array<TensorIndex, 2> dims;
    dims[0] = 0;
    dims[1] = 1;
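    // Convolve along both dimensions; only the "valid" region is produced,
    // hence the (m_ - kernel_x + 1, n_ - kernel_y + 1) result shape.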
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.convolve(B, dims);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.convolve(B, dims);
    }
    // Record the number of FLOPs executed per second (kernel_x * kernel_y
    // multiplications and additions for each value in the resulting tensor)
    finalizeBenchmark(static_cast<int64_t>(2) *
        (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
  }

 private:
  // do a contraction which is equivalent to a matrix multiplication
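  // trans_a / trans_b select which dimension of each operand is contracted,
  // emulating A^T and/or B^T without materializing a transposed copy.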
  template<int Layout>
  void contraction(int num_iters, bool trans_a, bool trans_b) {
    Eigen::array<TensorIndex, 2> sizeA;
    sizeA[0] = (trans_a ? k_ : m_);
    sizeA[1] = (trans_a ? m_ : k_);
    Eigen::array<TensorIndex, 2> sizeB;
    sizeB[0] = (trans_b ? n_ : k_);
    sizeB[1] = (trans_b ? k_ : n_);
    Eigen::array<TensorIndex, 2> sizeC;
    sizeC[0] = m_;
    sizeC[1] = n_;

    const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> A(a_, sizeA);
    const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> B(b_, sizeB);
    TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> C(c_, sizeC);

    typedef typename Tensor<T, 2, Layout>::DimensionPair DimPair;
    Eigen::array<DimPair, 1> dims;
    TensorIndex a_contract_dim = (trans_a ? 0 : 1);
    TensorIndex b_contract_dim = (trans_b ? 1 : 0);
    dims[0] = DimPair(a_contract_dim, b_contract_dim);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.contract(B, dims);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.contract(B, dims);
    }
    // Record the number of FLOP executed per second (k_ multiplications and
    // k_ additions for each of the m_ * n_ values in the resulting tensor)
    finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
  }

  void initialize() {
    a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
    b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
    c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));

    // Initialize the content of the memory pools to prevent asan from
    // complaining.
    device_.memset(a_, 12, m_ * k_ * sizeof(T));
    device_.memset(b_, 23, k_ * n_ * sizeof(T));
    device_.memset(c_, 31, m_ * n_ * sizeof(T));
  }

  inline void finalizeBenchmark(int64_t num_items) {
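    // GPU and SYCL expressions execute asynchronously, so synchronize before
    // stopping the timer; otherwise only the kernel launches would be timed.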
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
    if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
      device_.synchronize();
    }
#elif defined(EIGEN_USE_SYCL)
    if (Eigen::internal::is_same<Device, Eigen::SyclDevice>::value) {
      device_.synchronize();
    }
#endif
    StopBenchmarkTiming();
    SetBenchmarkFlopsProcessed(num_items);
  }

  TensorIndex m_;
  TensorIndex k_;
  TensorIndex n_;
  T* a_;
  T* b_;
  T* c_;
  Device device_;
};

#endif  // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_