// unsupported/benchmarks/Tensor/bench_reduction.cpp (Eigen)

 // Benchmarks for Eigen Tensor reductions (sum and maximum).
 // Covers full and partial reductions, inner vs. outer dimension, and both DefaultDevice and ThreadPoolDevice.

 #define EIGEN_USE_THREADS

 #include <benchmark/benchmark.h>
 #include <unsupported/Eigen/CXX11/Tensor>
 #include <unsupported/Eigen/CXX11/ThreadPool>

 using namespace Eigen;

 // Element type used by every benchmark in this file. Defaults to float
 // unless SCALAR has already been defined before this point.
 #if !defined(SCALAR)
 #define SCALAR float
 #endif

 using Scalar = SCALAR;

 // --- Full reduction (rank-2) ---
 // Times a full reduction of a rank-2 tensor (both dimensions reduced) into a
 // rank-0 result using the reducer type supplied as a template argument.
 //
 // ReduceOp: reducer functor type handed to Tensor::reduce; it is
 //           default-constructed once, before the timed loop.
 // state:    range(0) and range(1) give the two tensor dimensions.
 template <typename ReduceOp>
 static void BM_FullReduction(benchmark::State& state) {
   // state.range() returns a wider integer than int (int64_t in Google
   // Benchmark); make the narrowing explicit instead of implicit.
   const int M = static_cast<int>(state.range(0));
   const int N = static_cast<int>(state.range(1));

   Tensor<Scalar, 2> A(M, N);
   A.setRandom();

   // Loop-invariant arguments are built once outside the timed region, the
   // same way the partial-reduction benchmarks in this file set up their
   // reduce_dims.
   const Eigen::array<int, 2> reduce_dims = {0, 1};
   const ReduceOp reducer = ReduceOp();

   for (auto _ : state) {
     Tensor<Scalar, 0> result = A.reduce(reduce_dims, reducer);
     // Keep the computed value observable so the reduction is not elided.
     benchmark::DoNotOptimize(result.data());
     benchmark::ClobberMemory();
   }
   // Every iteration reads the whole M x N input once.
   state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
 }

 // --- Partial reduction along dim 0 (inner dim, ColMajor) ---
 // Times a partial sum over dimension 0 of a rank-2 tensor, producing a
 // rank-1 result. state.range(0) and state.range(1) give the tensor dimensions.
 static void BM_ReduceInner(benchmark::State& state) {
   const int rows = static_cast<int>(state.range(0));
   const int cols = static_cast<int>(state.range(1));

   Tensor<Scalar, 2> input(rows, cols);
   input.setRandom();

   // Reduce away dimension 0 only.
   Eigen::array<int, 1> dims_to_reduce = {0};

   for (auto _ : state) {
     Tensor<Scalar, 1> reduced = input.sum(dims_to_reduce);
     benchmark::DoNotOptimize(reduced.data());
     benchmark::ClobberMemory();
   }
   // Each iteration consumes the full rows x cols input.
   state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
 }

 // --- Partial reduction along dim 1 (outer dim, ColMajor) ---
 // Times a partial sum over dimension 1 of a rank-2 tensor, producing a
 // rank-1 result. state.range(0) and state.range(1) give the tensor dimensions.
 static void BM_ReduceOuter(benchmark::State& state) {
   const int rows = static_cast<int>(state.range(0));
   const int cols = static_cast<int>(state.range(1));

   Tensor<Scalar, 2> input(rows, cols);
   input.setRandom();

   // Reduce away dimension 1 only.
   Eigen::array<int, 1> dims_to_reduce = {1};

   for (auto _ : state) {
     Tensor<Scalar, 1> reduced = input.sum(dims_to_reduce);
     benchmark::DoNotOptimize(reduced.data());
     benchmark::ClobberMemory();
   }
   // Each iteration consumes the full rows x cols input.
   state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
 }

 // --- Rank-4 partial reduction (batch x channels x H x W), reduce along spatial dims ---
 // Times a sum over the last two dimensions of a rank-4 tensor, leaving a
 // rank-2 result. state.range(0), range(1) and range(2) give the first
 // dimension, the second dimension, and the shared extent of the two
 // trailing dimensions respectively.
 static void BM_ReduceSpatial(benchmark::State& state) {
   const int num_batches = static_cast<int>(state.range(0));
   const int num_channels = static_cast<int>(state.range(1));
   const int side = static_cast<int>(state.range(2));

   // The same extent is used for both trailing dimensions.
   Tensor<Scalar, 4> input(num_batches, num_channels, side, side);
   input.setRandom();

   Eigen::array<int, 2> spatial_dims = {2, 3};

   for (auto _ : state) {
     Tensor<Scalar, 2> reduced = input.sum(spatial_dims);
     benchmark::DoNotOptimize(reduced.data());
     benchmark::ClobberMemory();
   }
   // Each iteration consumes every element of the rank-4 input.
   state.SetBytesProcessed(state.iterations() * num_batches * num_channels * side * side * sizeof(Scalar));
 }

 // --- Full reduction with ThreadPoolDevice ---
 // Times a full sum of a rank-2 tensor evaluated through a ThreadPoolDevice.
 // state.range(0) and range(1) give the tensor dimensions; range(2) is the
 // thread count used for both the pool and the device.
 static void BM_FullReduction_ThreadPool(benchmark::State& state) {
   const int rows = static_cast<int>(state.range(0));
   const int cols = static_cast<int>(state.range(1));
   const int threads = static_cast<int>(state.range(2));

   Tensor<Scalar, 2> input(rows, cols);
   input.setRandom();
   // The destination lives outside the loop and is overwritten each iteration.
   Tensor<Scalar, 0> total;

   ThreadPool pool(threads);
   ThreadPoolDevice device(&pool, threads);

   for (auto _ : state) {
     total.device(device) = input.sum();
     benchmark::DoNotOptimize(total.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
   // Report the thread count as a user counter alongside the timings.
   state.counters["threads"] = threads;
 }

 // --- Maximum reduction (rank-2) ---
 // Times Tensor::maximum() with no dimension argument on a rank-2 tensor,
 // yielding a rank-0 result. state.range(0) and state.range(1) give the
 // tensor dimensions.
 static void BM_MaxReduction(benchmark::State& state) {
   const int rows = static_cast<int>(state.range(0));
   const int cols = static_cast<int>(state.range(1));

   Tensor<Scalar, 2> input(rows, cols);
   input.setRandom();

   for (auto _ : state) {
     Tensor<Scalar, 0> largest = input.maximum();
     benchmark::DoNotOptimize(largest.data());
     benchmark::ClobberMemory();
   }
   // Each iteration consumes the full rows x cols input.
   state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
 }

 // Registers square (size, size) argument pairs for sizes 64, 256 and 1024.
 static void ReductionSizes(::benchmark::Benchmark* b) {
   const int kSquareSizes[] = {64, 256, 1024};
   for (const int dim : kSquareSizes) {
     b->Args({dim, dim});
   }
 }

 // Registers (size, size, threads) argument triples: each square size in
 // {256, 1024} combined with each thread count in {2, 4, 8}, with the thread
 // count varying fastest.
 static void ThreadPoolReductionSizes(::benchmark::Benchmark* b) {
   const int kSquareSizes[] = {256, 1024};
   const int kThreadCounts[] = {2, 4, 8};
   for (const int dim : kSquareSizes) {
     for (const int num_threads : kThreadCounts) {
       b->Args({dim, dim, num_threads});
     }
   }
 }

 // Registers (batch, c, h) argument triples: batch in {1, 8, 32}, c in
 // {64, 128}, h in {16, 32}, with h varying fastest and batch slowest.
 static void SpatialSizes(::benchmark::Benchmark* b) {
   const int kBatchSizes[] = {1, 8, 32};
   const int kChannelCounts[] = {64, 128};
   const int kSideLengths[] = {16, 32};
   for (const int batch_size : kBatchSizes) {
     for (const int channels : kChannelCounts) {
       for (const int side : kSideLengths) {
         b->Args({batch_size, channels, side});
       }
     }
   }
 }

 // Full reductions through the generic BM_FullReduction template, registered
 // under explicit display names.
 BENCHMARK(BM_FullReduction<internal::SumReducer<Scalar>>)
     ->Name("SumReduction")
     ->Apply(ReductionSizes);
 BENCHMARK(BM_FullReduction<internal::MaxReducer<Scalar>>)
     ->Name("MaxReduction_Full")
     ->Apply(ReductionSizes);

 // Remaining rank-2 benchmarks share the square size list.
 BENCHMARK(BM_MaxReduction)
     ->Apply(ReductionSizes);
 BENCHMARK(BM_ReduceInner)
     ->Apply(ReductionSizes);
 BENCHMARK(BM_ReduceOuter)
     ->Apply(ReductionSizes);

 // Rank-4 and thread-pool benchmarks use their own argument generators.
 BENCHMARK(BM_ReduceSpatial)
     ->Apply(SpatialSizes);
 BENCHMARK(BM_FullReduction_ThreadPool)
     ->Apply(ThreadPoolReductionSizes);
	// Benchmarks for Eigen Tensor reductions (sum and maximum).
	// Covers full and partial reductions, inner vs. outer dimension, and both DefaultDevice and ThreadPoolDevice.

	#define EIGEN_USE_THREADS

	#include <benchmark/benchmark.h>
	#include <unsupported/Eigen/CXX11/Tensor>
	#include <unsupported/Eigen/CXX11/ThreadPool>

	using namespace Eigen;

	// Element type used by every benchmark in this file. Defaults to float
	// unless SCALAR has already been defined before this point.
	#if !defined(SCALAR)
	#define SCALAR float
	#endif

	using Scalar = SCALAR;

	// --- Full reduction (rank-2) ---
	// Times a full reduction of a rank-2 tensor (both dimensions reduced) into a
	// rank-0 result using the reducer type supplied as a template argument.
	//
	// ReduceOp: reducer functor type handed to Tensor::reduce; it is
	//           default-constructed once, before the timed loop.
	// state:    range(0) and range(1) give the two tensor dimensions.
	template <typename ReduceOp>
	static void BM_FullReduction(benchmark::State& state) {
	  // state.range() returns a wider integer than int (int64_t in Google
	  // Benchmark); make the narrowing explicit instead of implicit.
	  const int M = static_cast<int>(state.range(0));
	  const int N = static_cast<int>(state.range(1));

	  Tensor<Scalar, 2> A(M, N);
	  A.setRandom();

	  // Loop-invariant arguments are built once outside the timed region, the
	  // same way the partial-reduction benchmarks in this file set up their
	  // reduce_dims.
	  const Eigen::array<int, 2> reduce_dims = {0, 1};
	  const ReduceOp reducer = ReduceOp();

	  for (auto _ : state) {
	    Tensor<Scalar, 0> result = A.reduce(reduce_dims, reducer);
	    // Keep the computed value observable so the reduction is not elided.
	    benchmark::DoNotOptimize(result.data());
	    benchmark::ClobberMemory();
	  }
	  // Every iteration reads the whole M x N input once.
	  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
	}

	// --- Partial reduction along dim 0 (inner dim, ColMajor) ---
	// Times a partial sum over dimension 0 of a rank-2 tensor, producing a
	// rank-1 result. state.range(0) and state.range(1) give the tensor dimensions.
	static void BM_ReduceInner(benchmark::State& state) {
	  const int rows = static_cast<int>(state.range(0));
	  const int cols = static_cast<int>(state.range(1));

	  Tensor<Scalar, 2> input(rows, cols);
	  input.setRandom();

	  // Reduce away dimension 0 only.
	  Eigen::array<int, 1> dims_to_reduce = {0};

	  for (auto _ : state) {
	    Tensor<Scalar, 1> reduced = input.sum(dims_to_reduce);
	    benchmark::DoNotOptimize(reduced.data());
	    benchmark::ClobberMemory();
	  }
	  // Each iteration consumes the full rows x cols input.
	  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
	}

	// --- Partial reduction along dim 1 (outer dim, ColMajor) ---
	// Times a partial sum over dimension 1 of a rank-2 tensor, producing a
	// rank-1 result. state.range(0) and state.range(1) give the tensor dimensions.
	static void BM_ReduceOuter(benchmark::State& state) {
	  const int rows = static_cast<int>(state.range(0));
	  const int cols = static_cast<int>(state.range(1));

	  Tensor<Scalar, 2> input(rows, cols);
	  input.setRandom();

	  // Reduce away dimension 1 only.
	  Eigen::array<int, 1> dims_to_reduce = {1};

	  for (auto _ : state) {
	    Tensor<Scalar, 1> reduced = input.sum(dims_to_reduce);
	    benchmark::DoNotOptimize(reduced.data());
	    benchmark::ClobberMemory();
	  }
	  // Each iteration consumes the full rows x cols input.
	  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
	}

	// --- Rank-4 partial reduction (batch x channels x H x W), reduce along spatial dims ---
	// Times a sum over the last two dimensions of a rank-4 tensor, leaving a
	// rank-2 result. state.range(0), range(1) and range(2) give the first
	// dimension, the second dimension, and the shared extent of the two
	// trailing dimensions respectively.
	static void BM_ReduceSpatial(benchmark::State& state) {
	  const int num_batches = static_cast<int>(state.range(0));
	  const int num_channels = static_cast<int>(state.range(1));
	  const int side = static_cast<int>(state.range(2));

	  // The same extent is used for both trailing dimensions.
	  Tensor<Scalar, 4> input(num_batches, num_channels, side, side);
	  input.setRandom();

	  Eigen::array<int, 2> spatial_dims = {2, 3};

	  for (auto _ : state) {
	    Tensor<Scalar, 2> reduced = input.sum(spatial_dims);
	    benchmark::DoNotOptimize(reduced.data());
	    benchmark::ClobberMemory();
	  }
	  // Each iteration consumes every element of the rank-4 input.
	  state.SetBytesProcessed(state.iterations() * num_batches * num_channels * side * side * sizeof(Scalar));
	}

	// --- Full reduction with ThreadPoolDevice ---
	// Times a full sum of a rank-2 tensor evaluated through a ThreadPoolDevice.
	// state.range(0) and range(1) give the tensor dimensions; range(2) is the
	// thread count used for both the pool and the device.
	static void BM_FullReduction_ThreadPool(benchmark::State& state) {
	  const int rows = static_cast<int>(state.range(0));
	  const int cols = static_cast<int>(state.range(1));
	  const int threads = static_cast<int>(state.range(2));

	  Tensor<Scalar, 2> input(rows, cols);
	  input.setRandom();
	  // The destination lives outside the loop and is overwritten each iteration.
	  Tensor<Scalar, 0> total;

	  ThreadPool pool(threads);
	  ThreadPoolDevice device(&pool, threads);

	  for (auto _ : state) {
	    total.device(device) = input.sum();
	    benchmark::DoNotOptimize(total.data());
	    benchmark::ClobberMemory();
	  }
	  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
	  // Report the thread count as a user counter alongside the timings.
	  state.counters["threads"] = threads;
	}

	// --- Maximum reduction (rank-2) ---
	// Times Tensor::maximum() with no dimension argument on a rank-2 tensor,
	// yielding a rank-0 result. state.range(0) and state.range(1) give the
	// tensor dimensions.
	static void BM_MaxReduction(benchmark::State& state) {
	  const int rows = static_cast<int>(state.range(0));
	  const int cols = static_cast<int>(state.range(1));

	  Tensor<Scalar, 2> input(rows, cols);
	  input.setRandom();

	  for (auto _ : state) {
	    Tensor<Scalar, 0> largest = input.maximum();
	    benchmark::DoNotOptimize(largest.data());
	    benchmark::ClobberMemory();
	  }
	  // Each iteration consumes the full rows x cols input.
	  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar));
	}

	// Registers square (size, size) argument pairs for sizes 64, 256 and 1024.
	static void ReductionSizes(::benchmark::Benchmark* b) {
	  const int kSquareSizes[] = {64, 256, 1024};
	  for (const int dim : kSquareSizes) {
	    b->Args({dim, dim});
	  }
	}

	// Registers (size, size, threads) argument triples: each square size in
	// {256, 1024} combined with each thread count in {2, 4, 8}, with the thread
	// count varying fastest.
	static void ThreadPoolReductionSizes(::benchmark::Benchmark* b) {
	  const int kSquareSizes[] = {256, 1024};
	  const int kThreadCounts[] = {2, 4, 8};
	  for (const int dim : kSquareSizes) {
	    for (const int num_threads : kThreadCounts) {
	      b->Args({dim, dim, num_threads});
	    }
	  }
	}

	// Registers (batch, c, h) argument triples: batch in {1, 8, 32}, c in
	// {64, 128}, h in {16, 32}, with h varying fastest and batch slowest.
	static void SpatialSizes(::benchmark::Benchmark* b) {
	  const int kBatchSizes[] = {1, 8, 32};
	  const int kChannelCounts[] = {64, 128};
	  const int kSideLengths[] = {16, 32};
	  for (const int batch_size : kBatchSizes) {
	    for (const int channels : kChannelCounts) {
	      for (const int side : kSideLengths) {
	        b->Args({batch_size, channels, side});
	      }
	    }
	  }
	}

	// Full reductions through the generic BM_FullReduction template, registered
	// under explicit display names.
	BENCHMARK(BM_FullReduction<internal::SumReducer<Scalar>>)
	    ->Name("SumReduction")
	    ->Apply(ReductionSizes);
	BENCHMARK(BM_FullReduction<internal::MaxReducer<Scalar>>)
	    ->Name("MaxReduction_Full")
	    ->Apply(ReductionSizes);

	// Remaining rank-2 benchmarks share the square size list.
	BENCHMARK(BM_MaxReduction)
	    ->Apply(ReductionSizes);
	BENCHMARK(BM_ReduceInner)
	    ->Apply(ReductionSizes);
	BENCHMARK(BM_ReduceOuter)
	    ->Apply(ReductionSizes);

	// Rank-4 and thread-pool benchmarks use their own argument generators.
	BENCHMARK(BM_ReduceSpatial)
	    ->Apply(SpatialSizes);
	BENCHMARK(BM_FullReduction_ThreadPool)
	    ->Apply(ThreadPoolReductionSizes);