// bench/benchmark-blocking-sizes.cpp - eigen - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

 #include <algorithm>
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
 #include <iostream>
 #include <memory>
 #include <random>
 #include <string>
 #include <vector>

 // Runtime knobs exposed to Eigen through the EIGEN_TEST_SPECIFIC_BLOCKING_SIZE*
 // macros below. benchmark_t::run() sets them before each measurement: the flag
 // selects between Eigen's default blocking and the explicit k/m/n block sizes.
 bool eigen_use_specific_block_size;
 int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
 // NOTE(review): presumably these macros must be defined before <Eigen/Core> is
 // included so that Eigen's blocking code picks them up - confirm against Eigen.
 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
 #include <Eigen/Core>

 #include <bench/BenchTimer.h>

 using namespace Eigen;
 using namespace std;

 static BenchTimer timer;

 // how many times we repeat each measurement.
 // measurements are randomly shuffled - we're not doing
 // all N identical measurements in a row.
 const int measurement_repetitions = 3;

 // Timings below this value are too short to be accurate,
 // we'll repeat measurements with more iterations until
 // we get a timing above that threshold.
 const float min_accurate_time = 1e-2f;

 // See --min-working-set-size command line parameter.
 size_t min_working_set_size = 0;

 float max_clock_speed = 0.0f;

 // range of sizes that we will benchmark (in all 3 K,M,N dimensions)
 const size_t maxsize = 2048;
 const size_t minsize = 16;

 typedef MatrixXf MatrixType;
 typedef MatrixType::Scalar Scalar;
 typedef internal::packet_traits<Scalar>::type Packet;

 static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
 static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
 static_assert(maxsize > minsize, "maxsize must be larger than minsize");
 static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");

 // just a helper to store a triple of K,M,N sizes for matrix product
 struct size_triple_t
 {
   // The three dimensions of a matrix product (or of a block size).
   size_t k = 0;
   size_t m = 0;
   size_t n = 0;

   size_triple_t() {}

   size_triple_t(size_t _k, size_t _m, size_t _n)
     : k(_k), m(_m), n(_n)
   {}

   size_triple_t(const size_triple_t& o) = default;

   // Decodes the 12-bit compact form: each dimension is stored as its log2
   // in one nibble, ordered k (bits 8-11), m (bits 4-7), n (bits 0-3).
   size_triple_t(uint16_t compact)
     : k(1 << ((compact >> 8) & 0xf))
     , m(1 << ((compact >> 4) & 0xf))
     , n(1 << (compact & 0xf))
   {}
 };

 // Returns floor(log2(x)), i.e. the exact exponent when x is a power of two.
 // Both 0 and 1 map to 0.
 uint8_t log2_pot(size_t x) {
   uint8_t exponent = 0;
   for (size_t rest = x >> 1; rest != 0; rest >>= 1) {
     ++exponent;
   }
   return exponent;
 }

 // Convert between size triples and a compact form fitting in 12 bits,
 // where each size, which must be a POT, is encoded as its log2 on 4 bits,
 // so the largest representable size is 2^15 == 32k  ... big enough.
 // Layout: k in bits 8-11, m in bits 4-7, n in bits 0-3.
 uint16_t compact_size_triple(size_t k, size_t m, size_t n)
 {
   const unsigned k_log2 = log2_pot(k);
   const unsigned m_log2 = log2_pot(m);
   const unsigned n_log2 = log2_pot(n);
   return static_cast<uint16_t>((k_log2 << 8) | (m_log2 << 4) | n_log2);
 }

 uint16_t compact_size_triple(const size_triple_t& t)
 {
   return compact_size_triple(t.k, t.m, t.n);
 }

 // A single benchmark. Initially only contains benchmark params.
 // Then call run(), which stores the result in the gflops field.
 struct benchmark_t
 {
   // Member order and types are part of the on-disk session format:
   // serialize_benchmarks() writes these objects with fwrite().
   uint16_t compact_product_size = 0;   // packed K,M,N of the product
   uint16_t compact_block_size = 0;     // packed K,M,N of the blocking
   bool use_default_block_size = false; // true: let Eigen pick block sizes
   float gflops = 0;                    // result, filled in by run()

   benchmark_t() {}

   // Benchmark a product of the given size with explicit block sizes.
   benchmark_t(size_t pk, size_t pm, size_t pn,
               size_t bk, size_t bm, size_t bn)
     : compact_product_size(compact_size_triple(pk, pm, pn))
     , compact_block_size(compact_size_triple(bk, bm, bn))
   {}

   // Benchmark a product of the given size with Eigen's default blocking.
   benchmark_t(size_t pk, size_t pm, size_t pn)
     : compact_product_size(compact_size_triple(pk, pm, pn))
     , use_default_block_size(true)
   {}

   // Executes the measurement and stores the outcome in gflops.
   void run();
 };

 // Prints one result line: the packed product size in hex, then either the
 // packed block size in hex or the default block sizes Eigen computes for
 // that product, and finally the measured gflops.
 ostream& operator<<(ostream& s, const benchmark_t& b)
 {
   s << hex << b.compact_product_size << dec;
   if (!b.use_default_block_size) {
     s << " " << hex << b.compact_block_size << dec;
   } else {
     const size_triple_t product(b.compact_product_size);
     Index k = product.k;
     Index m = product.m;
     Index n = product.n;
     // Called with the product dimensions in k, m, n; the printed values are
     // whatever the call leaves in them.
     internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
     s << " default(" << k << ", " << m << ", " << n << ")";
   }
   return s << " " << b.gflops;
 }

 // We sort first by increasing benchmark parameters,
 // then by decreasing performance.
 // Lexicographic order on (product size, block size) ascending, with ties
 // broken by gflops descending so the fastest repetition sorts first.
 bool operator<(const benchmark_t& b1, const benchmark_t& b2)
 {
   if (b1.compact_product_size != b2.compact_product_size) {
     return b1.compact_product_size < b2.compact_product_size;
   }
   if (b1.compact_block_size != b2.compact_block_size) {
     return b1.compact_block_size < b2.compact_block_size;
   }
   return b1.gflops > b2.gflops;
 }

 void benchmark_t::run()
 {
   size_triple_t productsizes(compact_product_size);

   if (use_default_block_size) {
     eigen_use_specific_block_size = false;
   } else {
     // feed eigen with our custom blocking params
     eigen_use_specific_block_size = true;
     size_triple_t blocksizes(compact_block_size);
     eigen_block_size_k = blocksizes.k;
     eigen_block_size_m = blocksizes.m;
     eigen_block_size_n = blocksizes.n;
   }

   // set up the matrix pool

   const size_t combined_three_matrices_sizes =
     sizeof(Scalar) *
       (productsizes.k * productsizes.m +
        productsizes.k * productsizes.n +
        productsizes.m * productsizes.n);

   // 64 M is large enough that nobody has a cache bigger than that,
   // while still being small enough that everybody has this much RAM,
   // so conveniently we don't need to special-case platforms here.
   const size_t unlikely_large_cache_size = 64 << 20;

   const size_t working_set_size =
     min_working_set_size ? min_working_set_size : unlikely_large_cache_size;

   const size_t matrix_pool_size =
     1 + working_set_size / combined_three_matrices_sizes;

   MatrixType *lhs = new MatrixType[matrix_pool_size];
   MatrixType *rhs = new MatrixType[matrix_pool_size];
   MatrixType *dst = new MatrixType[matrix_pool_size];

   for (size_t i = 0; i < matrix_pool_size; i++) {
     lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
     rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
     dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
   }

   // main benchmark loop

   int iters_at_a_time = 1;
   float time_per_iter = 0.0f;
   size_t matrix_index = 0;
   while (true) {

     double starttime = timer.getCpuTime();
     for (int i = 0; i < iters_at_a_time; i++) {
       dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
       matrix_index++;
       if (matrix_index == matrix_pool_size) {
         matrix_index = 0;
       }
     }
     double endtime = timer.getCpuTime();

     const float timing = float(endtime - starttime);

     if (timing >= min_accurate_time) {
       time_per_iter = timing / iters_at_a_time;
       break;
     }

     iters_at_a_time *= 2;
   }

   delete[] lhs;
   delete[] rhs;
   delete[] dst;

   gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
 }

 // Dumps a description of the host CPU to stdout: /proc/cpuinfo on Linux,
 // the output of `sysctl hw` on Apple platforms, nothing elsewhere.
 void print_cpuinfo()
 {
 #if defined(__linux__)
   cout << "contents of /proc/cpuinfo:" << endl;
   ifstream cpuinfo("/proc/cpuinfo");
   if (cpuinfo.is_open()) {
     for (string line; getline(cpuinfo, line); ) {
       cout << line << endl;
     }
     cpuinfo.close();
   }
   cout << endl;
 #elif defined(__APPLE__)
   cout << "output of sysctl hw:" << endl;
   system("sysctl hw");
   cout << endl;
 #endif
 }

 // Human-readable name of a scalar type, used when printing the benchmark
 // parameters. Types without a specialization report "unknown".
 template <typename T>
 string type_name()
 {
   return string("unknown");
 }

 // Specialization for single precision.
 template <>
 string type_name<float>()
 {
   return string("float");
 }

 // Specialization for double precision.
 template <>
 string type_name<double>()
 {
   return string("double");
 }

 // Base class of the actions selectable on the command line.
 // The default implementations abort, so derived classes are expected to
 // override both invokation_name() and run().
 struct action_t
 {
   virtual ~action_t() {}

   // Name used to select this action as the first command line argument.
   virtual const char* invokation_name() const
   {
     abort();
     return nullptr;
   }

   // Performs the action.
   virtual void run() const
   {
     abort();
   }
 };

 void show_usage_and_exit(int /*argc*/, char* argv[],
                          const vector<unique_ptr<action_t>>& available_actions)
 {
   cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
   cerr << "available actions:" << endl << endl;
   for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
     cerr << "  " << (*it)->invokation_name() << endl;
   }
   cerr << endl;
   cerr << "options:" << endl << endl;
   cerr << "  --min-working-set-size=N:" << endl;
   cerr << "       Set the minimum working set size to N bytes." << endl;
   cerr << "       This is rounded up as needed to a multiple of matrix size." << endl;
   cerr << "       A larger working set lowers the chance of a warm cache." << endl;
   cerr << "       The default value 0 means use a large enough working" << endl;
   cerr << "       set to likely outsize caches." << endl;
   cerr << "       A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
   cerr << "       avoid warm caches." << endl;
   exit(1);
 }

 float measure_clock_speed()
 {
   cerr << "Measuring clock speed...                              \r" << flush;

   vector<float> all_gflops;
   for (int i = 0; i < 8; i++) {
     benchmark_t b(1024, 1024, 1024);
     b.run();
     all_gflops.push_back(b.gflops);
   }

   sort(all_gflops.begin(), all_gflops.end());
   float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];

   // multiply by an arbitrary constant to discourage trying doing anything with the
   // returned values besides just comparing them with each other.
   float result = stable_estimate * 123.456f;

   return result;
 }

 // A duration in whole seconds, with a stream operator that prints it in a
 // human-friendly "H h M min S s" form.
 struct human_duration_t
 {
   int seconds;
   human_duration_t(int s) : seconds(s) {}
 };

 // Prints hours and minutes only when they are non-zero. The seconds part is
 // printed only for durations under ten minutes.
 // The thresholds are inclusive so that exactly 3600 s prints as "1 h " and
 // exactly 60 s as "1 min 0 s" (they used to print "60 min " and "60 s").
 std::ostream& operator<<(std::ostream& s, const human_duration_t& d)
 {
   int remainder = d.seconds;
   if (remainder >= 3600) {
     const int hours = remainder / 3600;
     s << hours << " h ";
     remainder -= hours * 3600;
   }
   if (remainder >= 60) {
     const int minutes = remainder / 60;
     s << minutes << " min ";
     remainder -= minutes * 60;
   }
   if (d.seconds < 600) {
     s << remainder << " s";
   }
   return s;
 }

 // File used to persist an interrupted benchmark session: written by
 // serialize_benchmarks() and read back (then deleted) by
 // deserialize_benchmarks() on the next start.
 // NOTE(review): the absolute path looks Android-specific - confirm whether it
 // should be configurable for other platforms.
 const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";

 // Saves the session state to filename as raw binary, in this order:
 // max_clock_speed, the number of benchmarks, first_benchmark_to_run, then
 // the benchmark_t array. Exits with status 1 if the file cannot be opened
 // or written completely.
 void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
 {
   // "wb": the payload is raw binary, so no text-mode translation may apply.
   FILE* file = fopen(filename, "wb");
   if (!file) {
     cerr << "Could not open file " << filename << " for writing." << endl;
     cerr << "Do you have write permissions on the current working directory?" << endl;
     exit(1);
   }
   const size_t benchmarks_vector_size = benchmarks.size();
   bool written =
     1 == fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file) &&
     1 == fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file) &&
     1 == fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file) &&
     benchmarks_vector_size ==
       fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks_vector_size, file);
   // fclose flushes buffered data, so its result is part of the write status.
   if (fclose(file) != 0) {
     written = false;
   }
   if (!written) {
     // Do not leave a truncated session file behind for the next start.
     cerr << "Could not write session data to " << filename << "." << endl;
     std::remove(filename);
     exit(1);
   }
 }

 bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
 {
   FILE* file = fopen(filename, "r");
   if (!file) {
     return false;
   }
   if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
     return false;
   }
   size_t benchmarks_vector_size = 0;
   if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
     return false;
   }
   if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
     return false;
   }
   benchmarks.resize(benchmarks_vector_size);
   if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
     return false;
   }
   unlink(filename);
   return true;
 }

 // Runs benchmarks starting at index first_benchmark_to_run, checking the
 // clock speed every minute and once more at the end.
 // Returns early, with first_benchmark_to_run telling the caller where to
 // resume, when the clock speed increased (restart from 0) or had dropped
 // (redo everything since the last successful check). When all benchmarks
 // are done, first_benchmark_to_run equals benchmarks.size().
 // time_start is only used for the ETA shown in the progress display.
 void try_run_some_benchmarks(
   vector<benchmark_t>& benchmarks,
   double time_start,
   size_t& first_benchmark_to_run)
 {
   if (first_benchmark_to_run == benchmarks.size()) {
     return;
   }

   double time_last_progress_update = 0;
   double time_last_clock_speed_measurement = 0;
   double time_now = 0;

   size_t benchmark_index = first_benchmark_to_run;

   while (true) {
     float ratio_done = float(benchmark_index) / benchmarks.size();
     time_now = timer.getRealTime();

     // We check clock speed every minute and at the end.
     if (benchmark_index == benchmarks.size() ||
         time_now > time_last_clock_speed_measurement + 60.0f)
     {
       time_last_clock_speed_measurement = time_now;

       // Ensure that clock speed is as expected
       float current_clock_speed = measure_clock_speed();

       // The tolerance needs to be smaller than the relative difference between
       // clock speeds that a device could operate under.
       // It seems unlikely that a device would be throttling clock speeds by
       // amounts smaller than 2%.
       // With a value of 1%, I was getting within noise on a Sandy Bridge.
       const float clock_speed_tolerance = 0.02f;

       if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
         // Clock speed is now higher than we previously measured.
         // Either our initial measurement was inaccurate, which won't happen
         // too many times as we are keeping the best clock speed value and
         // and allowing some tolerance; or something really weird happened,
         // which invalidates all benchmark results collected so far.
         // Either way, we better restart all over again now.
         if (benchmark_index) {
           cerr << "Restarting at " << 100.0f * ratio_done
                << " % because clock speed increased.          " << endl;
         }
         max_clock_speed = current_clock_speed;
         first_benchmark_to_run = 0;
         return;
       }

       bool rerun_last_tests = false;

       if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
         cerr << "Measurements completed so far: "
              << 100.0f * ratio_done
              << " %                             " << endl;
         cerr << "Clock speed seems to be only "
              << current_clock_speed/max_clock_speed
              << " times what it used to be." << endl;

         unsigned int seconds_to_sleep_if_lower_clock_speed = 1;

         // Back off with doubling sleeps until the clock speed recovers; give
         // up and save the session once the next sleep would exceed 32 s.
         while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
           if (seconds_to_sleep_if_lower_clock_speed > 32) {
             cerr << "Sleeping longer probably won't make a difference." << endl;
             cerr << "Serializing benchmarks to " << session_filename << endl;
             serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
             cerr << "Now restart this benchmark, and it should pick up where we left." << endl;
             exit(2);
           }
           rerun_last_tests = true;
           cerr << "Sleeping "
                << seconds_to_sleep_if_lower_clock_speed
                << " s...                                   \r" << endl;
           sleep(seconds_to_sleep_if_lower_clock_speed);
           current_clock_speed = measure_clock_speed();
           seconds_to_sleep_if_lower_clock_speed *= 2;
         }
       }

       if (rerun_last_tests) {
         cerr << "Redoing the last "
              << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
              << " % because clock speed had been low.   " << endl;
         return;
       }

       // nothing wrong with the clock speed so far, so there won't be a need to rerun
       // benchmarks run so far in case we later encounter a lower clock speed.
       first_benchmark_to_run = benchmark_index;
     }

     if (benchmark_index == benchmarks.size()) {
       // We're done!
       first_benchmark_to_run = benchmarks.size();
       // Erase progress info
       cerr << "                                                            " << endl;
       return;
     }

     // Display progress info on stderr
     if (time_now > time_last_progress_update + 1.0f) {
       time_last_progress_update = time_now;
       cerr << "Measurements... " << 100.0f * ratio_done << " %";
       // No ETA can be extrapolated before anything is done: dividing by a
       // zero ratio would yield inf, and converting that to int is undefined.
       if (ratio_done > 0.0f) {
         cerr << ", ETA "
              << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done);
       }
       cerr << "                          \r" << flush;
     }

     // This is where we actually run a benchmark!
     benchmarks[benchmark_index].run();
     benchmark_index++;
   }
 }

 // Runs all the given benchmarks, resuming a serialized session when a
 // compatible one is found, and replaces `benchmarks` with the best result
 // for each distinct (product size, block size) pair, sorted by parameters.
 void run_benchmarks(vector<benchmark_t>& benchmarks)
 {
   size_t first_benchmark_to_run = 0;
   vector<benchmark_t> deserialized_benchmarks;
   bool use_deserialized_benchmarks = false;
   if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
     cerr << "Found serialized session with "
          << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
          << " % already done" << endl;
     if (deserialized_benchmarks.size() == benchmarks.size() &&
         first_benchmark_to_run > 0 &&
         first_benchmark_to_run < benchmarks.size())
     {
       use_deserialized_benchmarks = true;
     }
   }

   if (use_deserialized_benchmarks) {
     benchmarks = deserialized_benchmarks;
   } else {
     // not using deserialized benchmarks, starting from scratch
     first_benchmark_to_run = 0;

     // Randomly shuffling benchmarks allows us to get accurate enough progress info,
     // as now the cheap/expensive benchmarks are randomly mixed so they average out.
     // It also means that if data is corrupted for some time span, the odds are that
     // not all repetitions of a given benchmark will be corrupted.
     // std::random_shuffle was removed in C++17; std::shuffle replaces it.
     // The engine is default-seeded on purpose, so the order is the same on
     // every run.
     std::mt19937 shuffle_rng;
     std::shuffle(benchmarks.begin(), benchmarks.end(), shuffle_rng);
   }

   for (int i = 0; i < 4; i++) {
     max_clock_speed = max(max_clock_speed, measure_clock_speed());
   }

   // Start from "now" rather than 0 so the ETA stays finite when a resumed
   // session begins at a non-zero index; it is reset on every restart from 0.
   double time_start = timer.getRealTime();
   while (first_benchmark_to_run < benchmarks.size()) {
     if (first_benchmark_to_run == 0) {
       time_start = timer.getRealTime();
     }
     try_run_some_benchmarks(benchmarks,
                             time_start,
                             first_benchmark_to_run);
   }

   // Sort timings by increasing benchmark parameters, and decreasing gflops.
   // The latter is very important. It means that we can ignore all but the first
   // benchmark with given parameters.
   sort(benchmarks.begin(), benchmarks.end());

   // Collect best (i.e. now first) results for each parameter values.
   vector<benchmark_t> best_benchmarks;
   for (const benchmark_t& candidate : benchmarks) {
     if (best_benchmarks.empty() ||
         best_benchmarks.back().compact_product_size != candidate.compact_product_size ||
         best_benchmarks.back().compact_block_size != candidate.compact_block_size)
     {
       best_benchmarks.push_back(candidate);
     }
   }

   // keep and return only the best benchmarks
   benchmarks = best_benchmarks;
 }

 struct measure_all_pot_sizes_action_t : action_t
 {
   virtual const char* invokation_name() const { return "all-pot-sizes"; }
   virtual void run() const
   {
     vector<benchmark_t> benchmarks;
     for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
       for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
         for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
           for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
             for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
               for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
                 for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
                   benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
                 }
               }
             }
           }
         }
       }
     }

     run_benchmarks(benchmarks);

     cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
     for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
       cout << *it << endl;
     }
   }
 };

 struct measure_default_sizes_action_t : action_t
 {
   virtual const char* invokation_name() const { return "default-sizes"; }
   virtual void run() const
   {
     vector<benchmark_t> benchmarks;
     for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
       for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
         for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
           for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
             benchmarks.emplace_back(ksize, msize, nsize);
           }
         }
       }
     }

     run_benchmarks(benchmarks);

     cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
     for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
       cout << *it << endl;
     }
   }
 };

 // Entry point: selects the action named by argv[1], parses the remaining
 // options, prints the benchmark parameters and runs the action.
 int main(int argc, char* argv[])
 {
   const double time_start = timer.getRealTime();
   cout.precision(4);
   cerr.precision(4);

   vector<unique_ptr<action_t>> available_actions;
   available_actions.emplace_back(new measure_all_pot_sizes_action_t);
   available_actions.emplace_back(new measure_default_sizes_action_t);

   if (argc <= 1) {
     show_usage_and_exit(argc, argv, available_actions);
   }

   // Look up the requested action by name.
   auto action = available_actions.end();
   for (auto candidate = available_actions.begin();
        candidate != available_actions.end();
        ++candidate)
   {
     if (strcmp(argv[1], (*candidate)->invokation_name()) == 0) {
       action = candidate;
       break;
     }
   }
   if (action == available_actions.end()) {
     show_usage_and_exit(argc, argv, available_actions);
   }

   // Every remaining argument must be a --min-working-set-size=N option.
   static const char option_prefix[] = "--min-working-set-size=";
   const size_t option_prefix_length = sizeof(option_prefix) - 1;
   for (int i = 2; i < argc; i++) {
     const char* const argument = argv[i];
     if (strncmp(argument, option_prefix, option_prefix_length) != 0) {
       cerr << "unrecognized option: " << argument << endl << endl;
       show_usage_and_exit(argc, argv, available_actions);
     } else {
       min_working_set_size = strtoul(argument + option_prefix_length, nullptr, 10);
     }
   }

   print_cpuinfo();

   cout << "benchmark parameters:" << endl;
   cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
   cout << "scalar type: " << type_name<Scalar>() << endl;
   cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
   cout << "minsize = " << minsize << endl;
   cout << "maxsize = " << maxsize << endl;
   cout << "measurement_repetitions = " << measurement_repetitions << endl;
   cout << "min_accurate_time = " << min_accurate_time << endl;
   cout << "min_working_set_size = " << min_working_set_size;
   if (min_working_set_size == 0) {
     cout << " (try to outsize caches)";
   }
   cout << endl << endl;

   (*action)->run();

   const double time_end = timer.getRealTime();
   cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
   return 0;
 }
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra.
	//
	// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
	//
	// This Source Code Form is subject to the terms of the Mozilla
	// Public License v. 2.0. If a copy of the MPL was not distributed
	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

	#include <iostream>
	#include <cstdint>
	#include <cstdlib>
	#include <vector>
	#include <fstream>
	#include <memory>
	#include <cstdio>

	// Runtime knobs exposed to Eigen through the EIGEN_TEST_SPECIFIC_BLOCKING_SIZE*
	// macros below. benchmark_t::run() sets them before each measurement: the flag
	// selects between Eigen's default blocking and the explicit k/m/n block sizes.
	bool eigen_use_specific_block_size;
	int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
	// NOTE(review): presumably these macros must be defined before <Eigen/Core> is
	// included so that Eigen's blocking code picks them up - confirm against Eigen.
	#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
	#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
	#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
	#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
	#include <Eigen/Core>

	#include <bench/BenchTimer.h>

	using namespace Eigen;
	using namespace std;

	static BenchTimer timer;

	// how many times we repeat each measurement.
	// measurements are randomly shuffled - we're not doing
	// all N identical measurements in a row.
	const int measurement_repetitions = 3;

	// Timings below this value are too short to be accurate,
	// we'll repeat measurements with more iterations until
	// we get a timing above that threshold.
	const float min_accurate_time = 1e-2f;

	// See --min-working-set-size command line parameter.
	size_t min_working_set_size = 0;

	float max_clock_speed = 0.0f;

	// range of sizes that we will benchmark (in all 3 K,M,N dimensions)
	const size_t maxsize = 2048;
	const size_t minsize = 16;

	typedef MatrixXf MatrixType;
	typedef MatrixType::Scalar Scalar;
	typedef internal::packet_traits<Scalar>::type Packet;

	static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
	static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
	static_assert(maxsize > minsize, "maxsize must be larger than minsize");
	static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");

	// just a helper to store a triple of K,M,N sizes for matrix product
	struct size_triple_t
	{
	  // The three dimensions of a matrix product (or of a block size).
	  size_t k = 0;
	  size_t m = 0;
	  size_t n = 0;

	  size_triple_t() {}

	  size_triple_t(size_t _k, size_t _m, size_t _n)
	    : k(_k), m(_m), n(_n)
	  {}

	  size_triple_t(const size_triple_t& o) = default;

	  // Decodes the 12-bit compact form: each dimension is stored as its log2
	  // in one nibble, ordered k (bits 8-11), m (bits 4-7), n (bits 0-3).
	  size_triple_t(uint16_t compact)
	    : k(1 << ((compact >> 8) & 0xf))
	    , m(1 << ((compact >> 4) & 0xf))
	    , n(1 << (compact & 0xf))
	  {}
	};

	// Returns floor(log2(x)), i.e. the exact exponent when x is a power of two.
	// Both 0 and 1 map to 0.
	uint8_t log2_pot(size_t x) {
	  uint8_t exponent = 0;
	  for (size_t rest = x >> 1; rest != 0; rest >>= 1) {
	    ++exponent;
	  }
	  return exponent;
	}

	// Convert between size triples and a compact form fitting in 12 bits,
	// where each size, which must be a POT, is encoded as its log2 on 4 bits,
	// so the largest representable size is 2^15 == 32k ... big enough.
	// Layout: k in bits 8-11, m in bits 4-7, n in bits 0-3.
	uint16_t compact_size_triple(size_t k, size_t m, size_t n)
	{
	  // The bitwise-or operators had been mangled into "\|" escapes, which is
	  // not valid C++; they are restored here.
	  return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
	}

	uint16_t compact_size_triple(const size_triple_t& t)
	{
	return compact_size_triple(t.k, t.m, t.n);
	}

	// A single benchmark. Initially only contains benchmark params.
	// Then call run(), which stores the result in the gflops field.
	struct benchmark_t
	{
	  // Member order and types are part of the on-disk session format:
	  // serialize_benchmarks() writes these objects with fwrite().
	  uint16_t compact_product_size = 0;   // packed K,M,N of the product
	  uint16_t compact_block_size = 0;     // packed K,M,N of the blocking
	  bool use_default_block_size = false; // true: let Eigen pick block sizes
	  float gflops = 0;                    // result, filled in by run()

	  benchmark_t() {}

	  // Benchmark a product of the given size with explicit block sizes.
	  benchmark_t(size_t pk, size_t pm, size_t pn,
	              size_t bk, size_t bm, size_t bn)
	    : compact_product_size(compact_size_triple(pk, pm, pn))
	    , compact_block_size(compact_size_triple(bk, bm, bn))
	  {}

	  // Benchmark a product of the given size with Eigen's default blocking.
	  benchmark_t(size_t pk, size_t pm, size_t pn)
	    : compact_product_size(compact_size_triple(pk, pm, pn))
	    , use_default_block_size(true)
	  {}

	  // Executes the measurement and stores the outcome in gflops.
	  void run();
	};

	// Prints one result line: the packed product size in hex, then either the
	// packed block size in hex or the default block sizes Eigen computes for
	// that product, and finally the measured gflops.
	ostream& operator<<(ostream& s, const benchmark_t& b)
	{
	  s << hex << b.compact_product_size << dec;
	  if (!b.use_default_block_size) {
	    s << " " << hex << b.compact_block_size << dec;
	  } else {
	    const size_triple_t product(b.compact_product_size);
	    Index k = product.k;
	    Index m = product.m;
	    Index n = product.n;
	    // Called with the product dimensions in k, m, n; the printed values are
	    // whatever the call leaves in them.
	    internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
	    s << " default(" << k << ", " << m << ", " << n << ")";
	  }
	  return s << " " << b.gflops;
	}

	// We sort first by increasing benchmark parameters (product size, then
	// block size), then by decreasing performance.
	// The logical-or operators had been mangled into "\|\|" escapes, which is
	// not valid C++; the comparison is written as an explicit cascade instead.
	bool operator<(const benchmark_t& b1, const benchmark_t& b2)
	{
	  if (b1.compact_product_size != b2.compact_product_size) {
	    return b1.compact_product_size < b2.compact_product_size;
	  }
	  if (b1.compact_block_size != b2.compact_block_size) {
	    return b1.compact_block_size < b2.compact_block_size;
	  }
	  return b1.gflops > b2.gflops;
	}

	void benchmark_t::run()
	{
	size_triple_t productsizes(compact_product_size);

	if (use_default_block_size) {
	eigen_use_specific_block_size = false;
	} else {
	// feed eigen with our custom blocking params
	eigen_use_specific_block_size = true;
	size_triple_t blocksizes(compact_block_size);
	eigen_block_size_k = blocksizes.k;
	eigen_block_size_m = blocksizes.m;
	eigen_block_size_n = blocksizes.n;
	}

	// set up the matrix pool

	const size_t combined_three_matrices_sizes =
	sizeof(Scalar) *
	(productsizes.k * productsizes.m +
	productsizes.k * productsizes.n +
	productsizes.m * productsizes.n);

	// 64 M is large enough that nobody has a cache bigger than that,
	// while still being small enough that everybody has this much RAM,
	// so conveniently we don't need to special-case platforms here.
	const size_t unlikely_large_cache_size = 64 << 20;

	const size_t working_set_size =
	min_working_set_size ? min_working_set_size : unlikely_large_cache_size;

	const size_t matrix_pool_size =
	1 + working_set_size / combined_three_matrices_sizes;

	MatrixType *lhs = new MatrixType[matrix_pool_size];
	MatrixType *rhs = new MatrixType[matrix_pool_size];
	MatrixType *dst = new MatrixType[matrix_pool_size];

	for (size_t i = 0; i < matrix_pool_size; i++) {
	lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
	rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
	dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
	}

	// main benchmark loop

	int iters_at_a_time = 1;
	float time_per_iter = 0.0f;
	size_t matrix_index = 0;
	while (true) {

	double starttime = timer.getCpuTime();
	for (int i = 0; i < iters_at_a_time; i++) {
	dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
	matrix_index++;
	if (matrix_index == matrix_pool_size) {
	matrix_index = 0;
	}
	}
	double endtime = timer.getCpuTime();

	const float timing = float(endtime - starttime);

	if (timing >= min_accurate_time) {
	time_per_iter = timing / iters_at_a_time;
	break;
	}

	iters_at_a_time *= 2;
	}

	delete[] lhs;
	delete[] rhs;
	delete[] dst;

	gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
	}

	// Dumps a description of the host CPU to stdout: /proc/cpuinfo on Linux,
	// the output of `sysctl hw` on Apple platforms, nothing elsewhere.
	void print_cpuinfo()
	{
	#if defined(__linux__)
	  cout << "contents of /proc/cpuinfo:" << endl;
	  ifstream cpuinfo("/proc/cpuinfo");
	  if (cpuinfo.is_open()) {
	    for (string line; getline(cpuinfo, line); ) {
	      cout << line << endl;
	    }
	    cpuinfo.close();
	  }
	  cout << endl;
	#elif defined(__APPLE__)
	  cout << "output of sysctl hw:" << endl;
	  system("sysctl hw");
	  cout << endl;
	#endif
	}

	// Human-readable name of a scalar type, used when printing the benchmark
	// parameters. Types without a specialization report "unknown".
	template <typename T>
	string type_name()
	{
	  return string("unknown");
	}

	// Specialization for single precision.
	template <>
	string type_name<float>()
	{
	  return string("float");
	}

	// Specialization for double precision.
	template <>
	string type_name<double>()
	{
	  return string("double");
	}

	// Base class of the actions selectable on the command line.
	// The default implementations abort, so derived classes are expected to
	// override both invokation_name() and run().
	struct action_t
	{
	  virtual ~action_t() {}

	  // Name used to select this action as the first command line argument.
	  virtual const char* invokation_name() const
	  {
	    abort();
	    return nullptr;
	  }

	  // Performs the action.
	  virtual void run() const
	  {
	    abort();
	  }
	};

	void show_usage_and_exit(int /argc/, char* argv[],
	const vector<unique_ptr<action_t>>& available_actions)
	{
	cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
	cerr << "available actions:" << endl << endl;
	for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
	cerr << " " << (*it)->invokation_name() << endl;
	}
	cerr << endl;
	cerr << "options:" << endl << endl;
	cerr << " --min-working-set-size=N:" << endl;
	cerr << " Set the minimum working set size to N bytes." << endl;
	cerr << " This is rounded up as needed to a multiple of matrix size." << endl;
	cerr << " A larger working set lowers the chance of a warm cache." << endl;
	cerr << " The default value 0 means use a large enough working" << endl;
	cerr << " set to likely outsize caches." << endl;
	cerr << " A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
	cerr << " avoid warm caches." << endl;
	exit(1);
	}

	float measure_clock_speed()
	{
	cerr << "Measuring clock speed... \r" << flush;

	vector<float> all_gflops;
	for (int i = 0; i < 8; i++) {
	benchmark_t b(1024, 1024, 1024);
	b.run();
	all_gflops.push_back(b.gflops);
	}

	sort(all_gflops.begin(), all_gflops.end());
	float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];

	// multiply by an arbitrary constant to discourage trying doing anything with the
	// returned values besides just comparing them with each other.
	float result = stable_estimate * 123.456f;

	return result;
	}

	// Wraps a duration expressed in whole seconds so that streaming it produces
	// a human-readable "H h M min S s" text.
	struct human_duration_t
	{
	  int seconds;
	  human_duration_t(int s) : seconds(s) {}
	};

	// Writes d as hours, minutes and seconds, skipping the hour and minute parts
	// when they are zero. The seconds part is only written for durations shorter
	// than 10 minutes (600 s); longer durations are reported at minute precision.
	std::ostream& operator<<(std::ostream& s, const human_duration_t& d)
	{
	  int remainder = d.seconds;
	  // Use >= rather than >: exactly one hour must print as "1 h ", not "60 min ".
	  if (remainder >= 3600) {
	    const int hours = remainder / 3600;
	    s << hours << " h ";
	    remainder -= hours * 3600;
	  }
	  // Same boundary rule: exactly one minute prints as "1 min 0 s", not "60 s".
	  if (remainder >= 60) {
	    const int minutes = remainder / 60;
	    s << minutes << " min ";
	    remainder -= minutes * 60;
	  }
	  if (d.seconds < 600) {
	    s << remainder << " s";
	  }
	  return s;
	}

	// Path of the session file: try_run_some_benchmarks() serializes the
	// benchmarks there before giving up on a low clock speed, and
	// run_benchmarks() tries to deserialize it to resume that session.
	const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";

	// Saves the current session to `filename` as raw binary data, in this order:
	// max_clock_speed, the number of benchmarks, first_benchmark_to_run, then the
	// benchmark_t array itself. deserialize_benchmarks() reads the same layout.
	//
	// Exits the process with status 1 if the file cannot be opened or if the data
	// cannot be written completely; in the latter case the partial file is removed
	// so that a later run does not try to resume from it.
	void serialize_benchmarks(const char* filename, const std::vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
	{
	  // Binary mode: the payload is raw memory, not text.
	  FILE* file = std::fopen(filename, "wb");
	  if (!file) {
	    std::cerr << "Could not open file " << filename << " for writing." << std::endl;
	    std::cerr << "Do you have write permissions on the current working directory?" << std::endl;
	    std::exit(1);
	  }

	  const size_t benchmarks_vector_size = benchmarks.size();
	  bool ok = true;
	  ok = ok && 1 == std::fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
	  ok = ok && 1 == std::fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
	  ok = ok && 1 == std::fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
	  ok = ok && benchmarks_vector_size ==
	                 std::fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks_vector_size, file);

	  // Always close the file. fclose() flushes buffered data, so its result
	  // is part of the success condition too.
	  ok = (0 == std::fclose(file)) && ok;

	  if (!ok) {
	    std::cerr << "Could not write session data to " << filename << "." << std::endl;
	    std::remove(filename);
	    std::exit(1);
	  }
	}

	bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
	{
	FILE* file = fopen(filename, "r");
	if (!file) {
	return false;
	}
	if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
	return false;
	}
	size_t benchmarks_vector_size = 0;
	if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
	return false;
	}
	if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
	return false;
	}
	benchmarks.resize(benchmarks_vector_size);
	if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
	return false;
	}
	unlink(filename);
	return true;
	}

	// Runs benchmarks starting at index first_benchmark_to_run, checking the
	// clock speed (see measure_clock_speed) about every minute and once more after
	// the last benchmark.
	//
	// On return, first_benchmark_to_run tells the caller where to resume:
	//  - benchmarks.size() when all benchmarks have been run;
	//  - 0 when a clock speed higher than max_clock_speed was measured;
	//    max_clock_speed is then raised and everything has to be redone;
	//  - the index validated by the last successful clock speed check when the
	//    clock speed had dropped and recovered after sleeping.
	// If the clock speed is still low after sleeping 1, 2, 4, 8, 16 and 32
	// seconds, the session is serialized to session_filename and the process
	// exits with status 2.
	//
	// time_start is only used to estimate the remaining time (ETA) shown in the
	// progress line on stderr.
	void try_run_some_benchmarks(
	  std::vector<benchmark_t>& benchmarks,
	  double time_start,
	  size_t& first_benchmark_to_run)
	{
	  if (first_benchmark_to_run == benchmarks.size()) {
	    return;
	  }

	  double time_last_progress_update = 0;
	  double time_last_clock_speed_measurement = 0;
	  double time_now = 0;

	  size_t benchmark_index = first_benchmark_to_run;

	  while (true) {
	    float ratio_done = float(benchmark_index) / benchmarks.size();
	    time_now = timer.getRealTime();

	    // We check clock speed every minute and at the end.
	    if (benchmark_index == benchmarks.size() ||
	        time_now > time_last_clock_speed_measurement + 60.0f)
	    {
	      time_last_clock_speed_measurement = time_now;

	      // Ensure that clock speed is as expected
	      float current_clock_speed = measure_clock_speed();

	      // The tolerance needs to be smaller than the relative difference between
	      // clock speeds that a device could operate under.
	      // It seems unlikely that a device would be throttling clock speeds by
	      // amounts smaller than 2%.
	      // With a value of 1%, I was getting within noise on a Sandy Bridge.
	      const float clock_speed_tolerance = 0.02f;

	      if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
	        // Clock speed is now higher than we previously measured.
	        // Either our initial measurement was inaccurate, which won't happen
	        // too many times as we are keeping the best clock speed value
	        // and allowing some tolerance; or something really weird happened,
	        // which invalidates all benchmark results collected so far.
	        // Either way, we better restart all over again now.
	        if (benchmark_index) {
	          std::cerr << "Restarting at " << 100.0f * ratio_done
	                    << " % because clock speed increased. " << std::endl;
	        }
	        max_clock_speed = current_clock_speed;
	        first_benchmark_to_run = 0;
	        return;
	      }

	      bool rerun_last_tests = false;

	      if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
	        std::cerr << "Measurements completed so far: "
	                  << 100.0f * ratio_done
	                  << " % " << std::endl;
	        std::cerr << "Clock speed seems to be only "
	                  << current_clock_speed/max_clock_speed
	                  << " times what it used to be." << std::endl;

	        unsigned int seconds_to_sleep_if_lower_clock_speed = 1;

	        // Sleep for increasingly long periods, hoping that the clock speed
	        // recovers; give up once the next sleep would exceed 32 seconds.
	        while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
	          if (seconds_to_sleep_if_lower_clock_speed > 32) {
	            std::cerr << "Sleeping longer probably won't make a difference." << std::endl;
	            std::cerr << "Serializing benchmarks to " << session_filename << std::endl;
	            serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
	            std::cerr << "Now restart this benchmark, and it should pick up where we left." << std::endl;
	            std::exit(2);
	          }
	          rerun_last_tests = true;
	          std::cerr << "Sleeping "
	                    << seconds_to_sleep_if_lower_clock_speed
	                    << " s... \r" << std::endl;
	          sleep(seconds_to_sleep_if_lower_clock_speed);
	          current_clock_speed = measure_clock_speed();
	          seconds_to_sleep_if_lower_clock_speed *= 2;
	        }
	      }

	      if (rerun_last_tests) {
	        // first_benchmark_to_run is deliberately left untouched: everything
	        // run since the last successful check will be run again.
	        std::cerr << "Redoing the last "
	                  << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
	                  << " % because clock speed had been low. " << std::endl;
	        return;
	      }

	      // nothing wrong with the clock speed so far, so there won't be a need to rerun
	      // benchmarks run so far in case we later encounter a lower clock speed.
	      first_benchmark_to_run = benchmark_index;
	    }

	    if (benchmark_index == benchmarks.size()) {
	      // We're done!
	      first_benchmark_to_run = benchmarks.size();
	      // Erase progress info
	      std::cerr << " " << std::endl;
	      return;
	    }

	    // Display progress info on stderr
	    if (time_now > time_last_progress_update + 1.0f) {
	      time_last_progress_update = time_now;
	      std::cerr << "Measurements... " << 100.0f * ratio_done << " %";
	      // No ETA can be extrapolated before anything has been done: dividing by
	      // a zero ratio would yield inf/NaN, whose conversion to int is undefined.
	      if (ratio_done > 0.0f) {
	        const float seconds_left =
	          float(time_now - time_start) * (1.0f - ratio_done) / ratio_done;
	        std::cerr << ", ETA " << human_duration_t(static_cast<int>(seconds_left));
	      }
	      std::cerr << " \r" << std::flush;
	    }

	    // This is where we actually run a benchmark!
	    benchmarks[benchmark_index].run();
	    benchmark_index++;
	  }
	}

	void run_benchmarks(vector<benchmark_t>& benchmarks)
	{
	size_t first_benchmark_to_run;
	vector<benchmark_t> deserialized_benchmarks;
	bool use_deserialized_benchmarks = false;
	if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
	cerr << "Found serialized session with "
	<< 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
	<< " % already done" << endl;
	if (deserialized_benchmarks.size() == benchmarks.size() &&
	first_benchmark_to_run > 0 &&
	first_benchmark_to_run < benchmarks.size())
	{
	use_deserialized_benchmarks = true;
	}
	}

	if (use_deserialized_benchmarks) {
	benchmarks = deserialized_benchmarks;
	} else {
	// not using deserialized benchmarks, starting from scratch
	first_benchmark_to_run = 0;

	// Randomly shuffling benchmarks allows us to get accurate enough progress info,
	// as now the cheap/expensive benchmarks are randomly mixed so they average out.
	// It also means that if data is corrupted for some time span, the odds are that
	// not all repetitions of a given benchmark will be corrupted.
	random_shuffle(benchmarks.begin(), benchmarks.end());
	}

	for (int i = 0; i < 4; i++) {
	max_clock_speed = max(max_clock_speed, measure_clock_speed());
	}

	double time_start = 0.0;
	while (first_benchmark_to_run < benchmarks.size()) {
	if (first_benchmark_to_run == 0) {
	time_start = timer.getRealTime();
	}
	try_run_some_benchmarks(benchmarks,
	time_start,
	first_benchmark_to_run);
	}

	// Sort timings by increasing benchmark parameters, and decreasing gflops.
	// The latter is very important. It means that we can ignore all but the first
	// benchmark with given parameters.
	sort(benchmarks.begin(), benchmarks.end());

	// Collect best (i.e. now first) results for each parameter values.
	vector<benchmark_t> best_benchmarks;
	for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
	if (best_benchmarks.empty() \|\|
	best_benchmarks.back().compact_product_size != it->compact_product_size \|\|
	best_benchmarks.back().compact_block_size != it->compact_block_size)
	{
	best_benchmarks.push_back(*it);
	}
	}

	// keep and return only the best benchmarks
	benchmarks = best_benchmarks;
	}

	struct measure_all_pot_sizes_action_t : action_t
	{
	virtual const char* invokation_name() const { return "all-pot-sizes"; }
	virtual void run() const
	{
	vector<benchmark_t> benchmarks;
	for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
	for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
	for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
	for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
	for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
	for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
	for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
	benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
	}
	}
	}
	}
	}
	}
	}

	run_benchmarks(benchmarks);

	cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
	for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
	cout << *it << endl;
	}
	}
	};

	struct measure_default_sizes_action_t : action_t
	{
	virtual const char* invokation_name() const { return "default-sizes"; }
	virtual void run() const
	{
	vector<benchmark_t> benchmarks;
	for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
	for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
	for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
	for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
	benchmarks.emplace_back(ksize, msize, nsize);
	}
	}
	}
	}

	run_benchmarks(benchmarks);

	cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
	for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
	cout << *it << endl;
	}
	}
	};

	int main(int argc, char* argv[])
	{
	double time_start = timer.getRealTime();
	cout.precision(4);
	cerr.precision(4);

	vector<unique_ptr<action_t>> available_actions;
	available_actions.emplace_back(new measure_all_pot_sizes_action_t);
	available_actions.emplace_back(new measure_default_sizes_action_t);

	auto action = available_actions.end();

	if (argc <= 1) {
	show_usage_and_exit(argc, argv, available_actions);
	}
	for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
	if (!strcmp(argv[1], (*it)->invokation_name())) {
	action = it;
	break;
	}
	}

	if (action == available_actions.end()) {
	show_usage_and_exit(argc, argv, available_actions);
	}

	for (int i = 2; i < argc; i++) {
	if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
	const char* equals_sign = strchr(argv[i], '=');
	min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
	} else {
	cerr << "unrecognized option: " << argv[i] << endl << endl;
	show_usage_and_exit(argc, argv, available_actions);
	}
	}

	print_cpuinfo();

	cout << "benchmark parameters:" << endl;
	cout << "pointer size: " << 8sizeof(void) << " bits" << endl;
	cout << "scalar type: " << type_name<Scalar>() << endl;
	cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
	cout << "minsize = " << minsize << endl;
	cout << "maxsize = " << maxsize << endl;
	cout << "measurement_repetitions = " << measurement_repetitions << endl;
	cout << "min_accurate_time = " << min_accurate_time << endl;
	cout << "min_working_set_size = " << min_working_set_size;
	if (min_working_set_size == 0) {
	cout << " (try to outsize caches)";
	}
	cout << endl << endl;

	(*action)->run();

	double time_end = timer.getRealTime();
	cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
	}