// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_RUNQUEUE_H_
#define EIGEN_CXX11_TENSOR_TENSOR_RUNQUEUE_H_
#include <atomic>
#include <cstdint>
#include <mutex>
#include <vector>
#include "Eigen/src/Core/util/Macros.h"
namespace Eigen {
// RunQueue is a fixed-size, partially non-blocking deque of Work items.
// Operations on front of the queue must be done by a single thread (owner),
// operations on back of the queue can be done by multiple threads concurrently.
//
// Algorithm outline:
// All remote threads operating on the queue back are serialized by a mutex.
// This ensures that at most two threads access state: owner and one remote
// thread (Size aside). The algorithm ensures that the occupied region of the
// underlying array is logically contiguous (can wrap around, but no stray
// occupied elements). Owner operates on one end of this region, remote thread
// operates on the other end. Synchronization between these threads
// (potential consumption of the last element and take up of the last empty
// element) happens by means of state variable in each element. States are:
// empty, busy (in the process of insertion or removal) and ready. Threads claim
// elements (empty->busy and ready->busy transitions) by means of a CAS
// operation. The finishing transitions (busy->empty and busy->ready) are done
// with plain store as the element is exclusively owned by the current thread.
//
// Note: if we permitted only pointers as elements, we would not need a
// separate state variable, as a null/non-null pointer value would serve as
// the state. But that would require a malloc/free per operation for large,
// complex values (and this queue is designed to store std::function<void()>).
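//
// Example usage (an illustrative sketch only, not part of this header; the
// queue size 1024 and the lambda work items are arbitrary choices):
//
//   RunQueue<std::function<void()>, 1024> q;
//   // Owner thread only:
//   std::function<void()> overflow = q.PushFront([] { /* work */ });
//   if (overflow) overflow();  // queue was full, run the item inline
//   std::function<void()> w = q.PopFront();
//   if (w) w();
//   // Any other thread (e.g. a work stealer):
//   std::function<void()> stolen = q.PopBack();
//   if (stolen) stolen();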
template <typename Work, unsigned kSize>
class RunQueue {
public:
RunQueue() : front_(0), back_(0) {
// require power-of-two for fast masking
eigen_assert((kSize & (kSize - 1)) == 0);
eigen_assert(kSize > 2); // why would you do this?
eigen_assert(kSize <= (64 << 10)); // leave enough space for counter
for (unsigned i = 0; i < kSize; i++)
array_[i].state.store(kEmpty, std::memory_order_relaxed);
}
~RunQueue() { eigen_assert(Size() == 0); }
// PushFront inserts w at the beginning of the queue.
// If queue is full returns w, otherwise returns default-constructed Work.
Work PushFront(Work w) {
unsigned front = front_.load(std::memory_order_relaxed);
Elem* e = &array_[front & kMask];
uint8_t s = e->state.load(std::memory_order_relaxed);
if (s != kEmpty ||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
return w;
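// Advance the front position and bump the modification counter held in the
// upper bits of front_ (see the comment on front_/back_ below).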
front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
e->w = std::move(w);
e->state.store(kReady, std::memory_order_release);
return Work();
}
// PopFront removes and returns the first element in the queue.
// If the queue was empty returns default-constructed Work.
Work PopFront() {
unsigned front = front_.load(std::memory_order_relaxed);
Elem* e = &array_[(front - 1) & kMask];
uint8_t s = e->state.load(std::memory_order_relaxed);
if (s != kReady ||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
return Work();
Work w = std::move(e->w);
e->state.store(kEmpty, std::memory_order_release);
front = ((front - 1) & kMask2) | (front & ~kMask2);
front_.store(front, std::memory_order_relaxed);
return w;
}
// PushBack adds w at the end of the queue.
// If queue is full returns w, otherwise returns default-constructed Work.
Work PushBack(Work w) {
std::unique_lock<std::mutex> lock(mutex_);
unsigned back = back_.load(std::memory_order_relaxed);
Elem* e = &array_[(back - 1) & kMask];
uint8_t s = e->state.load(std::memory_order_relaxed);
if (s != kEmpty ||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
return w;
back = ((back - 1) & kMask2) | (back & ~kMask2);
back_.store(back, std::memory_order_relaxed);
e->w = std::move(w);
e->state.store(kReady, std::memory_order_release);
return Work();
}
// PopBack removes and returns the last element in the queue.
// Can fail spuriously.
Work PopBack() {
if (Empty()) return Work();
std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
if (!lock) return Work();
unsigned back = back_.load(std::memory_order_relaxed);
Elem* e = &array_[back & kMask];
uint8_t s = e->state.load(std::memory_order_relaxed);
if (s != kReady ||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
return Work();
Work w = std::move(e->w);
e->state.store(kEmpty, std::memory_order_release);
back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
return w;
}
// PopBackHalf removes roughly half of the elements from the back of the
// queue and appends them to *result. Returns the number of elements removed.
// Can fail spuriously.
unsigned PopBackHalf(std::vector<Work>* result) {
if (Empty()) return 0;
std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
if (!lock) return 0;
unsigned back = back_.load(std::memory_order_relaxed);
unsigned size = Size();
unsigned mid = back;
if (size > 1) mid = back + (size - 1) / 2;
unsigned n = 0;
unsigned start = 0;
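// Scan from mid down to back; static_cast<int>(mid - back) yields a signed
// distance so the loop bound survives wraparound of the unsigned
// position/counter values.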
for (; static_cast<int>(mid - back) >= 0; mid--) {
Elem* e = &array_[mid & kMask];
uint8_t s = e->state.load(std::memory_order_relaxed);
if (n == 0) {
if (s != kReady ||
!e->state.compare_exchange_strong(s, kBusy,
std::memory_order_acquire))
continue;
start = mid;
} else {
// Note: no need to store a temporary kBusy state; we exclusively own these
// elements.
eigen_assert(s == kReady);
}
result->push_back(std::move(e->w));
e->state.store(kEmpty, std::memory_order_release);
n++;
}
if (n != 0)
back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
return n;
}
// Size returns current queue size.
// Can be called by any thread at any time.
unsigned Size() const {
// Emptiness plays a critical role in thread pool blocking, so we go to great
// lengths to not produce false positives (claim a non-empty queue as empty).
for (;;) {
// Capture a consistent snapshot of front/back.
unsigned front = front_.load(std::memory_order_acquire);
unsigned back = back_.load(std::memory_order_acquire);
unsigned front1 = front_.load(std::memory_order_relaxed);
if (front != front1) continue;
int size = (front & kMask2) - (back & kMask2);
// Fix overflow.
if (size < 0) size += 2 * kSize;
// Order of modification in push/pop is crafted to make the queue look
// larger than it is during concurrent modifications. E.g. pop can
// decrement size before the corresponding push has incremented it.
// So the computed size can be up to kSize + 1, fix it.
if (size > static_cast<int>(kSize)) size = kSize;
return static_cast<unsigned>(size);
}
}
// Empty tests whether container is empty.
// Can be called by any thread at any time.
bool Empty() const { return Size() == 0; }
private:
static const unsigned kMask = kSize - 1;
static const unsigned kMask2 = (kSize << 1) - 1;
struct Elem {
std::atomic<uint8_t> state;
Work w;
};
enum {
kEmpty,
kBusy,
kReady,
};
std::mutex mutex_;
// Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
// front/back, respectively. The remaining bits contain modification counters
// that are incremented on Push operations. This allows us to (1) distinguish
// between empty and full conditions (if we would use log(kSize) bits for
// position, these conditions would be indistinguishable); (2) obtain
// consistent snapshot of front_/back_ for Size operation using the
// modification counters.
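//
// Worked example (illustrative, assuming kSize = 4, so kMask = 3 and
// kMask2 = 7): the low 3 bits of front_/back_ hold the position modulo
// 2*kSize and the upper bits hold the counter. PushFront stores
// front + 1 + (kSize << 1), i.e. front + 9, which advances the position by
// one (mod 8) and bumps the counter. PopFront stores
// ((front - 1) & kMask2) | (front & ~kMask2), which steps the position back
// by one while leaving the counter bits untouched.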
std::atomic<unsigned> front_;
std::atomic<unsigned> back_;
Elem array_[kSize];
RunQueue(const RunQueue&) = delete;
void operator=(const RunQueue&) = delete;
};
} // namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_RUNQUEUE_H_