| /* |
| * Copyright (c) 2005 Topspin Communications. All rights reserved. |
| * |
| * This software is available to you under a choice of one of two |
| * licenses. You may choose to be licensed under the terms of the GNU |
| * General Public License (GPL) Version 2, available from the file |
| * COPYING in the main directory of this source tree, or the |
| * OpenIB.org BSD license below: |
| * |
| * Redistribution and use in source and binary forms, with or |
| * without modification, are permitted provided that the following |
| * conditions are met: |
| * |
| * - Redistributions of source code must retain the above |
| * copyright notice, this list of conditions and the following |
| * disclaimer. |
| * |
| * - Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following |
| * disclaimer in the documentation and/or other materials |
| * provided with the distribution. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
| * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
| * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
| * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #ifndef __UTIL_UDMA_BARRIER_H |
| #define __UTIL_UDMA_BARRIER_H |
| |
| #include <pthread.h> |
| |
| /* Barriers for DMA. |
| |
These barriers are explicitly only for use with user DMA operations. If you
are looking for barriers to use with cache-coherent multi-threaded
consistency then look in stdatomic.h. If you need both kinds of synchronization
| for the same address then use an atomic operation followed by one |
| of these barriers. |
| |
| When reasoning about these barriers there are two objects: |
| - CPU attached address space (the CPU memory could be a range of things: |
| cached/uncached/non-temporal CPU DRAM, uncached MMIO space in another |
device, pMEM). Generally speaking, the ordering is only relative
to the local CPU's view of the system. Eg if the local CPU
is not guaranteed to see a write from another CPU then it is also
OK for the DMA device not to see that write after the barrier.
| - A DMA initiator on a bus. For instance a PCI-E device issuing |
| MemRd/MemWr TLPs. |
| |
The ordering guarantee is always stated between those two streams, eg what
happens when a MemRd TLP arrives via PCI-E relative to a CPU write to the
same memory location.
| |
The providers have a very regular and predictable use of these barriers; to
make things clear each narrow use is given a name, and the proper name
should be used in the provider as a form of documentation.
| */ |
| |
| /* Ensure that the device's view of memory matches the CPU's view of memory. |
| This should be placed before any MMIO store that could trigger the device |
| to begin doing DMA, such as a device doorbell ring. |
| |
| eg |
| *dma_buf = 1; |
| udma_to_device_barrier(); |
| mmio_write(DO_DMA_REG, dma_buf); |
| Must ensure that the device sees the '1'. |
| |
| This is required to fence writes created by the libibverbs user. Those |
writes could be to any CPU-mapped memory object with any cacheability mode.
| |
| NOTE: x86 has historically used a weaker semantic for this barrier, and |
| only fenced normal stores to normal memory. libibverbs users using other |
| memory types or non-temporal stores are required to use SFENCE in their own |
| code prior to calling verbs to start a DMA. |
| */ |
| #if defined(__i386__) |
| #define udma_to_device_barrier() asm volatile("" ::: "memory") |
| #elif defined(__x86_64__) |
| #define udma_to_device_barrier() asm volatile("" ::: "memory") |
| #elif defined(__PPC64__) |
| #define udma_to_device_barrier() asm volatile("sync" ::: "memory") |
| #elif defined(__PPC__) |
| #define udma_to_device_barrier() asm volatile("sync" ::: "memory") |
| #elif defined(__ia64__) |
| #define udma_to_device_barrier() asm volatile("mf" ::: "memory") |
| #elif defined(__sparc_v9__) |
| #define udma_to_device_barrier() asm volatile("membar #StoreStore" ::: "memory") |
| #elif defined(__aarch64__) |
| #define udma_to_device_barrier() asm volatile("dmb oshst" ::: "memory") |
| #elif defined(__sparc__) || defined(__s390x__) |
| #define udma_to_device_barrier() asm volatile("" ::: "memory") |
| #elif defined(__loongarch__) |
| #define udma_to_device_barrier() asm volatile("dbar 0" ::: "memory") |
| #elif defined(__riscv) |
| #define udma_to_device_barrier() asm volatile("fence ow,ow" ::: "memory") |
| #elif defined(__mips__) |
| #define udma_to_device_barrier() asm volatile("sync" ::: "memory") |
| #else |
| #error No architecture specific memory barrier defines found! |
| #endif |
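
/* A minimal usage sketch for udma_to_device_barrier(). The structure layout,
   field names and the mmio_write() helper are illustrative only (mmio_write()
   is the same pseudo-helper used in the comment above); only the barrier
   placement reflects the documented requirement:

     struct wqe { uint32_t opcode; uint32_t index; };

     static void post_and_ring(struct wqe *wqe, void *doorbell_reg)
     {
            wqe->opcode = 1;                      // normal store to DMA-able memory
            udma_to_device_barrier();             // WQE must be visible before the doorbell
            mmio_write(doorbell_reg, wqe->index); // MMIO store that triggers DMA
     }
*/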
| |
| /* Ensure that all ordered stores from the device are observable from the |
| CPU. This only makes sense after something that observes an ordered store |
from the device - eg by reading an MMIO register or seeing that CPU memory is
| updated. |
| |
| This guarantees that all reads that follow the barrier see the ordered |
| stores that preceded the observation. |
| |
For instance, this would be used after testing a valid bit in memory that
is a DMA target, to ensure that the following reads see the
| data written before the MemWr TLP that set the valid bit. |
| */ |
| #if defined(__i386__) |
| #define udma_from_device_barrier() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") |
| #elif defined(__x86_64__) |
| #define udma_from_device_barrier() asm volatile("lfence" ::: "memory") |
| #elif defined(__PPC64__) |
| #define udma_from_device_barrier() asm volatile("lwsync" ::: "memory") |
| #elif defined(__PPC__) |
| #define udma_from_device_barrier() asm volatile("sync" ::: "memory") |
| #elif defined(__ia64__) |
| #define udma_from_device_barrier() asm volatile("mf" ::: "memory") |
| #elif defined(__sparc_v9__) |
| #define udma_from_device_barrier() asm volatile("membar #LoadLoad" ::: "memory") |
| #elif defined(__aarch64__) |
| #define udma_from_device_barrier() asm volatile("dmb oshld" ::: "memory") |
| #elif defined(__sparc__) || defined(__s390x__) |
| #define udma_from_device_barrier() asm volatile("" ::: "memory") |
| #elif defined(__loongarch__) |
| #define udma_from_device_barrier() asm volatile("dbar 0" ::: "memory") |
| #elif defined(__riscv) |
| #define udma_from_device_barrier() asm volatile("fence ir,ir" ::: "memory") |
| #elif defined(__mips__) |
| #define udma_from_device_barrier() asm volatile("sync" ::: "memory") |
| #else |
| #error No architecture specific memory barrier defines found! |
| #endif |
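
/* A minimal polling sketch for udma_from_device_barrier(). The cqe layout and
   valid-bit encoding are hypothetical; the barrier sits between observing the
   DMA'd valid bit and reading the rest of the DMA'd data:

     struct cqe { uint32_t data; uint32_t valid; };

     static int poll_cqe(volatile struct cqe *cqe, uint32_t *out)
     {
            if (!cqe->valid)            // observe the ordered DMA store
                    return 0;
            udma_from_device_barrier(); // later reads must see the earlier DMA writes
            *out = cqe->data;           // data was written before the valid bit's MemWr TLP
            return 1;
     }
*/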
| |
| /* Order writes to CPU memory so that a DMA device cannot view writes after |
| the barrier without also seeing all writes before the barrier. This does |
| not guarantee any writes are visible to DMA. |
| |
This would be used in cases where a DMA buffer has a valid bit and
data: the barrier is placed after writing the data but before writing the
valid bit, to ensure the DMA device cannot observe a set valid bit with
unwritten data.
| |
| Compared to udma_to_device_barrier() this barrier is not required to fence |
| anything but normal stores to normal malloc memory. Usage should be: |
| |
| write_wqe |
| udma_to_device_barrier(); // Get user memory ready for DMA |
| wqe->addr = ...; |
| wqe->flags = ...; |
| udma_ordering_write_barrier(); // Guarantee WQE written in order |
| wqe->valid = 1; |
| */ |
| #define udma_ordering_write_barrier() udma_to_device_barrier() |
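
/* A concrete version of the write_wqe sketch above, with a hypothetical WQE
   layout; only the barrier placement is prescribed:

     struct wqe { uint64_t addr; uint32_t flags; uint32_t valid; };

     static void write_wqe(struct wqe *wqe, uint64_t addr, uint32_t flags)
     {
            udma_to_device_barrier();      // get user memory ready for DMA
            wqe->addr = addr;
            wqe->flags = flags;
            udma_ordering_write_barrier(); // device may not see valid without addr/flags
            wqe->valid = 1;
     }
*/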
| |
/* Promptly flush writes to MMIO Write Combining memory.
| This should be used after a write to WC memory. This is both a barrier |
| and a hint to the CPU to flush any buffers to reduce latency to TLP |
| generation. |
| |
| This is not required to have any effect on CPU memory. |
| |
| If done while holding a lock then the ordering of MMIO writes across CPUs |
| must be guaranteed to follow the natural ordering implied by the lock. |
| |
| This must also act as a barrier that prevents write combining, eg |
| *wc_mem = 1; |
| mmio_flush_writes(); |
| *wc_mem = 2; |
| Must always produce two MemWr TLPs, '1' and '2'. Without the barrier |
| the CPU is allowed to produce a single TLP '2'. |
| |
| Note that there is no order guarantee for writes to WC memory without |
| barriers. |
| |
| This is intended to be used in conjunction with WC memory to generate large |
| PCI-E MemWr TLPs from the CPU. |
| */ |
| #if defined(__i386__) |
| #define mmio_flush_writes() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") |
| #elif defined(__x86_64__) |
| #define mmio_flush_writes() asm volatile("sfence" ::: "memory") |
| #elif defined(__PPC64__) |
| #define mmio_flush_writes() asm volatile("sync" ::: "memory") |
| #elif defined(__PPC__) |
| #define mmio_flush_writes() asm volatile("sync" ::: "memory") |
| #elif defined(__ia64__) |
| #define mmio_flush_writes() asm volatile("fwb" ::: "memory") |
| #elif defined(__sparc_v9__) |
| #define mmio_flush_writes() asm volatile("membar #StoreStore" ::: "memory") |
| #elif defined(__aarch64__) |
#define mmio_flush_writes() asm volatile("dsb st" ::: "memory")
| #elif defined(__sparc__) |
| #define mmio_flush_writes() asm volatile("" ::: "memory") |
| #elif defined(__loongarch__) |
| #define mmio_flush_writes() asm volatile("dbar 0" ::: "memory") |
| #elif defined(__riscv) |
| #define mmio_flush_writes() asm volatile("fence ow,ow" ::: "memory") |
| #elif defined(__s390x__) |
| #include "s390_mmio_insn.h" |
| #define mmio_flush_writes() s390_pciwb() |
| #elif defined(__mips__) |
| #define mmio_flush_writes() asm volatile("sync" ::: "memory") |
| #else |
| #error No architecture specific memory barrier defines found! |
| #endif |
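
/* A minimal sketch of generating one large MemWr TLP through a WC mapping.
   The 64-byte 'bf_reg' region and the qword-copy loop are illustrative; the
   flush placement follows the comment above:

     static void post_wc(volatile uint64_t *bf_reg, const uint64_t *wqe)
     {
            int i;

            for (i = 0; i != 8; i++)
                    bf_reg[i] = wqe[i];  // stores accumulate in the CPU's WC buffer
            mmio_flush_writes();         // push the combined MemWr TLP out promptly
     }
*/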
| |
| /* Prevent WC writes from being re-ordered relative to other MMIO |
| writes. This should be used before a write to WC memory. |
| |
| This must act as a barrier to prevent write re-ordering from different |
| memory types: |
| *mmio_mem = 1; |
| mmio_flush_writes(); |
| *wc_mem = 2; |
| Must always produce a TLP '1' followed by '2'. |
| |
This barrier implies udma_to_device_barrier().
| |
| This is intended to be used in conjunction with WC memory to generate large |
| PCI-E MemWr TLPs from the CPU. |
| */ |
| #define mmio_wc_start() mmio_flush_writes() |
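
/* A minimal ordering sketch, assuming a hypothetical provider that writes a
   doorbell through a normal MMIO mapping and then copies a WQE through a WC
   mapping; mmio_wc_start()/mmio_flush_writes() bracket the WC access:

     static void db_then_wc(volatile uint32_t *db_reg, volatile uint64_t *wc_buf,
                            const uint64_t *wqe)
     {
            int i;

            *db_reg = 1;          // ordinary MMIO write
            mmio_wc_start();      // keep the WC writes ordered after the doorbell
            for (i = 0; i != 8; i++)
                    wc_buf[i] = wqe[i];
            mmio_flush_writes();  // generate the MemWr TLP now
     }
*/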
| |
| /* Keep MMIO writes in order. |
Currently we lack writel macros that universally guarantee MMIO
writes happen in order, like the kernel does. Even worse, many
providers haphazardly open-code writes to MMIO memory, omitting even
volatile.
| |
| Until this can be fixed with a proper writel macro, this barrier |
| is a stand in to indicate places where MMIO writes should be switched |
| to some future writel. |
| */ |
| #define mmio_ordered_writes_hack() mmio_flush_writes() |
| |
| /* Write Combining Spinlock primitive |
| |
Any access to a multi-value WC region must ensure that multiple CPUs do not
write to the same values concurrently; these macros make that
straightforward and efficient if the chosen exclusion is a spinlock.

The spinlock guarantees that the WC writes issued within the critical
section are made visible as TLPs to the device. The TLPs must be seen by the
device strictly in the order that the spinlocks are acquired, and combining
WC writes between different sections is not permitted.
| |
Use of these macros allows the fencing inside the spinlock to be combined
| with the fencing required for DMA. |
| */ |
| static inline void mmio_wc_spinlock(pthread_spinlock_t *lock) |
| { |
| pthread_spin_lock(lock); |
| #if !defined(__i386__) && !defined(__x86_64__) |
| /* For x86 the serialization within the spin lock is enough to |
| * strongly order WC and other memory types. */ |
| mmio_wc_start(); |
| #endif |
| } |
| |
| static inline void mmio_wc_spinunlock(pthread_spinlock_t *lock) |
| { |
| /* It is possible that on x86 the atomic in the lock is strong enough |
| * to force-flush the WC buffers quickly, and this SFENCE can be |
| * omitted too. */ |
| mmio_flush_writes(); |
| pthread_spin_unlock(lock); |
| } |
| |
| #endif |