/*
 * Copyright © 2023 Michael Smith
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
 */

#ifdef __cplusplus
#error This file should not be compiled as C++. It relies on C-specific \
keywords and APIs which have syntactically different equivalents for C++.
#endif

#include <stdatomic.h>

#include "fastspin.h"

_Static_assert(sizeof(int) == sizeof(_Atomic int),
		"This library assumes that ints in memory can be treated as atomic");
_Static_assert(_Alignof(int) == _Alignof(_Atomic int),
		"This library assumes that atomic operations do not need over-alignment");

#if defined(__GNUC__) || defined(__clang__) || defined(__TINYC__)
#if defined(__i386__) || defined(__x86_64__) || defined(_WIN32) || \
		defined(__mips__) // same asm syntax for pause
#define RELAX() __asm__ volatile ("pause" ::: "memory")
#elif defined(__arm__) || defined(__aarch64__)
#define RELAX() __asm__ volatile ("yield" ::: "memory")
#elif defined(__powerpc__) || defined(__ppc64__)
// POWER7 (2010) - older arches may be less efficient
#define RELAX() __asm__ volatile ("or 27, 27, 27" ::: "memory")
#endif
#elif defined(_MSC_VER)
#if defined(_M_ARM) || defined(_M_ARM64)
#define RELAX() __yield()
#else
void _mm_pause(); // don't pull in emmintrin.h for this
#define RELAX() _mm_pause()
#endif
#endif

#if defined(__linux__)

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

// some arches only have a _time64 variant. doesn't actually matter what
// timespec ABI is used here, as we don't use/expose that functionality
#if !defined(SYS_futex) && defined(SYS_futex_time64)
#define SYS_futex SYS_futex_time64
#endif

// glibc and musl have never managed and/or bothered to provide a futex wrapper
static inline void futex_wait(int *p, int val) {
	syscall(SYS_futex, p, FUTEX_WAIT, val, (void *)0, (void *)0, 0);
}
static inline void futex_wakeall(int *p) {
	syscall(SYS_futex, p, FUTEX_WAKE, (1u << 31) - 1, (void *)0, (void *)0, 0);
}
static inline void futex_wake1(int *p) {
	syscall(SYS_futex, p, FUTEX_WAKE, 1, (void *)0, (void *)0, 0);
}

#elif defined(__OpenBSD__)

#include <sys/futex.h>

// OpenBSD just emulates the Linux call but it still provides a wrapper! Yay!
static inline void futex_wait(int *p, int val) {
	futex((volatile uint32_t *)p, FUTEX_WAIT, val, (void *)0, (void *)0);
}
static inline void futex_wakeall(int *p) {
	futex((volatile uint32_t *)p, FUTEX_WAKE, (1u << 31) - 1, (void *)0,
			(void *)0);
}
static inline void futex_wake1(int *p) {
	futex((volatile uint32_t *)p, FUTEX_WAKE, 1, (void *)0, (void *)0);
}

#elif defined(__NetBSD__)

#include <sys/futex.h> // for constants
#include <sys/syscall.h>
#include <unistd.h>

// NetBSD doesn't document a futex syscall, but apparently it does have one!?
// Their own pthreads library still doesn't actually use it, go figure. Also, it
// takes an extra parameter for some reason.
static inline void futex_wait(int *p, int val) {
	syscall(SYS___futex, p, FUTEX_WAIT, val, (void *)0, (void *)0, 0, 0);
}
static inline void futex_wakeall(int *p) {
	syscall(SYS___futex, p, FUTEX_WAKE, (1u << 31) - 1, (void *)0, (void *)0,
			0, 0);
}
static inline void futex_wake1(int *p) {
	syscall(SYS___futex, p, FUTEX_WAKE, 1, (void *)0, (void *)0, 0, 0);
}

#elif defined(__FreeBSD__)

#include <sys/types.h> // ugh still no IWYU everywhere. maybe next year
#include <sys/umtx.h>

static inline void futex_wait(int *p, int val) {
	_umtx_op(p, UMTX_OP_WAIT_UINT, val, 0, 0);
}
static inline void futex_wakeall(int *p) {
	_umtx_op(p, UMTX_OP_WAKE, (1u << 31) - 1, 0, 0);
}
static inline void futex_wake1(int *p) {
	_umtx_op(p, UMTX_OP_WAKE, 1, 0, 0);
}

#elif defined(__DragonFly__)

#include <unistd.h>

// An actually good interface. Thank you Matt, very cool.
static inline void futex_wait(int *p, int val) {
	umtx_sleep(p, val, 0);
}
static inline void futex_wakeall(int *p) {
	umtx_wakeup(p, 0); // a count of 0 wakes every waiter
}
static inline void futex_wake1(int *p) {
	umtx_wakeup(p, 1);
}

#elif defined(__APPLE__)

// This stuff is from bsd/sys/ulock.h in XNU. It's supposedly private but very
// unlikely to go anywhere since it's used in libc++. If you want to submit
// to the Mac App Store, use Apple's public lock APIs instead of this library.
extern int __ulock_wait(unsigned int op, void *addr, unsigned long long val,
		unsigned int timeout);
extern int __ulock_wake(unsigned int op, void *addr, unsigned long long val);
#define UL_COMPARE_AND_WAIT 1
#define ULF_WAKE_ALL 0x100
#define ULF_NO_ERRNO 0x1000000

static inline void futex_wait(int *p, int val) {
	__ulock_wait(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, p, val, 0);
}
static inline void futex_wakeall(int *p) {
	__ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO | ULF_WAKE_ALL, p, 0);
}
static inline void futex_wake1(int *p) {
	__ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, p, 0);
}

#elif defined(_WIN32)

#ifdef _WIN64
typedef unsigned long long usize;
#else
typedef unsigned long usize;
#endif

// There's no header for these because NTAPI. Plus Windows.h sucks anyway.
long __stdcall RtlWaitOnAddress(void *p, void *valp, usize psz, void *timeout);
long __stdcall RtlWakeAddressAll(void *p);
long __stdcall RtlWakeAddressSingle(void *p);

static inline void futex_wait(int *p, int val) {
	RtlWaitOnAddress(p, &val, 4, 0);
}
static inline void futex_wakeall(int *p) {
	RtlWakeAddressAll(p);
}
static inline void futex_wake1(int *p) {
	RtlWakeAddressSingle(p);
}

#elif defined(__serenity) // hell, why not?

#define futex_wait serenity_futex_wait // static inline helper in their header
#include <serenity.h>
#undef futex_wait

static inline void futex_wait(int *p, int val) {
	futex(p, FUTEX_WAIT, val, 0, 0, 0);
}
static inline void futex_wakeall(int *p) {
	futex(p, FUTEX_WAKE, (1u << 31) - 1, 0, 0, 0);
}
static inline void futex_wake1(int *p) {
	futex(p, FUTEX_WAKE, 1, 0, 0, 0);
}

#else

#ifdef RELAX
// note: #warning doesn't work in MSVC but we won't hit that case here
#warning No futex call for this OS. Falling back on pure spinlock. \
Performance will suffer during contention.
#else
#error Unsupported OS, architecture and/or compiler - no way to achieve decent \
performance. Need either CPU spinlock hints or futexes, ideally both.
#endif

#define NO_FUTEX

#endif

#ifndef RELAX
#define RELAX() do; while (0) // avoid having to #ifdef RELAX everywhere now
#endif

void fastspin_raise(volatile int *p_, int val) {
	_Atomic int *p = (_Atomic int *)p_;
#ifdef NO_FUTEX
	atomic_store_explicit(p, val, memory_order_release);
#else
	// for the futex implementation, try to avoid the wake syscall if we know
	// nothing had to sleep
	if (atomic_exchange_explicit(p, val, memory_order_release)) {
		futex_wakeall((int *)p);
	}
#endif
}

int fastspin_wait(volatile int *p_) {
	_Atomic int *p = (_Atomic int *)p_;
	int x = atomic_load_explicit(p, memory_order_acquire);
#ifdef NO_FUTEX
	if (x) return x;
	// only need acquire ordering once, then can avoid cache coherence overhead
	do {
		x = atomic_load_explicit(p, memory_order_relaxed);
		RELAX();
	} while (!x);
	return x;
#else
	if (x > 0) return x;
	if (!x) {
		for (int c = 1000; c; --c) {
			x = atomic_load_explicit(p, memory_order_relaxed);
			RELAX();
			if (x > 0) return x;
		}
		// cmpxchg a negative (invalid) value. this will fail in two cases:
		// 1. someone else already cmpxchg'd: the futex_wait() will work fine
		// 2. raise() was already called: the futex_wait() will return instantly
		atomic_compare_exchange_strong_explicit(p, &(int){0}, -1,
				memory_order_acq_rel, memory_order_relaxed);
	}
	// also reached when x was already negative (another waiter marked it
	// first); in that case we still have to sleep until raise() wakes us
	futex_wait((int *)p, -1);
	return atomic_load_explicit(p, memory_order_relaxed);
#endif
}

void fastspin_lock(volatile int *p_) {
	_Atomic int *p = (_Atomic int *)p_;
	int x;
	for (;;) {
#ifdef NO_FUTEX
		if (!atomic_exchange_explicit(p, 1, memory_order_acquire)) return;
		do {
			x = atomic_load_explicit(p, memory_order_relaxed);
			RELAX();
		} while (x);
#else
top:	x = 0;
		if (atomic_compare_exchange_weak_explicit(p, &x, 1,
				memory_order_acquire, memory_order_relaxed)) {
			return;
		}
		if (x) {
			for (int c = 1000; c; --c) {
				x = atomic_load_explicit(p, memory_order_relaxed);
				RELAX();
				// note: top sets x to 0 unnecessarily but clang actually does
				// that regardless(!), probably to break loop-carried dependency
				if (!x) goto top;
			}
			// mark the lock as contended (held value 1 -> -1) so unlock knows
			// it must issue a wake; if the holder released in the meantime
			// this fails and the futex_wait() below returns immediately
			atomic_compare_exchange_strong_explicit(p, &(int){1}, -1,
					memory_order_acq_rel, memory_order_relaxed);
			futex_wait((int *)p, -1);
			// after waking, try to take the lock in the contended state so a
			// later unlock still wakes any other sleepers we can't know about
			if (atomic_compare_exchange_strong_explicit(p, &(int){0}, -1,
					memory_order_acquire, memory_order_relaxed)) {
				return;
			}
			// (otherwise spin once more to avoid spurious wakeups)
		}
#endif
	}
}

void fastspin_unlock(volatile int *p_) {
	_Atomic int *p = (_Atomic int *)p_;
#ifdef NO_FUTEX
	atomic_store_explicit(p, 0, memory_order_release);
#else
	if (atomic_exchange_explicit(p, 0, memory_order_release) < 0) {
		futex_wake1((int *)p);
	}
#endif
}

// vi: sw=4 ts=4 noet tw=80 cc=80
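
// Usage sketch (not part of the library): a minimal illustration of how the
// API above is meant to be driven, assuming C11 <threads.h> is available and
// that "fastspin.h" declares these functions as defined in this file. The
// FASTSPIN_EXAMPLE guard, the worker() function and main() are purely
// illustrative names, not something the library provides; build with
// -DFASTSPIN_EXAMPLE to compile them.
#ifdef FASTSPIN_EXAMPLE
#include <stdio.h>
#include <threads.h>

static volatile int ready; // one-shot event: raised once, waited on by workers
static volatile int lock;  // mutual exclusion around the shared counter
static int counter;

static int worker(void *arg) {
	(void)arg;
	// spin briefly, then sleep (when futexes are available) until the main
	// thread publishes a nonzero value via fastspin_raise()
	int val = fastspin_wait(&ready);
	fastspin_lock(&lock);
	counter += val; // protected by the lock
	fastspin_unlock(&lock);
	return 0;
}

int main(void) {
	thrd_t th[4];
	for (int i = 0; i < 4; ++i) thrd_create(&th[i], &worker, 0);
	fastspin_raise(&ready, 1); // wake every waiter with the (positive) value 1
	for (int i = 0; i < 4; ++i) thrd_join(th[i], 0);
	printf("counter = %d\n", counter); // expected: 4
	return 0;
}
#endif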