Diffstat (limited to 'src/chunklets/fastspin.c')
-rw-r--r-- | src/chunklets/fastspin.c | 299 |
1 files changed, 299 insertions, 0 deletions
diff --git a/src/chunklets/fastspin.c b/src/chunklets/fastspin.c
new file mode 100644
index 0000000..bfaaf9b
--- /dev/null
+++ b/src/chunklets/fastspin.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright © 2023 Michael Smith <mikesmiffy128@gmail.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+ * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+ * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef __cplusplus
+#error This file should not be compiled as C++. It relies on C-specific \
+keywords and APIs which have syntactically different equivalents for C++.
+#endif
+
+#include <stdatomic.h>
+
+#include "fastspin.h"
+
+_Static_assert(sizeof(int) == sizeof(_Atomic int),
+		"This library assumes that ints in memory can be treated as atomic");
+_Static_assert(_Alignof(int) == _Alignof(_Atomic int),
+		"This library assumes that atomic operations do not need over-alignment");
+
+#if defined(__GNUC__) || defined(__clang__) || defined(__TINYC__)
+#if defined(__i386__) || defined(__x86_64__) || defined(_WIN32) || \
+		defined(__mips__) // same asm syntax for pause
+#define RELAX() __asm__ volatile ("pause" ::: "memory")
+#elif defined(__arm__) || defined(__aarch64__)
+#define RELAX() __asm__ volatile ("yield" ::: "memory")
+#elif defined(__powerpc__) || defined(__ppc64__)
+// POWER7 (2010) - older arches may be less efficient
+#define RELAX() __asm__ volatile ("or 27, 27, 27" ::: "memory")
+#endif
+#elif defined(_MSC_VER)
+#if defined(_M_ARM) || defined(_M_ARM64)
+#define RELAX() __yield()
+#else
+void _mm_pause(); // don't pull in emmintrin.h for this
+#define RELAX() _mm_pause()
+#endif
+#endif
+
+#if defined(__linux__)
+
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#include <unistd.h> // syscall()
+
+// some arches only have a _time64 variant. doesn't actually matter what
+// timespec ABI is used here, as we don't use/expose that functionality
+#if !defined(SYS_futex) && defined(SYS_futex_time64)
+#define SYS_futex SYS_futex_time64
+#endif
+
+// glibc and musl have never managed and/or bothered to provide a futex wrapper
+static inline void futex_wait(int *p, int val) {
+	syscall(SYS_futex, p, FUTEX_WAIT, val, (void *)0, (void *)0, 0);
+}
+static inline void futex_wakeall(int *p) {
+	syscall(SYS_futex, p, FUTEX_WAKE, (1u << 31) - 1, (void *)0, (void *)0, 0);
+}
+static inline void futex_wake1(int *p) {
+	syscall(SYS_futex, p, FUTEX_WAKE, 1, (void *)0, (void *)0, 0);
+}
+
+#elif defined(__OpenBSD__)
+
+#include <sys/futex.h>
+
+// OpenBSD just emulates the Linux call but it still provides a wrapper! Yay!
+static inline void futex_wait(int *p, int val) {
+	futex((void *)p, FUTEX_WAIT, val, (void *)0, (void *)0);
+}
+static inline void futex_wakeall(int *p) {
+	futex((void *)p, FUTEX_WAKE, (1u << 31) - 1, (void *)0, (void *)0);
+}
+static inline void futex_wake1(int *p) {
+	futex((void *)p, FUTEX_WAKE, 1, (void *)0, (void *)0);
+}
+
+#elif defined(__NetBSD__)
+
+#include <sys/futex.h> // for constants
+#include <sys/syscall.h>
+#include <unistd.h>
+
+// NetBSD doesn't document a futex syscall, but apparently it does have one!?
+// Their own pthreads library still doesn't actually use it, go figure. Also, it
+// takes an extra parameter for some reason.
+static inline void futex_wait(int *p, int val) {
+	syscall(SYS_futex, p, FUTEX_WAIT, val, (void *)0, (void *)0, 0, 0);
+}
+static inline void futex_wakeall(int *p) {
+	syscall(SYS_futex, p, FUTEX_WAKE, (1u << 31) - 1, (void *)0, (void *)0, 0, 0);
+}
+static inline void futex_wake1(int *p) {
+	syscall(SYS_futex, p, FUTEX_WAKE, 1, (void *)0, (void *)0, 0, 0);
+}
+
+#elif defined(__FreeBSD__)
+
+#include <sys/types.h> // ugh still no IWYU everywhere. maybe next year
+#include <sys/umtx.h>
+
+static inline void futex_wait(int *p, int val) {
+	_umtx_op(p, UMTX_OP_WAIT_UINT, val, 0, 0);
+}
+static inline void futex_wakeall(int *p) {
+	_umtx_op(p, UMTX_OP_WAKE, (1u << 31) - 1, 0, 0);
+}
+static inline void futex_wake1(int *p) {
+	_umtx_op(p, UMTX_OP_WAKE, 1, 0, 0);
+}
+
+#elif defined(__DragonFly__)
+
+#include <unistd.h>
+
+// An actually good interface. Thank you Matt, very cool.
+static inline void futex_wait(int *p, int val) {
+	umtx_sleep(p, val, 0);
+}
+static inline void futex_wakeall(int *p) {
+	umtx_wakeup(p, 0); // a count of 0 wakes every waiter
+}
+static inline void futex_wake1(int *p) {
+	umtx_wakeup(p, 1);
+}
+
+#elif defined(__APPLE__)
+
+// This stuff is from bsd/sys/ulock.h in XNU. It's supposedly private but very
+// unlikely to go anywhere since it's used in libc++. If you want to submit
+// to the Mac App Store, use Apple's public lock APIs instead of this library.
+extern int __ulock_wait(unsigned int op, void *addr, unsigned long long val,
+		unsigned int timeout);
+extern int __ulock_wake(unsigned int op, void *addr, unsigned long long val);
+
+#define UL_COMPARE_AND_WAIT 1
+#define ULF_WAKE_ALL 0x100
+#define ULF_NO_ERRNO 0x1000000
+
+static inline void futex_wait(int *p, int val) {
+	__ulock_wait(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, p, val, 0);
+}
+static inline void futex_wakeall(int *p) {
+	__ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO | ULF_WAKE_ALL, p, 0);
+}
+static inline void futex_wake1(int *p) {
+	__ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, p, 0);
+}
+
+#elif defined(_WIN32)
+
+#ifdef _WIN64
+typedef unsigned long long usize;
+#else
+typedef unsigned long usize;
+#endif
+
+// There's no header for these because NTAPI. Plus Windows.h sucks anyway.
+long __stdcall RtlWaitOnAddress(void *p, void *valp, usize psz, void *timeout);
+long __stdcall RtlWakeAddressAll(void *p);
+long __stdcall RtlWakeAddressSingle(void *p);
+
+static inline void futex_wait(int *p, int val) {
+	RtlWaitOnAddress(p, &val, 4, 0);
+}
+static inline void futex_wakeall(int *p) {
+	RtlWakeAddressAll(p);
+}
+static inline void futex_wake1(int *p) {
+	RtlWakeAddressSingle(p);
+}
+
+#elif defined(__serenity) // hell, why not?
+
+#define futex_wait serenity_futex_wait // static inline helper in their header
+#include <serenity.h>
+#undef futex_wait
+
+static inline void futex_wait(int *p, int val) {
+	futex(p, FUTEX_WAIT, val, 0, 0, 0);
+}
+static inline void futex_wakeall(int *p) {
+	futex(p, FUTEX_WAKE, (1u << 31) - 1, 0, 0, 0);
+}
+static inline void futex_wake1(int *p) {
+	futex(p, FUTEX_WAKE, 1, 0, 0, 0);
+}
+
+#else
+#ifdef RELAX
+// note: #warning doesn't work in MSVC but we won't hit that case here
+#warning No futex call for this OS. Falling back on pure spinlock. \
+Performance will suffer during contention.
+#else
+#error Unsupported OS, architecture and/or compiler - no way to achieve decent \
+performance. Need either CPU spinlock hints or futexes, ideally both.
+#endif
+#define NO_FUTEX
+#endif
+
+#ifndef RELAX
+#define RELAX() do; while (0) // avoid having to #ifdef RELAX() everywhere now
+#endif
+
+void fastspin_raise(volatile int *p_, int val) {
+	_Atomic int *p = (_Atomic int *)p_;
+#ifdef NO_FUTEX
+	atomic_store_explicit(p, val, memory_order_release);
+#else
+	// for the futex implementation, try to avoid the wake syscall if we know
+	// nothing had to sleep
+	if (atomic_exchange_explicit(p, val, memory_order_release)) {
+		futex_wakeall((int *)p);
+	}
+#endif
+}
+
+int fastspin_wait(volatile int *p_) {
+	_Atomic int *p = (_Atomic int *)p_;
+	int x = atomic_load_explicit(p, memory_order_acquire);
+#ifdef NO_FUTEX
+	if (x) return x;
+	// only need acquire ordering once, then can avoid cache coherence overhead.
+	do {
+		x = atomic_load_explicit(p, memory_order_relaxed);
+		RELAX();
+	} while (!x); // spin until the value gets raised to something nonzero
+#else
+	if (x > 0) return x;
+	if (!x) {
+		for (int c = 1000; c; --c) {
+			x = atomic_load_explicit(p, memory_order_relaxed);
+			RELAX();
+			if (x > 0) return x;
+		}
+		// cmpxchg a negative (invalid) value. this will fail in two cases:
+		// 1. someone else already cmpxchg'd: the futex_wait() will work fine
+		// 2. raise() was already called: the futex_wait() will return instantly
+		atomic_compare_exchange_strong_explicit(p, &(int){0}, -1,
+				memory_order_acq_rel, memory_order_relaxed);
+		futex_wait((int *)p, -1);
+	}
+#endif
+	return atomic_load_explicit(p, memory_order_relaxed);
+}
+
+void fastspin_lock(volatile int *p_) {
+	_Atomic int *p = (_Atomic int *)p_;
+	int x;
+	for (;;) {
+#ifdef NO_FUTEX
+		if (!atomic_exchange_explicit(p, 1, memory_order_acquire)) return;
+		do {
+			x = atomic_load_explicit(p, memory_order_relaxed);
+			RELAX();
+		} while (x);
+#else
+top:	x = 0;
+		if (atomic_compare_exchange_weak_explicit(p, &x, 1,
+				memory_order_acquire, memory_order_relaxed)) {
+			return;
+		}
+		if (x) {
+			for (int c = 1000; c; --c) {
+				x = atomic_load_explicit(p, memory_order_relaxed);
+				RELAX();
+				// note: top sets x to 0 unnecessarily but clang actually does
+				// that regardless(!), probably to break loop-carried dependency
+				if (!x) goto top;
+			}
+			// mark the held lock as contended (1 -> -1) before sleeping, so
+			// unlock knows it has to issue a wake
+			atomic_compare_exchange_strong_explicit(p, &(int){1}, -1,
+					memory_order_acq_rel, memory_order_relaxed);
+			futex_wait((int *)p, -1); // (then spin once more to avoid spuria)
+		}
+#endif
+	}
+}
+
+void fastspin_unlock(volatile int *p_) {
+	_Atomic int *p = (_Atomic int *)p_;
+#ifdef NO_FUTEX
+	atomic_store_explicit(p, 0, memory_order_release);
+#else
+	if (atomic_exchange_explicit(p, 0, memory_order_release) < 0) {
+		futex_wake1((int *)p);
+	}
+#endif
+}
+
+// vi: sw=4 ts=4 noet tw=80 cc=80
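For context on how the four entry points fit together: the flag word used with fastspin_raise/fastspin_wait and the lock word used with fastspin_lock/fastspin_unlock are plain ints that start at zero; raise publishes a caller-chosen positive value and wakes all sleepers, while the lock pair implements a futex-backed spinlock with -1 as the contended marker. The sketch below is illustrative only, assuming nothing beyond what the diff shows: that fastspin.h declares these functions with the signatures above, that raise values are positive, and that a waiter treats a non-positive return from fastspin_wait as a spurious wakeup and retries. All names in it (example.c, worker, ready, lck, counter) are hypothetical.

/* example.c - hypothetical driver for the fastspin API above.
 * Build alongside the library source, e.g.: cc -pthread example.c fastspin.c */
#include <pthread.h>
#include <stdio.h>

#include "fastspin.h"

static volatile int ready;   // raise/wait flag word, starts at 0
static volatile int lck;     // lock word, starts at 0
static int counter;          // protected by lck

static void *worker(void *arg) {
	(void)arg;
	int v;
	// spin briefly, then sleep in the kernel until the main thread raises a
	// positive value; retry on a spurious (non-positive) return
	while ((v = fastspin_wait(&ready)) <= 0) {}
	printf("worker woke up with value %d\n", v);
	fastspin_lock(&lck);
	++counter;
	fastspin_unlock(&lck);
	return 0;
}

int main(void) {
	pthread_t t;
	pthread_create(&t, 0, worker, 0);

	// publish a nonzero value; any thread sleeping in fastspin_wait(&ready)
	// gets woken via the OS futex call
	fastspin_raise(&ready, 1);

	fastspin_lock(&lck);
	++counter;
	fastspin_unlock(&lck);

	pthread_join(t, 0);
	printf("counter = %d\n", counter); // always 2
	return 0;
}

Note that in the uncontended case neither raise nor unlock makes a system call: both do a single atomic exchange and only call into the kernel when the old value shows that someone actually went to sleep, which is the point of the exchange-and-check pattern in the definitions above.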