Diffstat (limited to 'src/chunklets/fastspin.c')
-rw-r--r-- | src/chunklets/fastspin.c | 299 |
1 files changed, 299 insertions, 0 deletions
diff --git a/src/chunklets/fastspin.c b/src/chunklets/fastspin.c
new file mode 100644
index 0000000..bfaaf9b
--- /dev/null
+++ b/src/chunklets/fastspin.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright © 2023 Michael Smith <mikesmiffy128@gmail.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+ * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+ * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef __cplusplus
+#error This file should not be compiled as C++. It relies on C-specific \
+keywords and APIs which have syntactically different equivalents for C++.
+#endif
+
+#include <stdatomic.h>
+
+#include "fastspin.h"
+
+_Static_assert(sizeof(int) == sizeof(_Atomic int),
+		"This library assumes that ints in memory can be treated as atomic");
+_Static_assert(_Alignof(int) == _Alignof(_Atomic int),
+		"This library assumes that atomic operations do not need over-alignment");
+
+#if defined(__GNUC__) || defined(__clang__) || defined(__TINYC__)
+#if defined(__i386__) || defined(__x86_64__) || defined(_WIN32) || \
+		defined(__mips__) // same asm syntax for pause
+#define RELAX() __asm__ volatile ("pause" ::: "memory")
+#elif defined(__arm__) || defined(__aarch64__)
+#define RELAX() __asm__ volatile ("yield" ::: "memory")
+#elif defined(__powerpc__) || defined(__ppc64__)
+// POWER7 (2010) - older arches may be less efficient
+#define RELAX() __asm__ volatile ("or 27, 27, 27" ::: "memory")
+#endif
+#elif defined(_MSC_VER)
+#if defined(_M_ARM) || defined(_M_ARM64)
+#define RELAX() __yield()
+#else
+void _mm_pause(); // don't pull in emmintrin.h for this
+#define RELAX() _mm_pause()
+#endif
+#endif
+
+#if defined(__linux__)
+
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#include <unistd.h> // syscall()
+
+// some arches only have a _time64 variant. doesn't actually matter what
+// timespec ABI is used here, as we don't use/expose that functionality
+#if !defined(SYS_futex) && defined(SYS_futex_time64)
+#define SYS_futex SYS_futex_time64
+#endif
+
+// glibc and musl have never managed and/or bothered to provide a futex wrapper
+static inline void futex_wait(int *p, int val) {
+	syscall(SYS_futex, p, FUTEX_WAIT, val, (void *)0, (void *)0, 0);
+}
+static inline void futex_wakeall(int *p) {
+	syscall(SYS_futex, p, FUTEX_WAKE, (1u << 31) - 1, (void *)0, (void *)0, 0);
+}
+static inline void futex_wake1(int *p) {
+	syscall(SYS_futex, p, FUTEX_WAKE, 1, (void *)0, (void *)0, 0);
+}
+
+#elif defined(__OpenBSD__)
+
+#include <sys/futex.h>
+
+// OpenBSD just emulates the Linux call but it still provides a wrapper! Yay!
+static inline void futex_wait(int *p, int val) {
+	futex((void *)p, FUTEX_WAIT, val, (void *)0, (void *)0);
+}
+static inline void futex_wakeall(int *p) {
+	futex((void *)p, FUTEX_WAKE, (1u << 31) - 1, (void *)0, (void *)0);
+}
+static inline void futex_wake1(int *p) {
+	futex((void *)p, FUTEX_WAKE, 1, (void *)0, (void *)0);
+}
+
+#elif defined(__NetBSD__)
+
+#include <sys/futex.h> // for constants
+#include <sys/syscall.h>
+#include <unistd.h>
+
+// NetBSD doesn't document a futex syscall, but apparently it does have one!?
+// Their own pthreads library still doesn't actually use it, go figure. Also, it
+// takes an extra parameter for some reason.
+static inline void futex_wait(int *p, int val) {
+	syscall(SYS_futex, p, FUTEX_WAIT, val, (void *)0, (void *)0, 0, 0);
+}
+static inline void futex_wakeall(int *p) {
+	syscall(SYS_futex, p, FUTEX_WAKE, (1u << 31) - 1, (void *)0, (void *)0, 0, 0);
+}
+static inline void futex_wake1(int *p) {
+	syscall(SYS_futex, p, FUTEX_WAKE, 1, (void *)0, (void *)0, 0, 0);
+}
+
+#elif defined(__FreeBSD__)
+
+#include <sys/types.h> // ugh still no IWYU everywhere. maybe next year
+#include <sys/umtx.h>
+
+static inline void futex_wait(int *p, int val) {
+	_umtx_op(p, UMTX_OP_WAIT_UINT, val, 0, 0);
+}
+static inline void futex_wakeall(int *p) {
+	_umtx_op(p, UMTX_OP_WAKE, (1u << 31) - 1, 0, 0);
+}
+static inline void futex_wake1(int *p) {
+	_umtx_op(p, UMTX_OP_WAKE, 1, 0, 0);
+}
+
+#elif defined(__DragonFly__)
+
+#include <unistd.h>
+
+// An actually good interface. Thank you Matt, very cool.
+static inline void futex_wait(int *p, int val) {
+	umtx_sleep(p, val, 0);
+}
+static inline void futex_wakeall(int *p) {
+	umtx_wakeup(p, 0); // a count of 0 wakes every waiter
+}
+static inline void futex_wake1(int *p) {
+	umtx_wakeup(p, 1);
+}
+
+#elif defined(__APPLE__)
+
+// This stuff is from bsd/sys/ulock.h in XNU. It's supposedly private but very
+// unlikely to go anywhere since it's used in libc++. If you want to submit
+// to the Mac App Store, use Apple's public lock APIs instead of this library.
+extern int __ulock_wait(unsigned int op, void *addr, unsigned long long val,
+		unsigned int timeout);
+extern int __ulock_wake(unsigned int op, void *addr, unsigned long long val);
+
+#define UL_COMPARE_AND_WAIT 1
+#define ULF_WAKE_ALL 0x100
+#define ULF_NO_ERRNO 0x1000000
+
+static inline void futex_wait(int *p, int val) {
+	__ulock_wait(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, p, val, 0);
+}
+static inline void futex_wakeall(int *p) {
+	__ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO | ULF_WAKE_ALL, p, 0);
+}
+static inline void futex_wake1(int *p) {
+	__ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, p, 0);
+}
+
+#elif defined(_WIN32)
+
+#ifdef _WIN64
+typedef unsigned long long usize;
+#else
+typedef unsigned long usize;
+#endif
+
+// There's no header for these because NTAPI. Plus Windows.h sucks anyway.
+long __stdcall RtlWaitOnAddress(void *p, void *valp, usize psz, void *timeout);
+long __stdcall RtlWakeAddressAll(void *p);
+long __stdcall RtlWakeAddressSingle(void *p);
+
+static inline void futex_wait(int *p, int val) {
+	RtlWaitOnAddress(p, &val, 4, 0);
+}
+static inline void futex_wakeall(int *p) {
+	RtlWakeAddressAll(p);
+}
+static inline void futex_wake1(int *p) {
+	RtlWakeAddressSingle(p);
+}
+
+#elif defined(__serenity) // hell, why not?
+
+#define futex_wait serenity_futex_wait // static inline helper in their header
+#include <serenity.h>
+#undef futex_wait
+
+static inline void futex_wait(int *p, int val) {
+	futex(p, FUTEX_WAIT, val, 0, 0, 0);
+}
+static inline void futex_wakeall(int *p) {
+	futex(p, FUTEX_WAKE, (1u << 31) - 1, 0, 0, 0);
+}
+static inline void futex_wake1(int *p) {
+	futex(p, FUTEX_WAKE, 1, 0, 0, 0);
+}
+
+#else
+#ifdef RELAX
+// note: #warning doesn't work in MSVC but we won't hit that case here
+#warning No futex call for this OS. Falling back on pure spinlock. \
+Performance will suffer during contention.
+#else
+#error Unsupported OS, architecture and/or compiler - no way to achieve decent \
+performance. Need either CPU spinlock hints or futexes, ideally both.
+#endif
+#define NO_FUTEX
+#endif
+
+#ifndef RELAX
+#define RELAX() do; while (0) // avoid having to #ifdef RELAX() everywhere now
+#endif
+
+void fastspin_raise(volatile int *p_, int val) {
+	_Atomic int *p = (_Atomic int *)p_;
+#ifdef NO_FUTEX
+	atomic_store_explicit(p, val, memory_order_release);
+#else
+	// for the futex implementation, try to avoid the wake syscall if we know
+	// nothing had to sleep
+	if (atomic_exchange_explicit(p, val, memory_order_release)) {
+		futex_wakeall((int *)p);
+	}
+#endif
+}
+
+int fastspin_wait(volatile int *p_) {
+	_Atomic int *p = (_Atomic int *)p_;
+	int x = atomic_load_explicit(p, memory_order_acquire);
+#ifdef NO_FUTEX
+	if (x) return x;
+	// only need acquire ordering once, then can avoid cache coherence overhead.
+	do {
+		x = atomic_load_explicit(p, memory_order_relaxed);
+		RELAX();
+	} while (!x); // spin until the value gets raised to something nonzero
+#else
+	if (x > 0) return x;
+	if (!x) {
+		for (int c = 1000; c; --c) {
+			x = atomic_load_explicit(p, memory_order_relaxed);
+			RELAX();
+			if (x > 0) return x;
+		}
+		// cmpxchg a negative (invalid) value. this will fail in two cases:
+		// 1. someone else already cmpxchg'd: the futex_wait() will work fine
+		// 2. raise() was already called: the futex_wait() will return instantly
+		atomic_compare_exchange_strong_explicit(p, &(int){0}, -1,
+				memory_order_acq_rel, memory_order_relaxed);
+		futex_wait((int *)p, -1);
+	}
+#endif
+	return atomic_load_explicit(p, memory_order_relaxed);
+}
+
+void fastspin_lock(volatile int *p_) {
+	_Atomic int *p = (_Atomic int *)p_;
+	int x;
+	for (;;) {
+#ifdef NO_FUTEX
+		if (!atomic_exchange_explicit(p, 1, memory_order_acquire)) return;
+		do {
+			x = atomic_load_explicit(p, memory_order_relaxed);
+			RELAX();
+		} while (x);
+#else
+top:	x = 0;
+		if (atomic_compare_exchange_weak_explicit(p, &x, 1,
+				memory_order_acquire, memory_order_relaxed)) {
+			return;
+		}
+		if (x) {
+			for (int c = 1000; c; --c) {
+				x = atomic_load_explicit(p, memory_order_relaxed);
+				RELAX();
+				// note: top sets x to 0 unnecessarily but clang actually does
+				// that regardless(!), probably to break loop-carried dependency
+				if (!x) goto top;
+			}
+			// mark the held lock as contended (1 -> -1) before sleeping, so
+			// unlock knows it has to issue a wake
+			atomic_compare_exchange_strong_explicit(p, &(int){1}, -1,
+					memory_order_acq_rel, memory_order_relaxed);
+			futex_wait((int *)p, -1); // (then spin once more to avoid spuria)
+		}
+#endif
+	}
+}
+
+void fastspin_unlock(volatile int *p_) {
+	_Atomic int *p = (_Atomic int *)p_;
+#ifdef NO_FUTEX
+	atomic_store_explicit(p, 0, memory_order_release);
+#else
+	if (atomic_exchange_explicit(p, 0, memory_order_release) < 0) {
+		futex_wake1((int *)p);
+	}
+#endif
+}
+
+// vi: sw=4 ts=4 noet tw=80 cc=80
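For context on how the four entry points fit together: the flag word used with fastspin_raise/fastspin_wait and the lock word used with fastspin_lock/fastspin_unlock are plain ints that start at zero; raise publishes a caller-chosen positive value and wakes all sleepers, while the lock pair implements a futex-backed spinlock with -1 as the contended marker. The sketch below is illustrative only, assuming nothing beyond what the diff shows: that fastspin.h declares these functions with the signatures above, that raise values are positive, and that a waiter treats a non-positive return from fastspin_wait as a spurious wakeup and retries. All names in it (example.c, worker, ready, lck, counter) are hypothetical.

/* example.c - hypothetical driver for the fastspin API above.
 * Build alongside the library source, e.g.: cc -pthread example.c fastspin.c */
#include <pthread.h>
#include <stdio.h>

#include "fastspin.h"

static volatile int ready;   // raise/wait flag word, starts at 0
static volatile int lck;     // lock word, starts at 0
static int counter;          // protected by lck

static void *worker(void *arg) {
	(void)arg;
	int v;
	// spin briefly, then sleep in the kernel until the main thread raises a
	// positive value; retry on a spurious (non-positive) return
	while ((v = fastspin_wait(&ready)) <= 0) {}
	printf("worker woke up with value %d\n", v);
	fastspin_lock(&lck);
	++counter;
	fastspin_unlock(&lck);
	return 0;
}

int main(void) {
	pthread_t t;
	pthread_create(&t, 0, worker, 0);

	// publish a nonzero value; any thread sleeping in fastspin_wait(&ready)
	// gets woken via the OS futex call
	fastspin_raise(&ready, 1);

	fastspin_lock(&lck);
	++counter;
	fastspin_unlock(&lck);

	pthread_join(t, 0);
	printf("counter = %d\n", counter); // always 2
	return 0;
}

Note that in the uncontended case neither raise nor unlock makes a system call: both do a single atomic exchange and only call into the kernel when the old value shows that someone actually went to sleep, which is the point of the exchange-and-check pattern in the definitions above.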