diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/ac.c | 257 | ||||
-rw-r--r-- | src/bitbuf.h | 58 | ||||
-rw-r--r-- | src/build/cmeta.c | 15 | ||||
-rw-r--r-- | src/build/codegen.c | 13 | ||||
-rw-r--r-- | src/chunklets/README | 27 | ||||
-rw-r--r-- | src/chunklets/README-fastspin | 109 | ||||
-rw-r--r-- | src/chunklets/README-msg | 55 | ||||
-rw-r--r-- | src/chunklets/cacheline.h | 45 | ||||
-rw-r--r-- | src/chunklets/fastspin.c | 299 | ||||
-rw-r--r-- | src/chunklets/fastspin.h | 65 | ||||
-rw-r--r-- | src/chunklets/msg.c | 275 | ||||
-rw-r--r-- | src/chunklets/msg.h | 350 | ||||
-rw-r--r-- | src/crypto.c | 24 | ||||
-rw-r--r-- | src/crypto.h | 14 | ||||
-rw-r--r-- | src/democustom.c | 113 | ||||
-rw-r--r-- | src/democustom.h | 7 | ||||
-rw-r--r-- | src/demorec.c | 20 | ||||
-rw-r--r-- | src/demorec.h | 25 | ||||
-rw-r--r-- | src/engineapi.c | 2 | ||||
-rw-r--r-- | src/engineapi.h | 17 | ||||
-rw-r--r-- | src/os-win32.h | 4 | ||||
-rw-r--r-- | src/rinput.c | 2 | ||||
-rw-r--r-- | src/sst.c | 109 | ||||
-rw-r--r-- | src/sst.h | 7 |
24 files changed, 1700 insertions, 212 deletions
@@ -20,14 +20,21 @@ #include <immintrin.h> #endif +#include "alias.h" #include "bind.h" +#include "chunklets/fastspin.h" +#include "chunklets/msg.h" #include "con_.h" +#include "crypto.h" +#include "democustom.h" +#include "demorec.h" #include "hook.h" #include "engineapi.h" #include "errmsg.h" #include "event.h" #include "feature.h" #include "gamedata.h" +#include "gametype.h" #include "intdefs.h" #include "mem.h" #include "os.h" @@ -37,33 +44,91 @@ #include "x86.h" #include "x86util.h" +#ifdef _WIN32 +#include <werapi.h> // must be after Windows.h (via os.h) +#endif + FEATURE() REQUIRE(bind) REQUIRE(democustom) REQUIRE_GAMEDATA(vtidx_GetDesktopResolution) REQUIRE_GAMEDATA(vtidx_DispatchAllStoredGameMessages) +REQUIRE_GLOBAL(pluginhandler) + +static bool enabled = false; -static bool lockdown = false; +// mild overkill: 1 page of memory that won't be coredumped or swapped to disk +static struct keybox { + union { uchar prv[32], shr[32]; }; + uchar tmp[32], pub[32], lbpub[32]; // NOTE: these 3 must be kept contiguous! + union { u64 nonce; uchar nonce_bytes[8]; }; + crypto_rng_ctx rng; // NOTE: keep this at the end, for wipesessionkeys() +} *keybox; + +enum { + LBPK_L4D +}; +const uchar lbpubkeys[1][32] = { + // L4D series (PLACEHOLDERS for now; this whole thing is unfinished!) + 0x4A, 0xF3, 0xE2, 0xFC, 0x9C, 0x4E, 0xCB, 0xF9, + 0xBD, 0xB8, 0xA9, 0xFC, 0x0E, 0xF7, 0x93, 0x9C, + 0xC3, 0x09, 0x43, 0xB2, 0x6E, 0x7B, 0x1F, 0x19, + 0x40, 0x05, 0xE9, 0x60, 0x43, 0xE8, 0xE2, 0x03 +}; + +static void newsessionkeys(void) { + crypto_rng_read(&keybox->rng, keybox->prv, sizeof(keybox->prv)); + crypto_x25519_public_key(keybox->pub, keybox->prv); + crypto_x25519(keybox->tmp, keybox->prv, keybox->lbpub); + // dumbest, safest possible key derivation, because I'm not a cryptographer. + // future versions of the custom demo protocol COULD get something faster + // (like something with hchacha20, if only I could find enough info on that) + crypto_blake2b(keybox->shr, sizeof(keybox->tmp), keybox->tmp, 96); + crypto_wipe(keybox->tmp, sizeof(keybox->tmp)); + keybox->nonce = 0; +} + +static void wipesessionkeys(void) { + crypto_wipe(keybox->prv, offsetof(struct keybox, rng)); +} + +HANDLE_EVENT(DemoRecordStarting, void) { if (enabled) newsessionkeys(); } +HANDLE_EVENT(DemoRecordStopped, int ndemos) { if (enabled) wipesessionkeys(); } #ifdef _WIN32 static void *gamewin, *inhookwin, *inhookthr; static ulong inhooktid; -// UINT_PTR is a **stupid** typedef, but whatever. -static ssize __stdcall kproc(int code, UINT_PTR wp, ssize lp) { +static ssize __stdcall kproc(int code, usize wp, ssize lp) { KBDLLHOOKSTRUCT *data = (KBDLLHOOKSTRUCT *)lp; - if (lockdown && data->flags & LLKHF_INJECTED && + if (enabled && data->flags & LLKHF_INJECTED && GetForegroundWindow() == gamewin) { - return 1; + // maybe this input is reasonable, but log it for closer inspection + // TODO(rta): figure out what to do with this stuff + // something like the following, but with a proper abstraction... + //uchar buf[28 + 16], *p = buf; + //msg_putasz4(p, 2); p += 1; + // msg_putssz5(p, 8); memcpy(p + 1, "FakeKey", 7); p += 8; + // msg_putmsz4(p, 2); p += 1; + // msg_putssz5(p, 3); memcpy(p + 1, "vk", 2); p += 3; + // p += msg_putu32(p, data->vkCode); + // msg_putssz5(p, 3); memcpy(p + 1, "scan", 4); p += 5; + // p += msg_putu32(p, data->scanCode); + //++keybox->nonce; + //// append mac at end of message + //crypto_aead_lock_djb(buf, p, keybox->shr, keybox->nonce_bytes, 0, 0, + // buf, p - buf); + //democustom_write(buf, p - buf + 16); } return CallNextHookEx(0, code, wp, lp); } -static ssize __stdcall mproc(int code, UINT_PTR wp, ssize lp) { +static ssize __stdcall mproc(int code, usize wp, ssize lp) { MSLLHOOKSTRUCT *data = (MSLLHOOKSTRUCT *)lp; - if (lockdown && data->flags & LLMHF_INJECTED && + if (enabled && data->flags & LLMHF_INJECTED && GetForegroundWindow() == gamewin) { + // no way this input would ever be reasonable. just discard it return 1; } return CallNextHookEx(0, code, wp, lp); @@ -72,46 +137,48 @@ static ssize __stdcall mproc(int code, UINT_PTR wp, ssize lp) { // this is its own thread to meet the strict timing deadline, otherwise the // hook gets silently removed. plus, we don't wanna incur latency anyway. static ulong __stdcall inhookthrmain(void *param) { - volatile u32 *sig = param; - if (!SetWindowsHookExW(WH_KEYBOARD_LL, &kproc, 0, 0) || - !SetWindowsHookExW(WH_MOUSE_LL, &mproc, 0, 0)) { - *sig = 2; + volatile int *sig = param; + if (!SetWindowsHookExW(WH_KEYBOARD_LL, (HOOKPROC)&kproc, 0, 0) || + !SetWindowsHookExW(WH_MOUSE_LL, (HOOKPROC)&mproc, 0, 0)) { + fastspin_raise(sig, 2); return -1; } - *sig = 1; + fastspin_raise(sig, 1); MSG m; int ret; while ((ret = GetMessageW(&m, inhookwin, 0, 0)) > 0) DispatchMessage(&m); return ret; } -static WNDPROC orig_wndproc; -static ssize __stdcall hook_wndproc(void *wnd, uint msg, UINT_PTR wp, ssize lp) { - if (msg == WM_COPYDATA && lockdown) return DefWindowProcW(wnd, msg, wp, lp); - return orig_wndproc(wnd, msg, wp, lp); +static ssize orig_wndproc; +static ssize __stdcall hook_wndproc(void *wnd, uint msg, usize wp, ssize lp) { + if (msg == WM_COPYDATA && enabled) return DefWindowProcW(wnd, msg, wp, lp); + return CallWindowProcA((WNDPROC)orig_wndproc, wnd, msg, wp, lp); } static bool win32_init(void) { - gamewin = FindWindowW(L"Valve001", 0); + // note: using A instead of W to avoid some weirdness with handles... + gamewin = FindWindowA("Valve001", 0); // note: error messages here are a bit cryptic on purpose, but easy to find // in the code. in other words, we're hiding in plain sight :-) if (!gamewin) { errmsg_errorsys("failed to find window"); return false; } - orig_wndproc = (WNDPROC)SetWindowLongPtrW(gamewin, GWLP_WNDPROC, - (ssize)hook_wndproc); - if (!orig_wndproc) { + orig_wndproc = SetWindowLongPtrA(gamewin, GWLP_WNDPROC, + (ssize)&hook_wndproc); + if (!orig_wndproc) { // XXX: assuming 0 won't be legitimately returned errmsg_errorsys("failed to attach message handler"); + return false; } return true; } static void win32_end(void) { // no error handling here because we'd crash either way. good luck! - SetWindowLongW(gamewin, GWLP_WNDPROC, (ssize)orig_wndproc); + SetWindowLongPtrA(gamewin, GWLP_WNDPROC, orig_wndproc); } -static void inhook_start(volatile u32 *sig) { +static void inhook_start(volatile int *sig) { inhookwin = CreateWindowW(L"sst-eventloop", L"sst-eventloop", WS_DISABLED, 0, 0, 0, 0, HWND_MESSAGE, 0, 0, 0); inhookthr = CreateThread(0, 0, &inhookthrmain, (u32 *)sig, 0, &inhooktid); @@ -127,7 +194,7 @@ static void inhook_check(void) { // won't matter in practice but... this kind of sucks. con_warn("** sst: ERROR in message loop, abandoning RTA mode! **"); // TODO(rta): stop demos, and stuff. - lockdown = false; + enabled = false; } } } @@ -138,12 +205,14 @@ static void inhook_stop(void) { errmsg_warnsys("couldn't wait for thread, status unknown"); // XXX: now what!? } - // assume WAIT_OBJECT_0 - ulong status; - GetExitCodeThread(inhookthr, &status); - if (status) { - // not much else we can do now! - con_warn("warning: RTA mode message loop had an error during shutdown"); + else { + // assume WAIT_OBJECT_0 + ulong status; + GetExitCodeThread(inhookthr, &status); + if (status) { + // not much else we can do now! + errmsg_errorx("message loop didn't shut down cleanly\n"); + } } CloseHandle(inhookthr); } @@ -155,22 +224,18 @@ static void inhook_stop(void) { #endif bool ac_enable(void) { - if (lockdown) return true; + if (enabled) return true; #ifdef _WIN32 - // and now for some frivolously microoptimised spinlocking nonsense - volatile u32 sig = 0; // paranoid volatile to ensure no loop misopt... + volatile int sig = 0; inhook_start(&sig); - register u32 x; // avoid double-reading the volatile - // pausing in the middle here seems to produce shorter asm with gcc -O2 and - // clang -O3 in godbolt (avoids unrolling of head which is unlikely to help) - while (x = sig, _mm_pause(), !x); - if (x == 2) { // else 1 for success + fastspin_wait(&sig); + if (sig == 2) { // else 1 for success con_warn("** sst: ERROR starting message loop, can't continue! **"); CloseHandle(inhookthr); return false; } #endif - lockdown = true; + enabled = true; return true; } @@ -178,16 +243,16 @@ HANDLE_EVENT(Tick, bool simulating) { #ifdef _WIN32 static uint fewticks = 0; // just check this every so often (roughly 0.1-0.3s depending on game) - if (lockdown && !(++fewticks & 7)) inhook_check(); + if (enabled && !(++fewticks & 7)) inhook_check(); #endif } void ac_disable(void) { - if (!lockdown) return; + if (!enabled) return; #ifdef _WIN32 inhook_stop(); #endif - lockdown = false; + enabled = false; } enum /* from InputEventType_t - terser names used here */ { @@ -216,15 +281,22 @@ typedef void (*VCALLCONV DispatchInputEvent_func)(void *, struct inputevent *); static DispatchInputEvent_func orig_DispatchInputEvent; static void VCALLCONV hook_DispatchInputEvent(void *this, struct inputevent *ev) { - // TODO(rta): do something here! (here's a quick reference/example) - //switch (ev->type) { - // CASES(BTNDOWN, BTNUP, BTNDOUBLECLICK): - // const char *desc[] = {"DOWN", "UP", "DOUBLE"}; - // const char *binding = bind_get(ev->data); - // if (!binding) binding = "[unbound]"; - // con_msg("key %d %s => %s\n", ev->data, desc[ev->type - BTNDOWN], - // binding); - //} + //const char *desc[] = {"DOWN", "UP", "DBL"}; + //const char desclen[] = {4, 2, 3}; + switch (ev->type) { + CASES(BTNDOWN, BTNUP, BTNDOUBLECLICK):; + // TODO(rta): do something interesting with button data + //uchar buf[28], *p = buf; + //msg_putasz4(p, 2); p += 1; + // msg_putssz5(p, 8); memcpy(p + 1, "KeyInput", 8); p += 9; + // msg_putmsz4(p, 2); p += 1; + // msg_putssz5(p, 3); memcpy(p + 1, "key", 3); p += 4; + // p += msg_puts32(p, ev->data); + // msg_putssz5(p, 3); memcpy(p + 1, "btn", 3); p += 4; + // int idx = ev->type - BTNDOWN; + // msg_putssz5(p++, desclen[idx]); + // memcpy(p, desc[idx], desclen[idx]); p += desclen[idx]; + } orig_DispatchInputEvent(this, ev); } @@ -242,7 +314,7 @@ static bool find_DispatchInputEvent(void) { return false; } void *cgame; - const uchar *insns = (const uchar*)VFUNC(gameuifuncs, GetDesktopResolution); + const uchar *insns = (const uchar *)VFUNC(gameuifuncs, GetDesktopResolution); for (const uchar *p = insns; p - insns < 16;) { if (p[0] == X86_MOVRMW && p[1] == X86_MODRM(0, 1, 5)) { void **indirect = mem_loadptr(p + 2); @@ -273,12 +345,27 @@ ok: insns = (const uchar *)VFUNC(cgame, DispatchAllStoredGameMessages); return false; } +HANDLE_EVENT(AllowPluginLoading, bool loading) { + if (enabled && demorec_demonum() != -1) { + con_warn("sst: plugins cannot be %s while recording a run\n", + loading ? "loaded" : "unloaded"); + return false; + } + return true; +} + +HANDLE_EVENT(PluginLoaded, void) { + // TODO(rta): do something with plugin list here +} +HANDLE_EVENT(PluginUnloaded, void) { + // TODO(rta): do something with plugin list here +} + +PREINIT { + return GAMETYPE_MATCHES(L4D); // TODO(compat): add more here obviously +} + INIT { -#if defined(_WIN32) - if (!win32_init()) return false; -#elif defined(__linux__) - // TODO(linux): call init things -#endif if (!find_DispatchInputEvent()) return false; orig_DispatchInputEvent = (DispatchInputEvent_func)hook_inline( (void *)orig_DispatchInputEvent, (void *)&hook_DispatchInputEvent); @@ -286,16 +373,70 @@ INIT { errmsg_errorsys("couldn't hook DispatchInputEvent function"); return false; } + +#ifdef _WIN32 + keybox = VirtualAlloc(0, 4096, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (!keybox) { + errmsg_errorsys("couldn't allocate memory for session state"); + return false; + } + if (!VirtualLock(keybox, 4096)) { + errmsg_errorsys("couldn't secure session state"); + goto e2; + } + if (WerRegisterExcludedMemoryBlock(keybox, 4096) != S_OK) { + // FIXME: stringify errors properly here + errmsg_errorx("couldn't secure session state"); + goto e2; + } + if (!win32_init()) goto e; +#else + keybox = mmap(0, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); + if (keybox == MAP_FAILED) { + errmsg_errorstd("couldn't allocate memory for session state"); + return false; + } + // linux-specific madvise stuff (there are some equivalents in OpenBSD and + // FreeBSD, if anyone's wondering, but we don't need to worry about those) + if (madvise(keybox, 4096, MADV_DONTFORK) == -1 || + madvise(keybox, 4096, MADV_DONTDUMP) == - 1 || + mlock(keybox, 4096) == -1) { + errormsg_errorstd("couldn't secure session state"); + goto e; + } + // TODO(linux): call other init things +#endif + + uchar seed[32]; + os_randombytes(seed, sizeof(seed)); + crypto_rng_init(&keybox->rng, seed); + if (GAMETYPE_MATCHES(L4D)) { + // copy into the keybox so key derivation blake2 gets a nice contiguous + // run of bytes + memcpy(keybox->lbpub, lbpubkeys[LBPK_L4D], 32); + } return true; + +#ifdef _WIN32 +e: WerUnregisterExcludedMemoryBlock(keybox); // this'd better not fail! +e2: VirtualFree(keybox, 4096, MEM_RELEASE); +#elif +e: munmap(keybox, 4096); +#endif + unhook_inline((void *)orig_DispatchInputEvent); + return false; } END { + ac_disable(); #if defined(_WIN32) + WerUnregisterExcludedMemoryBlock(keybox); // this'd better not fail! + VirtualFree(keybox, 4096, MEM_RELEASE); win32_end(); #elif defined(__linux__) - // TODO(linux): call cleanup things + munmap(keybox, 4096); + // TODO(linux): call other cleanup things #endif - ac_disable(); unhook_inline((void *)orig_DispatchInputEvent); } diff --git a/src/bitbuf.h b/src/bitbuf.h index a2ee60f..404dc9d 100644 --- a/src/bitbuf.h +++ b/src/bitbuf.h @@ -19,47 +19,50 @@ #include "intdefs.h" -// NOTE: This code assumes it's running on a little endian machine, because, -// well, the game runs on a little endian machine. -// *technically* this could break unit tests in a contrived cross-compile -// scenario? right now none of the tests care about actual bit values, and we -// don't cross compile, so this won't matter till later. :) +// NOTE: This code is not big-endian-safe, because the game itself is little- +// endian. This could theoretically break tests in odd cross-compile scenarios, +// but no tests currently look at actual bit values so it's fine for now. -// handle 8 bytes at a time (COULD do 16 with SSE, but who cares this is fine) -typedef uvlong bitbuf_cell; +// handle one machine word at a time (SIMD is probably not worth it... yet?) +typedef usize bitbuf_cell; static const int bitbuf_cell_bits = sizeof(bitbuf_cell) * 8; static const int bitbuf_align = _Alignof(bitbuf_cell); /* A bit buffer, ABI-compatible with bf_write defined in tier1/bitbuf.h */ struct bitbuf { union { - char *buf; /* NOTE: the buffer MUST be aligned as bitbuf_cell! */ - bitbuf_cell *buf_as_cells; + char *buf; /* NOTE: the buffer SHOULD be aligned as bitbuf_cell! */ + bitbuf_cell *cells; }; - int sz, nbits, curbit; + int sz, nbits; + uint curbit; // made unsigned so divisions can become shifts (hopefully...) bool overflow, assert_on_overflow; const char *debugname; }; -/* Append a value to the bitbuffer, with a specfied length in bits. */ -static inline void bitbuf_appendbits(struct bitbuf *bb, bitbuf_cell x, - int nbits) { +// detail: need a cell internally, but API users shouldn't rely on 64-bit size +static inline void _bitbuf_append(struct bitbuf *bb, bitbuf_cell x, int nbits) { int idx = bb->curbit / bitbuf_cell_bits; int shift = bb->curbit % bitbuf_cell_bits; // OR into the existing cell (lower bits were already set!) - bb->buf_as_cells[idx] |= x << shift; + bb->cells[idx] |= x << shift; // assign the next cell (that also clears the upper bits for the next OR) // if nbits fits in the first cell, this zeros the next cell, which is fine - bb->buf_as_cells[idx + 1] = x >> (bitbuf_cell_bits - shift); + bb->cells[idx + 1] = x >> (bitbuf_cell_bits - shift); bb->curbit += nbits; } -/* Append a byte to the bitbuffer - same as appendbits(8) but more convenient */ +/* Appends a value to the bit buffer, with a specfied length in bits. */ +static inline void bitbuf_appendbits(struct bitbuf *bb, uint x, int nbits) { + _bitbuf_append(bb, x, nbits); +} + +/* Appends a byte to the bit buffer. */ static inline void bitbuf_appendbyte(struct bitbuf *bb, uchar x) { - bitbuf_appendbits(bb, x, 8); + _bitbuf_append(bb, x, 8); } -/* Append a sequence of bytes to the bitbuffer, with length given in bytes */ +/* Appends a sequence of bytes to the bit buffer, with length given in bytes. */ static inline void bitbuf_appendbuf(struct bitbuf *bb, const char *buf, uint len) { // NOTE! This function takes advantage of the fact that nothing unaligned @@ -67,25 +70,30 @@ static inline void bitbuf_appendbuf(struct bitbuf *bb, const char *buf, // segfault. This is absolutely definitely technically UB, but it's unit // tested and apparently works in practice. If something weird happens // further down the line, sorry! - usize unalign = (usize)buf % bitbuf_align; + usize unalign = (usize)buf & (bitbuf_align - 1); if (unalign) { // round down the pointer - bitbuf_cell *p = (bitbuf_cell *)((usize)buf & ~(bitbuf_align - 1)); + bitbuf_cell *p = (bitbuf_cell *)((usize)buf - unalign); // shift the stored value (if it were big endian, the shift would have // to be the other way, or something) - bitbuf_appendbits(bb, *p >> (unalign * 8), (bitbuf_align - unalign) * 8); + _bitbuf_append(bb, *p >> (unalign << 3), (bitbuf_align - unalign) << 3); buf += sizeof(bitbuf_cell) - unalign; len -= unalign; } bitbuf_cell *aligned = (bitbuf_cell *)buf; - for (; len > sizeof(bitbuf_cell); len -= sizeof(bitbuf_cell), ++aligned) { - bitbuf_appendbits(bb, *aligned, bitbuf_cell_bits); + for (; len >= sizeof(bitbuf_cell); len -= sizeof(bitbuf_cell), ++aligned) { + _bitbuf_append(bb, *aligned, bitbuf_cell_bits); } // unaligned end bytes - bitbuf_appendbits(bb, *aligned, len * 8); + _bitbuf_append(bb, *aligned, len << 3); +} + +/* 0-pad the bit buffer up to the next whole byte boundary. */ +static inline void bitbuf_roundup(struct bitbuf *bb) { + bb->curbit += -(uint)bb->curbit & 7; } -/* Clear the bitbuffer to make it ready to append new data */ +/* Clear the bit buffer to make it ready to append new data. */ static inline void bitbuf_reset(struct bitbuf *bb) { bb->buf[0] = 0; // we have to zero out the lowest cell since it gets ORed bb->curbit = 0; diff --git a/src/build/cmeta.c b/src/build/cmeta.c index 7f314c7..40aba3a 100644 --- a/src/build/cmeta.c +++ b/src/build/cmeta.c @@ -1,5 +1,5 @@ /* - * Copyright © 2022 Michael Smith <mikesmiffy128@gmail.com> + * Copyright © 2023 Michael Smith <mikesmiffy128@gmail.com> * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -111,15 +111,7 @@ static void die2(const char *s1, const char *s2) { static char *readsource(const os_char *f) { int fd = os_open(f, O_RDONLY); -#ifndef _WIN32 - if (fd == -1) die2("couldn't open ", f); -#else - // XXX: this is dumb and bad - if (fd == -1) { - fprintf(stderr, "cmeta: fatal: couldn't open %S", f); - exit(100); - } -#endif + if (fd == -1) return 0; uint bufsz = 8192; char *buf = malloc(bufsz); if (!buf) die1("couldn't allocate memory"); @@ -146,6 +138,7 @@ struct cmeta; const struct cmeta *cmeta_loadfile(const os_char *f) { char *buf = readsource(f); + if (!buf) return 0; #ifdef _WIN32 char *realname = malloc(wcslen(f) + 1); if (!realname) die1("couldn't allocate memory"); @@ -163,7 +156,7 @@ const struct cmeta *cmeta_loadfile(const os_char *f) { // NOTE: we don't care about conditional includes, nor do we expand macros. We // just parse the minimum info to get what we need for SST. Also, there's not // too much in the way of syntax checking; if an error gets ignored the compiler -// picks it anyway, and gives far better diagnostics. +// picks it up anyway, and gives far better diagnostics. void cmeta_includes(const struct cmeta *cm, void (*cb)(const char *f, bool issys, void *ctxt), void *ctxt) { const Token *tp = (const Token *)cm; diff --git a/src/build/codegen.c b/src/build/codegen.c index e24a096..bb25395 100644 --- a/src/build/codegen.c +++ b/src/build/codegen.c @@ -24,8 +24,14 @@ #include "skiplist.h" #include "vec.h" +#ifdef _WIN32 +#define fS "S" +#else +#define fS "s" +#endif + static void die(const char *s) { - fprintf(stderr, "codegen: %s\n", s); + fprintf(stderr, "codegen: fatal: %s\n", s); exit(100); } @@ -275,6 +281,11 @@ F( " has_%s = status_%s == FEAT_OK;", f->modname, f->modname) int OS_MAIN(int argc, os_char *argv[]) { for (++argv; *argv; ++argv) { const struct cmeta *cm = cmeta_loadfile(*argv); + if (!cm) { + fprintf(stderr, "codegen: fatal: couldn't load file %" fS "\n", + *argv); + exit(100); + } cmeta_conmacros(cm, &oncondef); cmeta_evdefmacros(cm, &onevdef); if (!vec_push(&pass2, ((struct passinfo){cm, *argv}))) { diff --git a/src/chunklets/README b/src/chunklets/README new file mode 100644 index 0000000..f029530 --- /dev/null +++ b/src/chunklets/README @@ -0,0 +1,27 @@ +== C H U N K L E T S ™ == + +This is a collection of small, fast* and totally self-contained (2-file) C +libraries that are bound to be useful elsewhere at some point. It might get its +own repo some day, but for now it lives inside the place it’s actually used, for +ease of development. Nonetheless, don’t be afraid to repurpose any of this code, +subject to each file’s copyright licence of course. + +Each .{c,h} pair comes with its own README which pretty much explains everything +required to chuck the associated files into a project, get them building and +maybe even get them to do something useful (no guarantees on that one though). + +* well, hopefully fast. + +- Why is it called Chunklets? - + +> “Chunklets” is a unique and memorable name for your set of {.c, .h} pairs. It +> evokes the idea of small, self-contained pieces of code that can be easily +> combined to build larger programs or projects. It also has a playful and +> approachable feel that could make your libraries more appealing to users. +> Overall, it’s a great choice for a name! + +Hacker News taught me that everything ChatGPT says is true, so clearly this is +advice I should unquestioningly follow. + +Thanks, and have fun! +- Michael Smith <mikesmiffy128@gmail.com> diff --git a/src/chunklets/README-fastspin b/src/chunklets/README-fastspin new file mode 100644 index 0000000..8052415 --- /dev/null +++ b/src/chunklets/README-fastspin @@ -0,0 +1,109 @@ +fastspin.{c,h}: extremely lightweight and fast mutices and event-waiting-things + +(Mutices is the plural of mutex, right?) + +== Compiling == + + gcc -c -O2 [-flto] fastspin.c + clang -c -O2 [-flto] fastspin.c + tcc -c fastspin.c + cl.exe /c /O2 /std:c17 /experimental:c11atomics fastspin.c + +In most cases you can just drop the .c file straight into your codebase/build +system. LTO is advised to avoid dead code and enable more efficient calls +including potential inlining. + +NOTE: On Windows, it is necessary to link with ntdll.lib. + +== Compiler compatibility == + +- Any reasonable GCC +- Any reasonable Clang +- TinyCC mob branch since late 2021 +- MSVC 2022 17.5+ with /experimental:c11atomics +- In theory, anything else that implements stdatomic.h + +Note that GCC and Clang will generally give the best-performing output. + +Once the .c file is built, the public header can be consumed by virtually any C +or C++ compiler, as well as probably most half-decent FFIs. + +Note that the .c source file is not C++-compatible, only the header is. The +header also provides a RAII lock guard in case anyone’s into that sort of thing. + +== API usage == + +See documentation comments in fastspin.h for a basic idea. Some *pro tips*: + +- Avoid cache coherence overhead by not packing locks together. Ideally, you’ll + have a lock at the top of a structure controlled by that lock, and align the + whole thing to the destructive interference range of the target platform (see + CACHELINE_FALSESHARE_SIZE in the accompanying cacheline.h). + +- Avoid putting more than one lock in a cache line. Ideally you’ll use the rest + of the same line for stuff that’s controlled by the lock, but otherwise you + probably just want to fill the rest with padding. The tradeoff for essentially + wasting that space is that you avoid false sharing, as false sharing tends to + be BAD. + +- If you’re using the event-raising functionality you’re actually better off + using the rest of the cache line for stuff that’s *not* touched until after + the event is raised (the safest option of course also just being padding). + +- You should actually measure this stuff, I dunno man. + +Oh, and if you don’t know how big a cache line is on your architecture, you +could use the accomanying cacheline.h to get some reasonable guesses. Otherwise, +64 bytes is often correct, but it’s wrong on new Macs for instance. + +== OS compatibility == + +First-class: +- Linux 2.6+ (glibc or musl) +- FreeBSD 11+ +- OpenBSD 6.2+ +- NetBSD ~9.1+ +- DragonFly 1.1+ +- Windows 8+ (only tested on 10+) +- macOS/Darwin since ~2016(?) (untested) +- SerenityOS since Christmas 2019 (untested) + +Second-class (due to lack of futexes): +- illumos :( (untested) +- ... others? + +* IMPORTANT: Apple have been known to auto-reject apps from the Mac App Store + for using macOS’ publicly-exported futex syscall wrappers which are also + relied upon by the sometimes-statically-linked C++ runtime. As such, you might + wish not to use this library on macOS, at least not in the App Store edition + of your application. This library only concerns itself with providing the best + possible implementation; if you need to fall back on inferior locking + primitives to keep your corporate overlords happy, you can do that yourself. + +== Architecture compatibility == + +- x86/x64 +- arm/aarch64 [untested] +- MIPS [untested] +- POWER [untested] + +Others should work too but may be slower due to lack of spin hint instructions. +Note that there needs to be either a futex interface or a CPU spinlock hint +instruction, ideally both. Otherwise performance will be simply no good during +contention. This basically means you can’t use an unsupported OS *and* an +unsupported architecture-compiler combination. + +== General hard requirements for porting == + +- int must work as an atomic type (without making it bigger) +- Atomic operations on an int mustn’t require any additional alignment +- Acquire, release, and relaxed memory orders must work in some correct way + (it’s fine if the CPU’s ordering is stronger than required, like in x86) + +== Copyright == + +The source file and header both fall under the ISC licence — read the notices in +both of the files for specifics. + +Thanks, and have fun! +- Michael Smith <mikesmiffy128@gmail.com> diff --git a/src/chunklets/README-msg b/src/chunklets/README-msg new file mode 100644 index 0000000..53d19f1 --- /dev/null +++ b/src/chunklets/README-msg @@ -0,0 +1,55 @@ +msg.{c,h}: fast low-level msgpack encoding + +== Compiling == + + gcc -c -O2 [-flto] msg.c + clang -c -O2 [-flto] msg.c + tcc -c msg.c + cl.exe /c /O2 msg.c + +In most cases you can just drop the .c file straight into your codebase/build +system. LTO is advised to avoid dead code and enable more efficient calls +including potential inlining. + +== Compiler compatibility == + +- Any reasonable GCC +- Any reasonable Clang +- Any reasonable MSVC +- TinyCC +- Probably almost all others; this is very portable code + +Note that GCC and Clang will generally give the best-performing output. + +Once the .c file is built, the public header can be consumed by virtually any C +or C++ compiler, as well as probably most half-decent FFIs. + +Note that the .c source file is not C++-compatible, only the header is. The +source file relies on union type-punning, which is well-defined in C but +undefined behaviour in C++. + +== API Usage == + +See documentation comments in msg.h for a basic idea. Note that this library is +very low-level and probably best suited use with some sort of metaprogramming/ +code-generation, or bindings to a higher-level langauge. + +== OS Compatibility == + +- All. +- Seriously, this library doesn’t even use libc. + +== Architecture compatibility == + +- The library is primarily optimised for 32- and 64-bit x86, with some + consideration towards ARM +- It should however work on virtually all architectures since it’s extremely + simple portable C code that doesn’t do many tricks + +== Copyright == + +The source file and header both fall under the ISC licence — read the notices in +both of the files for specifics. + +Thanks, and have fun! +- Michael Smith <mikesmiffy128@gmail.com> diff --git a/src/chunklets/cacheline.h b/src/chunklets/cacheline.h new file mode 100644 index 0000000..cadd55d --- /dev/null +++ b/src/chunklets/cacheline.h @@ -0,0 +1,45 @@ +/* This file is dedicated to the public domain. */ + +#ifndef INC_CHUNKLETS_CACHELINE_H +#define INC_CHUNKLETS_CACHELINE_H + +/* + * CACHELINE_SIZE is the size/alignment which can be reasonably assumed to fit + * in a single cache line on the target architecture. Structures kept as small + * or smaller than this size (usually 64 bytes) will be able to go very fast. + */ +#ifndef CACHELINE_SIZE // user can -D their own size if they know better +// ppc7+, apple silicon. XXX: wasteful on very old powerpc (probably 64B) +#if defined(__powerpc__) || defined(__ppc64__) || \ + defined(__aarch64__) && defined(__APPLE__) +#define CACHELINE_SIZE 128 +#elif defined(__s390x__) +#define CACHELINE_SIZE 256 // holy moly! +#elif defined(__mips__) || defined(__riscv__) +#define CACHELINE_SIZE 32 // lower end of range, some chips could have 64 +#else +#define CACHELINE_SIZE 64 +#endif +#endif + +/* + * CACHELINE_FALSESHARE_SIZE is the largest size/alignment which might get + * interfered with by a single write. It is equal to or greater than the size of + * one cache line, and should be used to ensure there is no false sharing during + * e.g. lock contention, or atomic fetch-increments on queue indices. + */ +#ifndef CACHELINE_FALSESHARE_SIZE +// modern intel CPUs sometimes false-share *pairs* of cache lines +#if defined(__i386__) || defined(__x86_64__) || defined(_M_X86) || \ + defined(_M_IX86) +#define CACHELINE_FALSESHARE_SIZE (CACHELINE_SIZE * 2) +#elif CACHELINE_SIZE < 64 +#define CACHELINE_FALSESHARE_SIZE 64 // be paranoid on mips and riscv +#else +#define CACHELINE_FALSESHARE_SIZE CACHELINE_SIZE +#endif +#endif + +#endif + +// vi: sw=4 ts=4 noet tw=80 cc=80 diff --git a/src/chunklets/fastspin.c b/src/chunklets/fastspin.c new file mode 100644 index 0000000..bfaaf9b --- /dev/null +++ b/src/chunklets/fastspin.c @@ -0,0 +1,299 @@ +/* + * Copyright © 2023 Michael Smith <mikesmiffy128@gmail.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifdef __cplusplus +#error This file should not be compiled as C++. It relies on C-specific \ +keywords and APIs which have syntactically different equivalents for C++. +#endif + +#include <stdatomic.h> + +#include "fastspin.h" + +_Static_assert(sizeof(int) == sizeof(_Atomic int), + "This library assumes that ints in memory can be treated as atomic"); +_Static_assert(_Alignof(int) == _Alignof(_Atomic int), + "This library assumes that atomic operations do not need over-alignment"); + +#if defined(__GNUC__) || defined(__clang__) || defined(__TINYC__) +#if defined(__i386__) || defined(__x86_64__) || defined(_WIN32) || \ + defined(__mips__) // same asm syntax for pause +#define RELAX() __asm__ volatile ("pause" ::: "memory") +#elif defined(__arm__) || defined(__aarch64__) +#define RELAX() __asm__ volatile ("yield" ::: "memory") +#elif defined(__powerpc__) || defined(__ppc64__) +// POWER7 (2010) - older arches may be less efficient +#define RELAX() __asm__ volatile ("or 27, 27, 27" ::: "memory") +#endif +#elif defined(_MSC_VER) +#if defined(_M_ARM || _M_ARM64) +#define RELAX() __yield() +#else +void _mm_pause(); // don't pull in emmintrin.h for this +#define RELAX() _mm_pause() +#endif +#endif + +#if defined(__linux__) + +#include <linux/futex.h> +#include <sys/syscall.h> + +// some arches only have a _time64 variant. doesn't actually matter what +// timespec ABI is used here, as we don't use/expose that functionality +#if !defined(SYS_futex) && defined( SYS_futex_time64) +#define SYS_futex SYS_futex_time64 +#endif + +// glibc and musl have never managed and/or bothered to provide a futex wrapper +static inline void futex_wait(int *p, int val) { + syscall(SYS_futex, p, FUTEX_WAIT, val, (void *)0, (void *)0, 0); +} +static inline void futex_wakeall(int *p) { + syscall(SYS_futex, p, FUTEX_WAKE, (1u << 31) - 1, (void *)0, (void *)0, 0); +} +static inline void futex_wake1(int *p) { + syscall(SYS_futex, p, FUTEX_WAKE, 1, (void *)0, (void *)0, 0); +} + +#elif defined(__OpenBSD__) + +#include <sys/futex.h> + +// OpenBSD just emulates the Linux call but it still provides a wrapper! Yay! +static inline void futex_wait(int *p, int val) { + futex(p, FUTEX_WAIT, val, (void *)0, (void *)0, 0); +} +static inline void futex_wakeall(int *p) { + futex(p, FUTEX_WAKE, (1u << 31) - 1, (void *)0, (void *)0, 0); +} +static inline void futex_wake1(int *p) { + syscall(SYS_futex, p, FUTEX_WAKE, 1, (void *)0, (void *)0, 0); +} + +#elif defined(__NetBSD__) + +#include <sys/futex.h> // for constants +#include <sys/syscall.h> +#include <unistd.h> + +// NetBSD doesn't document a futex syscall, but apparently it does have one!? +// Their own pthreads library still doesn't actually use it, go figure. Also, it +// takes an extra parameter for some reason. +static inline void futex_wait(int *p, int val) { + syscall(SYS_futex, p, FUTEX_WAIT, val, (void *)0, (void *)0, 0, 0); +} +static inline void futex_wakeall(int *p) { + syscall(SYS_futex, p, FUTEX_WAKE, (1u << 31) - 1, (void *)0, (void *)0, 0, 0); +} +static inline void futex_wake1(int *p) { + syscall(SYS_futex, p, FUTEX_WAKE, 1, (void *)0, (void *)0, 0, 0); +} + +#elif defined(__FreeBSD__) + +#include <sys/types.h> // ugh still no IWYU everywhere. maybe next year +#include <sys/umtx.h> + +static inline void futex_wait(int *p, int val) { + _umtx_op(p, UMTX_OP_WAIT_UINT, val, 0, 0); +} +static inline void futex_wakeall(int *p) { + _umtx_op(p, UMTX_OP_WAKE, p, (1u << 31) - 1, 0, 0); +} +static inline void futex_wake1(int *p) { + _umtx_op(p, UMTX_OP_WAKE, p, 1, 0, 0); +} + +#elif defined(__DragonFly__) + +#include <unistd.h> + +// An actually good interface. Thank you Matt, very cool. +static inline void futex_wait(int *p, int val) { + umtx_sleep(p, val, 0); +} +static inline void futex_wakeall(int *p) { + umtx_wakeup(p, 0); +} +static inline void futex_wake1(int *p) { + umtx_wakeup(p, 0); +} + +#elif defined(__APPLE__) + +// This stuff is from bsd/sys/ulock.h in XNU. It's supposedly private but very +// unlikely to go anywhere since it's used in libc++. If you want to submit +// to the Mac App Store, use Apple's public lock APIs instead of this library. +extern int __ulock_wait(unsigned int op, void *addr, unsigned long long val, + unsigned int timeout); +extern int __ulock_wake(unsigned int op, void *addr, unsigned long long val); + +#define UL_COMPARE_AND_WAIT 1 +#define ULF_WAKE_ALL 0x100 +#define ULF_NO_ERRNO 0x1000000 + +static inline void futex_wait(int *p, int val) { + __ulock_wait(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, p, val, 0); +} +static inline void futex_wakeall(int *p) { + __ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO | ULF_WAKE_ALL, uaddr, 0); +} +static inline void futex_wake1(int *p) { + __ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, uaddr, 0); +} + +#elif defined(_WIN32) + +#ifdef _WIN64 +typedef unsigned long long usize; +#else +typedef unsigned long usize; +#endif + +// There's no header for these because NTAPI. Plus Windows.h sucks anyway. +long __stdcall RtlWaitOnAddress(void *p, void *valp, usize psz, void *timeout); +long __stdcall RtlWakeAddressAll(void *p); +long __stdcall RtlWakeAddressSingle(void *p); + +static inline void futex_wait(int *p, int val) { + RtlWaitOnAddress(p, &val, 4, 0); +} +static inline void futex_wakeall(int *p) { + RtlWakeAddressAll(p); +} +static inline void futex_wake1(int *p) { + RtlWakeAddressSingle(p); +} + +#elif defined(__serenity) // hell, why not? + +#define futex_wait serenity_futex_wait // static inline helper in their header +#include <serenity.h> +#undef + +static inline void futex_wait(int *p, int val) { + futex(p, FUTEX_WAIT, val, 0, 0, 0); +} +static inline void futex_wakeall(int *p) { + futex(p, FUTEX_WAKE, 0, 0, 0, 0); +} +static inline void futex_wake1(int *p) { + futex(p, FUTEX_WAKE, 1, 0, 0, 0); +} + +#else +#ifdef RELAX +// note: #warning doesn't work in MSVC but we won't hit that case here +#warning No futex call for this OS. Falling back on pure spinlock. \ +Performance will suffer during contention. +#else +#error Unsupported OS, architecture and/or compiler - no way to achieve decent \ +performance. Need either CPU spinlock hints or futexes, ideally both. +#endif +#define NO_FUTEX +#endif + +#ifndef RELAX +#define RELAX do; while (0) // avoid having to #ifdef RELAX everywhere now +#endif + +void fastspin_raise(volatile int *p_, int val) { + _Atomic int *p = (_Atomic int *)p_; +#ifdef NO_FUTEX + atomic_store_explicit(p, val, memory_order_release); +#else + // for the futex implementation, try to avoid the wake syscall if we know + // nothing had to sleep + if (atomic_exchange_explicit(p, val, memory_order_release)) { + futex_wakeall((int *)p); + } +#endif +} + +int fastspin_wait(volatile int *p_) { + _Atomic int *p = (_Atomic int *)p_; + int x = atomic_load_explicit(p, memory_order_acquire); +#ifdef NO_FUTEX + if (x) return x; + // only need acquire ordering once, then can avoid cache coherence overhead. + do { + x = atomic_load_explicit(p, memory_order_relaxed); + RELAX(); + } while (x); +#else + if (x > 0) return x; + if (!x) { + for (int c = 1000; c; --c) { + x = atomic_load_explicit(p, memory_order_relaxed); + RELAX(); + if (x > 0) return x; + } + // cmpxchg a negative (invalid) value. this will fail in two cases: + // 1. someone else already cmpxchg'd: the futex_wait() will work fine + // 2. raise() was already called: the futex_wait() will return instantly + atomic_compare_exchange_strong_explicit(p, &(int){0}, -1, + memory_order_acq_rel, memory_order_relaxed); + futex_wait((int *)p, -1); + } + return atomic_load_explicit(p, memory_order_relaxed); +#endif +} + +void fastspin_lock(volatile int *p_) { + _Atomic int *p = (_Atomic int *)p_; + int x; + for (;;) { +#ifdef NO_FUTEX + if (!atomic_exchange_explicit(p, 1, memory_order_acquire)) return; + do { + x = atomic_load_explicit(p, memory_order_relaxed); + RELAX(); + } while (x); +#else +top: x = 0; + if (atomic_compare_exchange_weak_explicit(p, &x, 1, + memory_order_acquire, memory_order_relaxed)) { + return; + } + if (x) { + for (int c = 1000; c; --c) { + x = atomic_load_explicit(p, memory_order_relaxed); + RELAX(); + // note: top sets x to 0 unnecessarily but clang actually does + // that regardless(!), probably to break loop-carried dependency + if (!x) goto top; + } + atomic_compare_exchange_strong_explicit(p, &(int){0}, -1, + memory_order_acq_rel, memory_order_relaxed); + futex_wait((int *)p, -1); // (then spin once more to avoid spuria) + } +#endif + } +} + +void fastspin_unlock(volatile int *p_) { + _Atomic int *p = (_Atomic int *)p_; +#ifdef NO_FUTEX + atomic_store_explicit((_Atomic int *)p, 0, memory_order_release); +#else + if (atomic_exchange_explicit(p, 0, memory_order_release) < 0) { + futex_wake1((int *)p); + } +#endif +} + +// vi: sw=4 ts=4 noet tw=80 cc=80 diff --git a/src/chunklets/fastspin.h b/src/chunklets/fastspin.h new file mode 100644 index 0000000..6c0c5f7 --- /dev/null +++ b/src/chunklets/fastspin.h @@ -0,0 +1,65 @@ +/* + * Copyright © 2023 Michael Smith <mikesmiffy128@gmail.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef INC_CHUNKLETS_FASTSPIN_H +#define INC_CHUNKLETS_FASTSPIN_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Raises an event through p to 0 or more callers of fastspin_wait(). + * val must be positive, and can be used to signal a specific condition. + */ +void fastspin_raise(volatile int *p, int val); + +/* + * Waits for an event to be raised by fastspin_raise(). Allows this and possibly + * some other threads to wait for one other thread to signal its status. + * + * Returns the positive value that was passed to fastspin_raise(). + */ +int fastspin_wait(volatile int *p); + +/* + * Takes a mutual exclusion, i.e. a lock. *p must be initialised to 0 before + * anything starts using it as a lock. + */ +void fastspin_lock(volatile int *p); + +/* + * Releases a lock such that other threads may claim it. Immediately as a lock + * is released, its value will be 0, as though it had just been initialised. + */ +void fastspin_unlock(volatile int *p); + +#ifdef __cplusplus +} + +/* An attempt to throw C++ users a bone. Should be self-explanatory. */ +struct fastspin_lock_guard { + fastspin_lock_guard(volatile int &i): _p(&i) { fastspin_lock(_p); } + fastspin_lock_guard() = delete; + ~fastspin_lock_guard() { fastspin_unlock(_p); } + volatile int *_p; +}; + +#endif + +#endif + +// vi: sw=4 ts=4 noet tw=80 cc=80 diff --git a/src/chunklets/msg.c b/src/chunklets/msg.c new file mode 100644 index 0000000..0e26a80 --- /dev/null +++ b/src/chunklets/msg.c @@ -0,0 +1,275 @@ +/* + * Copyright © 2023 Michael Smith <mikesmiffy128@gmail.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifdef __cplusplus +#error This file should not be compiled as C++. It relies on C-specific union \ +behaviour which is undefined in C++. +#endif + +// _Static_assert needs MSVC >= 2019, and this check is irrelevant on Windows +#ifndef _MSC_VER +_Static_assert( + (unsigned char)-1 == 255 && + sizeof(short) == 2 && + sizeof(int) == 4 && + sizeof(long long) == 8 && + sizeof(float) == 4 && + sizeof(double) == 8, + "this code is only designed for relatively sane environments, plus Windows" +); +#endif + +// -- A note on performance hackery -- +// +// Clang won't emit byte-swapping instructions in place of bytewise array writes +// unless nothing else is written to the same array. MSVC won't do it at all. +// For these compilers on little-endian platforms that can also do unaligned +// writes efficiently, we do so explicitly and handle the byte-swapping +// manually, which then tends to get optimised pretty well. +// +// GCC, somewhat surprisingly, seems to be much better at optimising the naïve +// version of the code, so we don't try to do anything clever there. Also, for +// unknown, untested compilers and/or platforms, we stick to the safe approach. +#if defined(_MSC_VER) || defined(__clang__) && (defined(__x86_64__) || \ + defined(__i386__) || defined(__aarch64__) || defined(__arm__)) +#define USE_BSWAP_NONSENSE +#endif + +#ifdef USE_BSWAP_NONSENSE +#if defined(_MSC_VER) && !defined(__clang__) +// MSVC prior to 2022 won't even optimise shift/mask swaps into a bswap +// instruction. Screw it, just use the intrinsics. +unsigned long _byteswap_ulong(unsigned long); +unsigned long long _byteswap_uint64(unsigned long long); +#define swap32 _byteswap_ulong +#define swap64 _byteswap_uint64 +#else +static inline unsigned int swap32(unsigned int x) { + return x >> 24 | x << 24 | x >> 8 & 0xFF00 | x << 8 & 0xFF0000; +} +static inline unsigned long long swap64(unsigned long long x) { + return x >> 56 | x << 56 | + x >> 40 & 0xFF00 | x << 40 & 0xFF000000000000 | + x >> 24 & 0xFF0000 | x << 24 & 0xFF0000000000 | + x >> 8 & 0xFF000000 | x << 8 & 0xFF00000000; +} +#endif +#endif + +static inline void doput16(unsigned char *out, unsigned short val) { +#ifdef USE_BSWAP_NONSENSE + // Use swap32() here because x86 and ARM don't have instructions for 16-bit + // swaps, and Clang doesn't realise it could just use the 32-bit one anyway. + *(unsigned short *)(out + 1) = swap32(val) >> 16; +#else + out[1] = val >> 8; out[2] = val; +#endif +} + +static inline void doput32(unsigned char *out, unsigned int val) { +#ifdef USE_BSWAP_NONSENSE + *(unsigned int *)(out + 1) = swap32(val); +#else + out[1] = val >> 24; out[2] = val >> 16; out[3] = val >> 8; out[4] = val; +#endif +} + +static inline void doput64(unsigned char *out, unsigned int val) { +#ifdef USE_BSWAP_NONSENSE + // Clang is smart enough to make this into two bswaps and a word swap in + // 32-bit builds. MSVC seems to be fine too when using the above intrinsics. + *(unsigned long long *)(out + 1) = swap64(val); +#else + out[1] = val >> 56; out[2] = val >> 48; + out[3] = val >> 40; out[4] = val >> 32; + out[5] = val >> 24; out[6] = val >> 16; + out[7] = val >> 8; out[8] = val; +#endif +} + +void msg_putnil(unsigned char *out) { + *out = 0xC0; +} + +void msg_putbool(unsigned char *out, _Bool val) { + *out = 0xC2 | val; +} + +void msg_puti7(unsigned char *out, signed char val) { + *out = val; // oh, so a fixnum is just the literal byte! genius! +} + +int msg_puts8(unsigned char *out, signed char val) { + int off = val < -32; // out of -ve fixnum range? + out[0] = 0xD0; + out[off] = val; + return off + 1; +} + +int msg_putu8(unsigned char *out, unsigned char val) { + int off = val > 127; // out of +ve fixnum range? + out[0] = 0xCC; + out[off] = val; + return off + 1; +} + +int msg_puts16(unsigned char *out, short val) { + if (val >= -128 && val <= 127) return msg_puts8(out, val); + out[0] = 0xD1; + doput16(out, val); + return 3; +} + +int msg_putu16(unsigned char *out, unsigned short val) { + if (val <= 255) return msg_putu8(out, val); + out[0] = 0xCD; + doput16(out, val); + return 3; +} + +int msg_puts32(unsigned char *out, int val) { + if (val >= -32768 && val <= 32767) return msg_puts16(out, val); + out[0] = 0xD2; + doput32(out, val); + return 5; +} + +int msg_putu32(unsigned char *out, unsigned int val) { + if (val <= 65535) return msg_putu16(out, val); + out[0] = 0xCE; + doput32(out, val); + return 5; +} + +int msg_puts(unsigned char *out, long long val) { + if (val >= -2147483648 && val <= 2147483647) { + return msg_puts32(out, val); + } + out[0] = 0xD3; + doput64(out, val); + return 9; +} + +int msg_putu(unsigned char *out, unsigned long long val) { + if (val <= 4294967295) return msg_putu32(out, val); + out[0] = 0xCF; + doput64(out, val); + return 9; +} + +static inline unsigned int floatbits(float f) { + return (union { float f; unsigned int i; }){f}.i; +} + +static inline unsigned long long doublebits(double d) { + return (union { double d; unsigned long long i; }){d}.i; +} + +void msg_putf(unsigned char *out, float val) { + out[0] = 0xCA; + doput32(out, floatbits(val)); +} + +int msg_putd(unsigned char *out, double val) { + // XXX: is this really the most efficient way to check this? + float f = val; + if ((double)f == val) { msg_putf(out, f); return 5; } + out[0] = 0xCA; + doput64(out, doublebits(val)); + return 9; +} + +void msg_putssz5(unsigned char *out, int sz) { + *out = 0xA0 | sz; +} + +int msg_putssz8(unsigned char *out, int sz) { + if (sz < 64) { msg_putssz5(out, sz); return 1; } + out[0] = 0xD9; + out[1] = sz; + return 2; +} + +int msg_putssz16(unsigned char *out, int sz) { + if (sz < 256) return msg_putssz8(out, sz); + out[0] = 0xDA; + doput16(out, sz); + return 3; +} + +int msg_putssz(unsigned char *out, unsigned int sz) { + if (sz < 65536) return msg_putssz16(out, sz); + out[0] = 0xDB; + doput32(out, sz); + return 5; +} + +void msg_putbsz8(unsigned char *out, int sz) { + out[0] = 0xC4; + out[1] = sz; +} + +int msg_putbsz16(unsigned char *out, int sz) { + if (sz < 256) { msg_putbsz8(out, sz); return 2; } + out[0] = 0xC5; + doput16(out, sz); + return 2 + sz; +} + +int msg_putbsz(unsigned char *out, unsigned int sz) { + if (sz < 65536) return msg_putbsz16(out, sz); + out[0] = 0xC6; + doput32(out, sz); + return 5; +} + +void msg_putasz4(unsigned char *out, int sz) { + *out = 0x90 | sz; +} + +int msg_putasz16(unsigned char *out, int sz) { + if (sz < 32) { msg_putasz4(out, sz); return 1; } + out[0] = 0xDC; + doput16(out, sz); + return 3; +} + +int msg_putasz(unsigned char *out, unsigned int sz) { + if (sz < 65536) return msg_putasz16(out, sz); + out[0] = 0xDD; + doput32(out, sz); + return 5; +} + +void msg_putmsz4(unsigned char *out, int sz) { + *out = 0x80 | sz; +} + +int msg_putmsz16(unsigned char *out, int sz) { + if (sz < 32) { msg_putmsz4(out, sz); return 1; } + out[0] = 0xDE; + doput16(out, sz); + return 3; +} + +int msg_putmsz(unsigned char *out, unsigned int sz) { + if (sz < 65536) return msg_putmsz16(out, sz); + out[0] = 0xDF; + doput32(out, sz); + return 5; +} + +// vi: sw=4 ts=4 noet tw=80 cc=80 diff --git a/src/chunklets/msg.h b/src/chunklets/msg.h new file mode 100644 index 0000000..b85bde3 --- /dev/null +++ b/src/chunklets/msg.h @@ -0,0 +1,350 @@ +/* + * Copyright © 2023 Michael Smith <mikesmiffy128@gmail.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef INC_CHUNKLETS_MSG_H +#define INC_CHUNKLETS_MSG_H + +#ifdef __cplusplus +#define _msg_Bool bool +extern "C" { +#else +#define _msg_Bool _Bool +#endif + +/* + * Writes a nil (null) message to the buffer out. Always writes a single byte. + * + * out must point to at least 1 byte. + */ +void msg_putnil(unsigned char *out); + +/* + * Writes the boolean val to the buffer out. Always writes a single byte. + * + * out must point to at least 1 byte. + */ +void msg_putbool(unsigned char *out, _msg_Bool val); + +/* + * Writes the integer val in the range [-32, 127] to the buffer out. Values + * outside this range will produce an undefined encoding. Always writes a single + * byte. + * + * out must point to at least 1 byte. + * + * It is recommended to use msg_puts() for arbitrary signed values or msg_putu() + * for arbitrary unsigned values. Those functions will produce the smallest + * possible encoding for any value. + */ +void msg_puti7(unsigned char *out, signed char val); + +/* + * Writes the signed int val in the range [-128, 127] to the buffer out. + * + * out must point to at least 2 bytes. + * + * Returns the number of bytes written, one of {1, 2}. + * + * It is recommended to use msg_puts() for arbitrary signed values. That + * function will produce the smallest possible encoding for any value. + */ +int msg_puts8(unsigned char *out, signed char val); + +/* + * Writes the unsigned int val in the range [0, 255] to the buffer out. + * + * out must point to at least 2 bytes. + * + * Returns the number of bytes written, one of {1, 2}. + * + * It is recommended to use msg_putu() for arbitrary unsigned values. That + * function will produce the smallest possible encoding for any value. + */ +int msg_putu8(unsigned char *out, unsigned char val); + +/* + * Writes the signed int val in the range [-65536, 65535] to the buffer out. + * + * out must point to at least 3 bytes. + * + * Returns the number of bytes written, one of {1, 2, 3}. + * + * It is recommended to use msg_puts() for arbitrary signed values. That + * function will produce the smallest possible encoding for any value. + */ +int msg_puts16(unsigned char *out, short val); + +/* + * Writes the unsigned int val in the range [0, 65536] to the buffer out. + * + * out must point to at least 3 bytes. + * + * Returns the number of bytes written, one of {1, 2, 3}. + * + * It is recommended to use msg_putu() for arbitrary unsigned values. That + * function will produce the smallest possible encoding for any value. + */ +int msg_putu16(unsigned char *out, unsigned short val); + +/* + * Writes the signed int val in the range [-2147483648, 2147483647] to the + * buffer out. + * + * out must point to at least 5 bytes. + * + * Returns the number of bytes written, one of {1, 2, 3, 5}. + * + * It is recommended to use msg_puts() for arbitrary signed values. That + * function will produce the smallest possible encoding for any value. + */ +int msg_puts32(unsigned char *out, int val); + +/* + * Writes the unsigned int val in the range [0, 4294967295] to the buffer out. + * + * out must point to at least 5 bytes. + * + * Returns the number of bytes written, one of {1, 2, 3, 5}. + * + * It is recommended to use msg_putu() for arbitrary unsigned values. That + * function will produce the smallest possible encoding for any value. + */ +int msg_putu32(unsigned char *out, unsigned int val); + +/* + * Writes the signed int val in the range [-9223372036854775808, + * 9223372036854775807] to the buffer out. + * + * out must point to at least 9 bytes. + * + * Returns the number of bytes written, one of {1, 2, 3, 5, 9}. + */ +int msg_puts(unsigned char *out, long long val); + +/* + * Writes the unsigned int val in the range [0, 18446744073709551616] to the + * buffer out. + * + * out must point to at least 9 bytes. + * + * Returns the number of bytes written, one of {1, 2, 3, 5, 9}. + */ +int msg_putu(unsigned char *out, unsigned long long val); + +/* + * Writes the IEEE 754 single-precision float val to the buffer out. Always + * writes 5 bytes. + * + * out must point to at least 5 bytes. + */ +void msg_putf(unsigned char *out, float val); + +/* + * Writes the IEEE 754 double-precision float val to the buffer out. + * + * out must point to at least 9 bytes. + * + * Returns the number of bytes written, one of {5, 9}. + */ +int msg_putd(unsigned char *out, double val); + +/* + * Writes the string size sz in the range [0, 15] to the buffer out. Values + * outside this range will produce an undefined encoding. Always writes a single + * byte. + * + * In a complete message stream, a size of N must be immediately followed by N + * bytes of the actual string, which must be valid UTF-8. + * + * out must point to at least 1 byte. + * + * It is recommended to use msg_putssz() for arbitrary string sizes. That + * function will produce the smallest possible encoding for any size value. + */ +void msg_putssz5(unsigned char *out, int sz); + +/* + * Writes the string size sz in the range [0, 255] to the buffer out. + * + * In a complete message stream, a size of N must be immediately followed by N + * bytes of the actual string, which must be valid UTF-8. + * + * out must point to at least 2 bytes. + * + * It is recommended to use msg_putssz() for arbitrary string sizes. That + * function will produce the smallest possible encoding for any size value. + */ +int msg_putssz8(unsigned char *out, int sz); + +/* + * Writes the string size sz in the range [0, 65535] to the buffer out. + * + * In a complete message stream, a size of N must be immediately followed by N + * bytes of the actual string, which must be valid UTF-8. + * + * out must point to at least 3 bytes. + * + * It is recommended to use msg_putssz() for arbitrary string sizes. That + * function will produce the smallest possible encoding for any size value. + */ +int msg_putssz16(unsigned char *out, int sz); + +/* + * Writes the string size sz in the range [0, 4294967295] to the buffer out. + * + * In a complete message stream, a size of N must be immediately followed by N + * bytes of the actual string, which must be valid UTF-8. + * + * out must point to at least 5 bytes. + */ +int msg_putssz(unsigned char *out, unsigned int sz); + +/* + * Writes the binary blob size sz in the range [0, 255] to the buffer out. + * Always writes 2 bytes. + * + * In a complete message stream, a size of N must be immediately followed by + * N bytes of the actual data. + * + * out must point to at least 2 bytes. + * + * It is recommended to use msg_putbsz() for arbitrary binary blob sizes. That + * function will produce the smallest possible encoding for any size value. + */ +void msg_putbsz8(unsigned char *out, int sz); + +/* + * Writes the binary blob size sz in the range [0, 65535] to the buffer out. + * + * In a complete message stream, a size of N must be immediately followed by + * N bytes of the actual data. + * + * out must point to at least 3 bytes. + * + * Returns the number of bytes written, one of {1, 2, 3}. + * + * It is recommended to use msg_putbsz() for arbitrary binary blob sizes. That + * function will produce the smallest possible encoding for any size value. + */ +int msg_putbsz16(unsigned char *out, int sz); + +/* + * Writes the binary blob size sz in the range [0, 4294967295] to the buffer out. + * + * In a complete message stream, a size of N must be immediately followed by + * N bytes of the actual data. + * + * out must point to at least 5 bytes. + * + * Returns the number of bytes written, one of {1, 2, 3, 5}. + */ +int msg_putbsz(unsigned char *out, unsigned int sz); + +/* + * Writes the array size sz in the range [0, 15] to the buffer out. Values + * outside this range will produce an undefined encoding. Always writes a single + * byte. + * + * In a complete message stream, a size of N must be immediately followed by N + * other messages, which form the contents of the array. + * + * out must point to at least 1 byte. + * + * It is recommended to use msg_putasz() for arbitrary array sizes. That + * function will produce the smallest possible encoding for any size value. + */ +void msg_putasz4(unsigned char *out, int sz); + +/* + * Writes the array size sz in the range [0, 65535] to the buffer out. + * + * In a complete message stream, a size of N must be immediately followed by N + * other messages, which form the contents of the array. + * + * out must point to at least 3 bytes. + * + * Returns the number of bytes written, one of {1, 3}. + * + * It is recommended to use msg_putasz() for arbitrary array sizes. That + * function will produce the smallest possible encoding for any size value. + */ +int msg_putasz16(unsigned char *out, int sz); + +/* + * Writes the array size sz in the range [0, 4294967295] to the buffer out. + * + * In a complete message stream, a size of N must be immediately followed by N + * other messages, which form the contents of the array. + * + * out must point to at least 5 bytes. + * + * Returns the number of bytes written, one of {1, 3, 5}. + */ +int msg_putasz(unsigned char *out, unsigned int sz); + +/* + * Writes the map size sz in the range [0, 15] to the buffer out. Values + * outside this range will produce an undefined encoding. Always writes a single + * byte. + * + * In a complete message stream, a size of N must be immediately followed by + * N * 2 other messages, which form the contents of the map as keys followed by + * values in alternation. + * + * out must point to at least 1 byte. + * + * It is recommended to use msg_putmsz() for arbitrary map sizes. That function + * will produce the smallest possible encoding for any size value. + */ +void msg_putmsz4(unsigned char *out, int sz); + +/* + * Writes the array size sz in the range [0, 65536] to the buffer out. + * + * In a complete message stream, a size of N must be immediately followed by + * N * 2 other messages, which form the contents of the map as keys followed by + * values in alternation. + * + * out must point to at least 3 bytes. + * + * Returns the number of bytes written, one of {1, 3}. + * + * It is recommended to use msg_putmsz() for arbitrary map sizes. That function + * will produce the smallest possible encoding for any size value. + */ +int msg_putmsz16(unsigned char *out, int sz); + +/* + * Writes the array size sz in the range [0, 4294967295] to the buffer out. + * + * In a complete message stream, a size of N must be immediately followed by + * N * 2 other messages, which form the contents of the map as keys followed by + * values in alternation. + * + * out must point to at least 5 bytes. + * + * Returns the number of bytes written, one of {1, 3, 5}. + */ +int msg_putmsz(unsigned char *out, unsigned int sz); + +#ifdef __cplusplus +} +#endif +#undef _msg_Bool + +#endif + +// vi: sw=4 ts=4 noet tw=80 cc=80 diff --git a/src/crypto.c b/src/crypto.c index f7ccd78..6d0f2aa 100644 --- a/src/crypto.c +++ b/src/crypto.c @@ -3,4 +3,28 @@ #include "3p/monocypher/monocypher.c" #include "3p/monocypher/monocypher-rng.c" +// -- SST-specific extensions to 4.0.1 API below -- +void crypto_aead_lock_djb(u8 *cipher_text, u8 mac[16], const u8 key[32], + const u8 nonce[8], const u8 *ad, size_t ad_size, + const u8 *plain_text, size_t text_size) +{ + crypto_aead_ctx ctx; + crypto_aead_init_djb(&ctx, key, nonce); + crypto_aead_write(&ctx, cipher_text, mac, ad, ad_size, + plain_text, text_size); + crypto_wipe(&ctx, sizeof(ctx)); +} + +int crypto_aead_unlock_djb(u8 *plain_text, const u8 mac[16], const u8 key[32], + const u8 nonce[8], const u8 *ad, size_t ad_size, + const u8 *cipher_text, size_t text_size) +{ + crypto_aead_ctx ctx; + crypto_aead_init_djb(&ctx, key, nonce); + int mismatch = crypto_aead_read(&ctx, plain_text, mac, ad, ad_size, + cipher_text, text_size); + crypto_wipe(&ctx, sizeof(ctx)); + return mismatch; +} + // vi: sw=4 ts=4 noet tw=80 cc=80 diff --git a/src/crypto.h b/src/crypto.h index 44d4fe2..7f0d607 100644 --- a/src/crypto.h +++ b/src/crypto.h @@ -6,6 +6,20 @@ #include "3p/monocypher/monocypher.h" #include "3p/monocypher/monocypher-rng.h" +// -- SST-specific extensions to 4.0.1 API below -- +void crypto_aead_lock_djb(uint8_t *cipher_text, + uint8_t mac [16], + const uint8_t key [32], + const uint8_t nonce[8], + const uint8_t *ad, size_t ad_size, + const uint8_t *plain_text, size_t text_size); +int crypto_aead_unlock_djb(uint8_t *plain_text, + const uint8_t mac [16], + const uint8_t key [32], + const uint8_t nonce[8], + const uint8_t *ad, size_t ad_size, + const uint8_t *cipher_text, size_t text_size); + #endif // vi: sw=4 ts=4 noet tw=80 cc=80 diff --git a/src/democustom.c b/src/democustom.c index 0ecbaa3..5dcbe01 100644 --- a/src/democustom.c +++ b/src/democustom.c @@ -14,9 +14,10 @@ * PERFORMANCE OF THIS SOFTWARE. */ +#include <string.h> + #include "bitbuf.h" #include "con_.h" -#include "democustom.h" #include "demorec.h" #include "engineapi.h" #include "errmsg.h" @@ -26,6 +27,8 @@ #include "mem.h" #include "ppmagic.h" #include "vcall.h" +#include "x86.h" +#include "x86util.h" FEATURE() REQUIRE(demorec) @@ -34,72 +37,64 @@ REQUIRE_GAMEDATA(vtidx_RecordPacket) static int nbits_msgtype, nbits_datalen; -// The engine allows usermessages up to 255 bytes, we add 2 bytes of overhead, -// and then there's the leading bits before that too (see create_message) -static char bb_buf[DEMOCUSTOM_MSG_MAX + 4]; +// engine limit is 255, we use 2 bytes for header + round the bitstream to the +// next whole byte, which gives 3 bytes overhead hence 252 here. +#define CHUNKSZ 252 + +static union { + char x[CHUNKSZ + /*7*/ 8]; // needs to be multiple of of 4! + bitbuf_cell _align; // just in case... +} bb_buf; static struct bitbuf bb = { - bb_buf, sizeof(bb_buf), sizeof(bb_buf) * 8, 0, false, false, "SST" + {bb_buf.x}, sizeof(bb_buf), sizeof(bb_buf) * 8, 0, false, false, "SST" }; -static void create_message(struct bitbuf *msg, const void *buf, int len) { - // The way we pack our custom demo data is via a user message packet with - // type "HudText" - this causes the client to do a text lookup which will - // simply silently fail on invalid keys. By making the first byte null - // (creating an empty string), we get the rest of the packet to stick in - // whatever other data we want. +static const void *createhdr(struct bitbuf *msg, int len, bool last) { + // We pack custom data into user message packets of type "HudText," with a + // leading null byte which the engine treats as an empty string. On demo + // playback, the client does a text lookup which fails silently on invalid + // keys, giving us the rest of the packet to stick in whatever data we want. // - // Notes from Uncrafted: - // > But yeah the data you want to append is as follows: - // > - 6 bits (5 bits in older versions) for the message type - should be 23 - // > for user message - bitbuf_appendbits(msg, 23, nbits_msgtype); - // > - 1 byte for the user message type - should be 2 for HudText - bitbuf_appendbyte(msg, 2); - // > - ~~an int~~ 11 or 12 bits for the length of your data in bits, - bitbuf_appendbits(msg, len * 8, nbits_datalen); // NOTE: assuming len <= 254 - // > - your data - // [first the aforementioned null byte, plus an arbitrary marker byte to - // avoid confusion when parsing the demo later... - bitbuf_appendbyte(msg, 0); - bitbuf_appendbyte(msg, 0xAC); - // ... and then just the data itself] - bitbuf_appendbuf(msg, buf, len); - // Thanks Uncrafted, very cool! + // Big thanks to our resident demo expert, Uncrafted, for explaining what to + // do here way back when this was first being figured out! + bitbuf_appendbits(msg, 23, nbits_msgtype); // type: 23 is user message + bitbuf_appendbyte(msg, 2); // user message type: 2 is HudText + bitbuf_appendbits(msg, len * 8, nbits_datalen); // our data length in bits + bitbuf_appendbyte(msg, 0); // aforementionied null byte + bitbuf_appendbyte(msg, 0xAC + last); // arbitrary marker byte to aid parsing + // store the data itself byte-aligned so there's no need to bitshift the + // universe (which would be both slower and more annoying to do) + bitbuf_roundup(msg); + return msg->buf + (msg->nbits >> 3); } typedef void (*VCALLCONV WriteMessages_func)(void *this, struct bitbuf *msg); static WriteMessages_func WriteMessages = 0; void democustom_write(const void *buf, int len) { - create_message(&bb, buf, len); + for (; len > CHUNKSZ; len -= CHUNKSZ) { + createhdr(&bb, CHUNKSZ, false); + memcpy(bb.buf + (bb.nbits >> 3), buf, CHUNKSZ); + bb.nbits += CHUNKSZ << 3; + WriteMessages(demorecorder, &bb); + bitbuf_reset(&bb); + } + createhdr(&bb, len, true); + memcpy(bb.buf + (bb.nbits >> 3), buf, len); + bb.nbits += len << 3; WriteMessages(demorecorder, &bb); bitbuf_reset(&bb); } static bool find_WriteMessages(void) { - // TODO(compat): rewrite this to just scan for a call instruction! const uchar *insns = (*(uchar ***)demorecorder)[vtidx_RecordPacket]; - // RecordPacket calls WriteMessages pretty much right away: - // 56 push esi - // 57 push edi - // 8B F1 mov esi,ecx - // 8D BE lea edi,[esi + 0x68c] - // 8C 06 00 00 - // 57 push edi - // E8 call CDemoRecorder_WriteMessages - // B0 EF FF FF - // So we just double check the byte pattern... - static const uchar bytes[] = -#ifdef _WIN32 - HEXBYTES(56, 57, 8B, F1, 8D, BE, 8C, 06, 00, 00, 57, E8); -#else -#warning This is possibly different on Linux too, have a look! - {-1, -1, -1, -1, -1, -1}; -#endif - if (!memcmp(insns, bytes, sizeof(bytes))) { - ssize off = mem_loadoffset(insns + sizeof(bytes)); - WriteMessages = (WriteMessages_func)(insns + sizeof(bytes) + 4 + off); - return true; + // RecordPacket calls WriteMessages right away, so just look for a call + for (const uchar *p = insns; p - insns < 32;) { + if (*p == X86_CALL) { + WriteMessages = (WriteMessages_func)(p + 5 + mem_loadoffset(p + 1)); + return true; + } + NEXT_INSN(p, "WriteMessages function"); } return false; } @@ -108,23 +103,21 @@ DECL_VFUNC_DYN(int, GetEngineBuildNumber) INIT { // More UncraftedkNowledge: - // > yeah okay so [the usermessage length is] 11 bits if the demo protocol - // > is 11 or if the game is l4d2 and the network protocol is 2042. - // > otherwise it's 12 bits - // > there might be some other l4d2 versions where it's 11 but idk + // - usermessage length is: + // - 11 bits in protocol 11, or l4d2 protocol 2042 + // - otherwise 12 bits // So here we have to figure out the network protocol version! // NOTE: assuming engclient != null as GEBN index relies on client version int buildnum = GetEngineBuildNumber(engclient); - // condition is redundant until other GetEngineBuildNumber offsets are added - // if (GAMETYPE_MATCHES(L4D2)) { + //if (GAMETYPE_MATCHES(L4D2)) { // redundant until we add more GEBN offsets! nbits_msgtype = 6; // based on Some Code I Read, buildnum *should* be the protocol version, // however L4D2 returns the actual game version instead, because sure // why not. The only practical difference though is that the network - // protocol froze after 2042, so we just have to do a >=. No big deal - // really. + // protocol froze after 2042, so we just have to do a >=. Fair enough! + // TODO(compat): how does TLS affect this? no idea yet if (buildnum >= 2042) nbits_datalen = 11; else nbits_datalen = 12; - // } + //} return find_WriteMessages(); } diff --git a/src/democustom.h b/src/democustom.h index a0a28b8..7f0c25e 100644 --- a/src/democustom.h +++ b/src/democustom.h @@ -17,12 +17,9 @@ #ifndef INC_DEMOCUSTOM_H #define INC_DEMOCUSTOM_H -/* maximum length of a custom demo message, in bytes */ -#define DEMOCUSTOM_MSG_MAX 253 - /* - * Write a block of up to DEMOWRITER_MSG_MAX bytes into the currently recording - * demo - NOT bounds checked, caller MUST ensure length is okay! + * Writes a custom demo message, automatically splitting into multiple demo + * packets if too long. Assumes a demo is currently being recorded. */ void democustom_write(const void *buf, int len); diff --git a/src/demorec.c b/src/demorec.c index e176ab3..e728ca5 100644 --- a/src/demorec.c +++ b/src/demorec.c @@ -50,6 +50,10 @@ bool demorec_forceauto = false; #define SIGNONSTATE_SPAWN 5 #define SIGNONSTATE_FULL 6 +DEF_PREDICATE(DemoControlAllowed, void) +DEF_EVENT(DemoRecordStarting, void) +DEF_EVENT(DemoRecordStopped, int) + typedef void (*VCALLCONV SetSignonState_func)(void *, int); static SetSignonState_func orig_SetSignonState; static void VCALLCONV hook_SetSignonState(void *this_, int state) { @@ -83,6 +87,9 @@ static void VCALLCONV hook_StopRecording(void *this) { *recording = true; *demonum = lastnum; } + else { + EMIT_DemoRecordStopped(lastnum); + } } DECL_VFUNC_DYN(void, StartRecording) @@ -90,10 +97,8 @@ DECL_VFUNC_DYN(void, StartRecording) static struct con_cmd *cmd_record, *cmd_stop; static con_cmdcb orig_record_cb, orig_stop_cb; -DEF_PREDICATE(AllowDemoControl, void) - static void hook_record_cb(const struct con_cmdargs *args) { - if (!CHECK_AllowDemoControl()) return; + if (!CHECK_DemoControlAllowed()) return; bool was = *recording; if (!was && args->argc == 2 || args->argc == 3) { // safety check: make sure a directory exists, otherwise recording @@ -152,10 +157,11 @@ static void hook_record_cb(const struct con_cmdargs *args) { // mike: I think this is questionably necessary but I'm outvoted :) con_msg("Demo recording started\n"); } + EMIT_DemoRecordStarting(); } static void hook_stop_cb(const struct con_cmdargs *args) { - if (!CHECK_AllowDemoControl()) return; + if (!CHECK_DemoControlAllowed()) return; wantstop = true; orig_stop_cb(args); wantstop = false; @@ -229,6 +235,7 @@ bool demorec_start(const char *name) { struct con_cmdargs args = {.argc = 2, .argv = {0, name, 0}}; orig_record_cb(&args); if (!was && *recording) *demonum = 0; // same logic as in the hook + EMIT_DemoRecordStarting(); return *recording; } @@ -237,11 +244,12 @@ int demorec_stop(void) { // making this correct when recording and stopping in the menu lol int ret = *demonum; orig_StopRecording(demorecorder); + EMIT_DemoRecordStopped(ret); return ret; } -bool demorec_recording(void) { - return *recording; +int demorec_demonum(void) { + return *recording ? *demonum : -1; } INIT { diff --git a/src/demorec.h b/src/demorec.h index 2de4f24..b53896f 100644 --- a/src/demorec.h +++ b/src/demorec.h @@ -52,21 +52,38 @@ bool demorec_start(const char *name); /* * Stops recording the current demo and returns the number of demos recorded * (the first will have the original basename + .dem extension; the rest will - * have the _N.dem suffixes). + * have the _N.dem suffixes). Value will be zero if the recording is stopped + * before the game has even gotten a chance to create the first demo file. */ int demorec_stop(void); /* - * Queries whether a demo is currently being recorded. + * Returns the current number in the recording sequence, or -1 if not recording. + * Value may be 0 if recording was requested but has yet to start (say, because + * we have yet to join a map). */ -bool demorec_recording(void); +int demorec_demonum(void); /* * Used to determine whether to allow usage of the normal record and stop * commands. Code which takes over control of demo recording can use this to * block the user from interfering. */ -DECL_PREDICATE(AllowDemoControl) +DECL_PREDICATE(DemoControlAllowed, void) + +/* + * Emitted whenever a recording session is about to be started, as a result of + * either the record command or a call to the demorec_start() function. A demo + * file won't actually have been created yet; this merely indicates that a + * request to record has happened. + */ +DECL_EVENT(DemoRecordStarting, void) + +/* + * Emitted when the current demo or series of demos has finished recording. + * Receives the number of recorded demo files (which could be 0) as an argument. + */ +DECL_EVENT(DemoRecordStopped, int) #endif diff --git a/src/engineapi.c b/src/engineapi.c index f4a54d6..24a2d6b 100644 --- a/src/engineapi.c +++ b/src/engineapi.c @@ -41,6 +41,7 @@ DECL_VFUNC(void *, GetGlobalVars, 1) // seems to be very stable, thank goodness void *globalvars; void *inputsystem, *vgui; +struct CServerPlugin *pluginhandler; DECL_VFUNC_DYN(void *, GetAllServerClasses) @@ -48,6 +49,7 @@ DECL_VFUNC_DYN(void *, GetAllServerClasses) bool engineapi_init(int pluginver) { if (!con_detect(pluginver)) return false; + pluginhandler = factory_engine("ISERVERPLUGINHELPERS001", 0); if (engclient = factory_engine("VEngineClient015", 0)) { _gametype_tag |= _gametype_tag_Client015; diff --git a/src/engineapi.h b/src/engineapi.h index d740c8c..fbc062b 100644 --- a/src/engineapi.h +++ b/src/engineapi.h @@ -136,6 +136,23 @@ extern struct VEngineServer *engserver; extern void *globalvars; extern void *inputsystem, *vgui; +// XXX: not exactly engine *API* but not curently clear where else to put this +struct CPlugin { + char description[128]; + bool paused; + void *theplugin; // our own "this" pointer (or whichever other plugin it is) + int ifacever; + // should be the plugin library, but in old Source branches it's just null, + // because CServerPlugin::Load() erroneously shadows this field with a local + void *module; +}; +struct CServerPlugin /* : IServerPluginHelpers */ { + void **vtable; + struct CUtlVector plugins; + /*IPluginHelpersCheck*/ void *pluginhlpchk; +}; +extern struct CServerPlugin *pluginhandler; + /* * Called on plugin init to attempt to initialise various core interfaces. * This includes console/cvar initialisation and populating gametype and diff --git a/src/os-win32.h b/src/os-win32.h index a006083..db20964 100644 --- a/src/os-win32.h +++ b/src/os-win32.h @@ -50,10 +50,10 @@ typedef unsigned short os_char; #define OS_MAIN wmain -static inline void *os_dlopen(const ushort *name) { +static inline void *os_dlopen(const unsigned short *name) { return LoadLibraryW(name); } -static inline void *os_dlhandle(const ushort *name) { +static inline void *os_dlhandle(const unsigned short *name) { return GetModuleHandleW(name); } static inline void *os_dlsym(void *m, const char *s) { diff --git a/src/rinput.c b/src/rinput.c index 6b6d4d7..79b6661 100644 --- a/src/rinput.c +++ b/src/rinput.c @@ -64,7 +64,7 @@ DEF_CVAR_UNREG(m_rawinput, "Use Raw Input for mouse input (SST reimplementation) DEF_CVAR_MINMAX(sst_mouse_factor, "Number of hardware mouse counts per step", 1, 1, 20, /*CON_ARCHIVE |*/ CON_HIDDEN) -static ssize __stdcall inproc(void *wnd, uint msg, ssize wp, ssize lp) { +static ssize __stdcall inproc(void *wnd, uint msg, usize wp, ssize lp) { switch (msg) { case WM_INPUT:; char buf[sizeof(RAWINPUTHEADER) + sizeof(RAWMOUSE) /* = 40 */]; @@ -30,6 +30,7 @@ #include "gametype.h" #include "hook.h" #include "os.h" +#include "sst.h" #include "vcall.h" #include "version.h" @@ -46,10 +47,13 @@ static int ifacever; void *clientlib = 0; bool sst_earlyloaded = false; // see deferinit() below +bool sst_userunloaded = false; // see hook_plugin_unload_cb() below + +#define VDFBASENAME "SourceSpeedrunTools" #ifdef _WIN32 extern long __ImageBase; // this is actually the PE header struct but don't care -#define ownhandle() ((void *)&__ImageBase) +static inline void *ownhandle(void) { return &__ImageBase; } #else // sigh, _GNU_SOURCE crap. define here instead >:( typedef struct { @@ -59,16 +63,16 @@ typedef struct { void *dli_saddr; } Dl_info; int dladdr1(const void *addr, Dl_info *info, void **extra_info, int flags); -static inline void *ownhandle(void) { +static void *ownhandle(void) { + static void *cached = 0; Dl_info dontcare; - void *dl; - dladdr1((void *)&ownhandle, &dontcare, &dl, /*RTLD_DL_LINKMAP*/ 2); - return dl; + if (!cached) { + dladdr1((void *)&ownhandle, &dontcare, &cached, /*RTLD_DL_LINKMAP*/ 2); + } + return cached; } #endif -#define VDFBASENAME "SourceSpeedrunTools" - #ifdef _WIN32 // not a proper check, just a short-circuit check to avoid doing more work. static inline bool checksamedrive(const ushort *restrict path1, @@ -305,6 +309,45 @@ e: con_warn("!!! SOME FEATURES MAY BE BROKEN !!!\n"); return false; } +DEF_PREDICATE(AllowPluginLoading, bool) +DEF_EVENT(PluginLoaded, void) +DEF_EVENT(PluginUnloaded, void) + +static struct con_cmd *cmd_plugin_load, *cmd_plugin_unload; +static con_cmdcb orig_plugin_load_cb, orig_plugin_unload_cb; + +static int ownidx; // XXX: super hacky way of getting this to do_unload() + +static void hook_plugin_load_cb(const struct con_cmdargs *args) { + if (args->argc == 1) return; + if (!CHECK_AllowPluginLoading(true)) return; + orig_plugin_load_cb(args); + EMIT_PluginLoaded(); +} +static void hook_plugin_unload_cb(const struct con_cmdargs *args) { + if (args->argc == 1) return; + if (!CHECK_AllowPluginLoading(false)) return; + int idx = atoi(args->argv[1]); + struct CPlugin **plugins = pluginhandler->plugins.m.mem; + if (idx >= 0 && idx < pluginhandler->plugins.sz && + plugins[idx]->theplugin == &plugin_obj) { + sst_userunloaded = true; + ownidx = idx; +#ifdef __clang__ + // thanks clang for forcing use of return here and THEN warning about it +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpedantic" + __attribute__((musttail)) return orig_plugin_unload_cb(args); +#pragma clang diagnostic pop +#else +#error We are tied to clang without an assembly solution for this! +#endif + } + // if it's some other plugin being unloaded, we can keep doing stuff after + orig_plugin_unload_cb(args); + EMIT_PluginUnloaded(); +} + static bool do_load(ifacefactory enginef, ifacefactory serverf) { if (!hook_init()) { errmsg_warnsys("couldn't set up memory for function hooking"); @@ -326,46 +369,34 @@ static bool do_load(ifacefactory enginef, ifacefactory serverf) { *p++ = (void *)&nop_p_v; // OnEdictAllocated *p = (void *)&nop_p_v; // OnEdictFreed if (!deferinit()) { do_featureinit(); fixes_apply(); } + if (pluginhandler) { + cmd_plugin_load = con_findcmd("plugin_load"); + orig_plugin_load_cb = cmd_plugin_load->cb; + cmd_plugin_load->cb = &hook_plugin_load_cb; + cmd_plugin_unload = con_findcmd("plugin_unload"); + orig_plugin_unload_cb = cmd_plugin_unload->cb; + cmd_plugin_unload->cb = &hook_plugin_unload_cb; + } return true; } -struct CServerPlugin /* : IServerPluginHelpers */ { - void **vtable; - struct CUtlVector plugins; - /*IPluginHelpersCheck*/ void *pluginhlpchk; -}; -struct CPlugin { - char description[128]; - bool paused; - void *theplugin; // our own "this" pointer (or whichever other plugin it is) - int ifacever; - // should be the plugin library, but in old Source branches it's just null, - // because CServerPlugin::Load() erroneously shadows this field with a local - void *module; -}; - static void do_unload(void) { #ifdef _WIN32 // this is only relevant in builds that predate linux support - struct CServerPlugin *pluginhandler = - factory_engine("ISERVERPLUGINHELPERS001", 0); - if (pluginhandler) { // if not, oh well too bad we tried :^) - struct CPlugin **plugins = pluginhandler->plugins.m.mem; - int n = pluginhandler->plugins.sz; - for (struct CPlugin **pp = plugins; pp - plugins < n; ++pp) { - if ((*pp)->theplugin == (void *)&plugin_obj) { - // see comment in CPlugin above. setting this to the real handle - // right before the engine tries to unload us allows it to - // actually do so. in newer branches this is redundant but - // doesn't do any harm so it's just unconditional. - // NOTE: old engines ALSO just leak the handle and never call - // Unload() if Load() fails; can't really do anything about that - (*pp)->module = ownhandle(); - break; - } + if (pluginhandler) { // if not, oh well too bad :^) + cmd_plugin_load->cb = orig_plugin_load_cb; + cmd_plugin_unload->cb = orig_plugin_unload_cb; + if (sst_userunloaded) { + struct CPlugin **plugins = pluginhandler->plugins.m.mem; + // see comment in CPlugin above. setting this to the real handle + // right before the engine tries to unload us allows it to actually + // do so. in newer branches this is redundant but doesn't do any + // harm so it's just unconditional. NOTE: old engines ALSO just leak + // the handle and never call Unload() if Load() fails; can't really + // do anything about that. + plugins[ownidx]->module = ownhandle(); } } #endif - endfeatures(); #ifdef __linux__ if (clientlib) dlclose(clientlib); @@ -24,10 +24,17 @@ DECL_EVENT(ClientActive, struct edict */*player*/) DECL_EVENT(Tick, bool /*simulating*/) +DECL_PREDICATE(AllowPluginLoading, void) +DECL_EVENT(PluginLoaded, void) +DECL_EVENT(PluginUnloaded, void) + extern void *clientlib; /* occasionally useful: quick query to determine how sst was loaded */ extern bool sst_earlyloaded; +/* similar query for how we are being unloaded - ONLY valid during unload */ +// TODO(opt): we can skip a whole bunch of cleanup when exiting the game! +extern bool sst_userunloaded; #endif |