From 43c64eee8dd08d61d029be5a30c0edc098d282ab Mon Sep 17 00:00:00 2001
From: Michael Smith <mikesmiffy128@gmail.com>
Date: Sat, 7 Sep 2024 12:57:38 +0100
Subject: Un-break and re-fix x86

The last fix was, uh, not good. With any luck this is actually correct
now. Certainly, running many millions of test cases fails to find any
mismatch with udis, so it's at least a lot less wrong than it was.
---
 src/x86.c       |  4 ++--
 src/x86.h       | 17 +++++++++++------
 test/x86.test.c |  9 +++++++--
 tools/x86test.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 10 deletions(-)
 create mode 100644 tools/x86test.c

diff --git a/src/x86.c b/src/x86.c
index e0431d6..5399af8 100644
--- a/src/x86.c
+++ b/src/x86.c
@@ -25,7 +25,6 @@ static int mrmsib(const uchar *p, int addrlen) {
 	// But it's confusingly-written enough that the code I wrote before didn't
 	// work, so with any luck nobody will need to refer to it again and this is
 	// actually correct now. Fingers crossed.
-	if ((*p & 0xC6) == 0x06) return 3; // special case for disp16
 	if (addrlen == 4 || *p & 0xC0) {
 		int sib = addrlen == 4 && *p < 0xC0 && (*p & 7) == 4;
 		switch (*p & 0xC0) {
@@ -41,7 +40,7 @@ static int mrmsib(const uchar *p, int addrlen) {
 			case 0x80: return 1 + addrlen + sib;
 		}
 	}
-	if (addrlen == 2 && *p == 0x26) return 3;
+	if (addrlen == 2 && (*p & 0xC7) == 0x06) return 3;
 	return 1; // note: include the mrm itself in the byte count
 }
 
@@ -66,6 +65,7 @@ P:		X86_SEG_PREFIXES(CASES)
 		X86_OPS_1BYTE_NO(CASES) return pfxlen + 1;
 		X86_OPS_1BYTE_I8(CASES) operandlen = 1;
 		X86_OPS_1BYTE_IW(CASES) return pfxlen + 1 + operandlen;
+		X86_OPS_1BYTE_IWI(CASES) return pfxlen + 1 + addrlen;
 		X86_OPS_1BYTE_I16(CASES) return pfxlen + 3;
 		X86_OPS_1BYTE_MRM(CASES) return pfxlen + 1 + mrmsib(insn + 1, addrlen);
 		X86_OPS_1BYTE_MRM_I8(CASES) operandlen = 1;
diff --git a/src/x86.h b/src/x86.h
index 52e4f9b..b4df9c8 100644
--- a/src/x86.h
+++ b/src/x86.h
@@ -25,6 +25,9 @@
  */
 
 // XXX: no BOUND (0x62): ambiguous with EVEX prefix - can't be arsed!
+// XXX: no LES (0xC4) or DES (0xC5) either for similar reasons. better to report
+// an unknown instruction than to potentially misinterpret an AVX thing.
+// these are all legacy instructions that won't really be used much anyway.
 
 /* Instruction prefixes: segments */
 #define X86_SEG_PREFIXES(X) \
@@ -188,10 +191,6 @@
 	X(X86_XOREAXI,  0x35) \
 	X(X86_CMPEAXI,  0x3D) \
 	X(X86_PUSHIW,   0x68) \
-	X(X86_MOVALII,  0xA0) /* From offset (indirect) */ \
-	X(X86_MOVEAXII, 0xA1) /* From offset (indirect) */ \
-	X(X86_MOVIIAL,  0xA2) /* To offset (indirect) */ \
-	X(X86_MOVIIEAX, 0xA3) /* To offset (indirect) */ \
 	X(X86_TESTEAXI, 0xA9) \
 	X(X86_MOVEAXI,  0xB8) \
 	X(X86_MOVECXI,  0xB9) \
@@ -204,6 +203,13 @@
 	X(X86_CALL,     0xE8) \
 	X(X86_JMPIW,    0xE9)
 
+/* Single-byte opcodes with a word-sized immediate operand (indirect) */
+#define X86_OPS_1BYTE_IWI(X) \
+	X(X86_MOVALII,  0xA0) /* From offset (indirect) */ \
+	X(X86_MOVEAXII, 0xA1) /* From offset (indirect) */ \
+	X(X86_MOVIIAL,  0xA2) /* To offset (indirect) */ \
+	X(X86_MOVIIEAX, 0xA3) /* To offset (indirect) */ \
+
 /* Single-byte opcodes with 16-bit immediate operands, regardless of prefixes */
 #define X86_OPS_1BYTE_I16(X) \
 	X(X86_RETI16,  0xC2) \
@@ -259,8 +265,6 @@
 	X(X86_LEA,       0x8D) \
 	X(X86_MOVSM,     0x8E) /* Store 4 bytes to segment register */ \
 	X(X86_POPM,      0x8F) \
-	X(X86_LES,       0xC4) \
-	X(X86_LDS,       0xC5) \
 	X(X86_SHIFTM18,  0xD0) /* Shift/roll by 1 place */ \
 	X(X86_SHIFTM1W,  0xD1) /* Shift/roll by 1 place */ \
 	X(X86_SHIFTMCL8, 0xD2) /* Shift/roll by CL places */ \
@@ -297,6 +301,7 @@
 	X86_OPS_1BYTE_NO(X) \
 	X86_OPS_1BYTE_I8(X) \
 	X86_OPS_1BYTE_IW(X) \
+	X86_OPS_1BYTE_IWI(X) \
 	X86_OPS_1BYTE_I16(X) \
 	X86_OPS_1BYTE_MRM(X) \
 	X86_OPS_1BYTE_MRM_I8(X) \
diff --git a/test/x86.test.c b/test/x86.test.c
index c0c825a..bf6e6e8 100644
--- a/test/x86.test.c
+++ b/test/x86.test.c
@@ -37,9 +37,14 @@ TEST("mov AL, moff8 instructions should be decoded correctly") {
 	return true;
 }
 
-TEST("fiadd [off16] instructions should be decoded correctly") {
+TEST("16-bit MRM instructions should be decoded correctly") {
 	const uchar fiadd_off16[] = HEXBYTES(67, DA, 06, DF, 11);
-	return x86_len(fiadd_off16) == 5;
+	const uchar fld_tword[] = HEXBYTES(67, DB, 2E, 99, C4);
+	const uchar add_off16_bl[] = HEXBYTES(67, 00, 1E, F5, BB);
+	if (x86_len(fiadd_off16) != 5) return false;
+	if (x86_len(fld_tword) != 5) return false;
+	if (x86_len(add_off16_bl) != 5) return false;
+	return true;
 }
 
 // vi: sw=4 ts=4 noet tw=80 cc=80
diff --git a/tools/x86test.c b/tools/x86test.c
new file mode 100644
index 0000000..18fc72f
--- /dev/null
+++ b/tools/x86test.c
@@ -0,0 +1,43 @@
+/* This file is dedicated to the public domain. */
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "../src/udis86.h"
+#include "../src/udis86.c"
+#include "../src/intdefs.h"
+#include "../src/x86.h"
+#include "../src/x86.c"
+#include "../src/os.h"
+#include "../src/os.c"
+
+/*
+ * Quick hacked-up test program to more exhaustively test x86.c. This is not run
+ * as part of the build; it is just here for development and reference purposes.
+ */
+
+int main(void) {
+	uchar buf[15];
+	int bad = 0;
+	for (int i = 0; i < 100000000 && bad < 30; ++i) {
+		os_randombytes(buf, sizeof(buf));
+		struct ud u;
+		ud_init(&u);
+		ud_set_mode(&u, 32);
+		ud_set_input_buffer(&u, buf, sizeof(buf));
+		ud_set_syntax(&u, UD_SYN_INTEL);
+		int len = ud_disassemble(&u);
+		if (len && ud_insn_mnemonic(&u) != UD_Iinvalid) {
+			int mylen = x86_len(buf);
+			if (mylen != -1 && mylen != len) {
+				++bad;
+				fprintf(stderr, "Uh oh! %s\nExp: %d\nGot: %d\nBytes:",
+						ud_insn_asm(&u), len, mylen);
+				for (int i = 0; i < len; ++i) fprintf(stderr, " %02X", buf[i]);
+				fputs("\n\n", stderr);
+			}
+		}
+	}
+	fprintf(stderr, "%d bad cases\n", bad);
+}
+
-- 
cgit v1.2.3