Un-break and re-fix x86

The last fix was, uh, not good. With any luck this is actually correct now. Certainly, running many millions of test cases fails to find any mismatch with udis, so it's at least a lot less wrong than it was.
author: Michael Smith <mikesmiffy128@gmail.com> 2024-09-07 12:57:38 +0100
committer: Michael Smith <mikesmiffy128@gmail.com> 2024-09-07 12:57:38 +0100
commit: 43c64eee8dd08d61d029be5a30c0edc098d282ab (patch)
tree: a71e412b1fefd3abf89093ca4830a5cf3ba1c46e
parent: 8bb4226f07b1e9ee79f3429a1495eaa694b13334 (diff)
4 files changed, 63 insertions, 10 deletions
diff --git a/src/x86.c b/src/x86.c
index e0431d6..5399af8 100644
--- a/src/x86.c
+++ b/src/x86.c
@@ -25,7 +25,6 @@ static int mrmsib(const uchar *p, int addrlen) {
 	// But it's confusingly-written enough that the code I wrote before didn't
 	// work, so with any luck nobody will need to refer to it again and this is
 	// actually correct now. Fingers crossed.
-	if ((*p & 0xC6) == 0x06) return 3; // special case for disp16
 	if (addrlen == 4 || *p & 0xC0) {
 		int sib = addrlen == 4 && *p < 0xC0 && (*p & 7) == 4;
 		switch (*p & 0xC0) {
@@ -41,7 +40,7 @@ static int mrmsib(const uchar *p, int addrlen) {
 			case 0x80: return 1 + addrlen + sib;
 		}
 	}
-	if (addrlen == 2 && *p == 0x26) return 3;
+	if (addrlen == 2 && (*p & 0xC7) == 0x06) return 3;
 	return 1; // note: include the mrm itself in the byte count
 }
 
@@ -66,6 +65,7 @@ P:		X86_SEG_PREFIXES(CASES)
 		X86_OPS_1BYTE_NO(CASES) return pfxlen + 1;
 		X86_OPS_1BYTE_I8(CASES) operandlen = 1;
 		X86_OPS_1BYTE_IW(CASES) return pfxlen + 1 + operandlen;
+		X86_OPS_1BYTE_IWI(CASES) return pfxlen + 1 + addrlen;
 		X86_OPS_1BYTE_I16(CASES) return pfxlen + 3;
 		X86_OPS_1BYTE_MRM(CASES) return pfxlen + 1 + mrmsib(insn + 1, addrlen);
 		X86_OPS_1BYTE_MRM_I8(CASES) operandlen = 1;
diff --git a/src/x86.h b/src/x86.h
index 52e4f9b..b4df9c8 100644
--- a/src/x86.h
+++ b/src/x86.h
@@ -25,6 +25,9 @@
  */
 
 // XXX: no BOUND (0x62): ambiguous with EVEX prefix - can't be arsed!
+// XXX: no LES (0xC4) or DES (0xC5) either for similar reasons. better to report
+// an unknown instruction than to potentially misinterpret an AVX thing.
+// these are all legacy instructions that won't really be used much anyway.
 
 /* Instruction prefixes: segments */
 #define X86_SEG_PREFIXES(X) \
@@ -188,10 +191,6 @@
 	X(X86_XOREAXI,  0x35) \
 	X(X86_CMPEAXI,  0x3D) \
 	X(X86_PUSHIW,   0x68) \
-	X(X86_MOVALII,  0xA0) /* From offset (indirect) */ \
-	X(X86_MOVEAXII, 0xA1) /* From offset (indirect) */ \
-	X(X86_MOVIIAL,  0xA2) /* To offset (indirect) */ \
-	X(X86_MOVIIEAX, 0xA3) /* To offset (indirect) */ \
 	X(X86_TESTEAXI, 0xA9) \
 	X(X86_MOVEAXI,  0xB8) \
 	X(X86_MOVECXI,  0xB9) \
@@ -204,6 +203,13 @@
 	X(X86_CALL,     0xE8) \
 	X(X86_JMPIW,    0xE9)
 
+/* Single-byte opcodes with a word-sized immediate operand (indirect) */
+#define X86_OPS_1BYTE_IWI(X) \
+	X(X86_MOVALII,  0xA0) /* From offset (indirect) */ \
+	X(X86_MOVEAXII, 0xA1) /* From offset (indirect) */ \
+	X(X86_MOVIIAL,  0xA2) /* To offset (indirect) */ \
+	X(X86_MOVIIEAX, 0xA3) /* To offset (indirect) */ \
+
 /* Single-byte opcodes with 16-bit immediate operands, regardless of prefixes */
 #define X86_OPS_1BYTE_I16(X) \
 	X(X86_RETI16,  0xC2) \
@@ -259,8 +265,6 @@
 	X(X86_LEA,       0x8D) \
 	X(X86_MOVSM,     0x8E) /* Store 4 bytes to segment register */ \
 	X(X86_POPM,      0x8F) \
-	X(X86_LES,       0xC4) \
-	X(X86_LDS,       0xC5) \
 	X(X86_SHIFTM18,  0xD0) /* Shift/roll by 1 place */ \
 	X(X86_SHIFTM1W,  0xD1) /* Shift/roll by 1 place */ \
 	X(X86_SHIFTMCL8, 0xD2) /* Shift/roll by CL places */ \
@@ -297,6 +301,7 @@
 	X86_OPS_1BYTE_NO(X) \
 	X86_OPS_1BYTE_I8(X) \
 	X86_OPS_1BYTE_IW(X) \
+	X86_OPS_1BYTE_IWI(X) \
 	X86_OPS_1BYTE_I16(X) \
 	X86_OPS_1BYTE_MRM(X) \
 	X86_OPS_1BYTE_MRM_I8(X) \
diff --git a/test/x86.test.c b/test/x86.test.c
index c0c825a..bf6e6e8 100644
--- a/test/x86.test.c
+++ b/test/x86.test.c
@@ -37,9 +37,14 @@ TEST("mov AL, moff8 instructions should be decoded correctly") {
 	return true;
 }
 
-TEST("fiadd [off16] instructions should be decoded correctly") {
+TEST("16-bit MRM instructions should be decoded correctly") {
 	const uchar fiadd_off16[] = HEXBYTES(67, DA, 06, DF, 11);
-	return x86_len(fiadd_off16) == 5;
+	const uchar fld_tword[] = HEXBYTES(67, DB, 2E, 99, C4);
+	const uchar add_off16_bl[] = HEXBYTES(67, 00, 1E, F5, BB);
+	if (x86_len(fiadd_off16) != 5) return false;
+	if (x86_len(fld_tword) != 5) return false;
+	if (x86_len(add_off16_bl) != 5) return false;
+	return true;
 }
 
 // vi: sw=4 ts=4 noet tw=80 cc=80
diff --git a/tools/x86test.c b/tools/x86test.c
new file mode 100644
index 0000000..18fc72f
--- /dev/null
+++ b/tools/x86test.c
@@ -0,0 +1,43 @@
+/* This file is dedicated to the public domain. */
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "../src/udis86.h"
+#include "../src/udis86.c"
+#include "../src/intdefs.h"
+#include "../src/x86.h"
+#include "../src/x86.c"
+#include "../src/os.h"
+#include "../src/os.c"
+
+/*
+ * Quick hacked-up test program to more exhaustively test x86.c. This is not run
+ * as part of the build; it is just here for development and reference purposes.
+ */
+
+int main(void) {
+	uchar buf[15];
+	int bad = 0;
+	for (int i = 0; i < 100000000 && bad < 30; ++i) {
+		os_randombytes(buf, sizeof(buf));
+		struct ud u;
+		ud_init(&u);
+		ud_set_mode(&u, 32);
+		ud_set_input_buffer(&u, buf, sizeof(buf));
+		ud_set_syntax(&u, UD_SYN_INTEL);
+		int len = ud_disassemble(&u);
+		if (len && ud_insn_mnemonic(&u) != UD_Iinvalid) {
+			int mylen = x86_len(buf);
+			if (mylen != -1 && mylen != len) {
+				++bad;
+				fprintf(stderr, "Uh oh! %s\nExp: %d\nGot: %d\nBytes:",
+						ud_insn_asm(&u), len, mylen);
+				for (int i = 0; i < len; ++i) fprintf(stderr, " %02X", buf[i]);
+				fputs("\n\n", stderr);
+			}
+		}
+	}
+	fprintf(stderr, "%d bad cases\n", bad);
+}
+
author	Michael Smith <mikesmiffy128@gmail.com>	2024-09-07 12:57:38 +0100
committer	Michael Smith <mikesmiffy128@gmail.com>	2024-09-07 12:57:38 +0100
commit	43c64eee8dd08d61d029be5a30c0edc098d282ab (patch)
tree	a71e412b1fefd3abf89093ca4830a5cf3ba1c46e
parent	8bb4226f07b1e9ee79f3429a1495eaa694b13334 (diff)