27 files changed, 7700 insertions, 2 deletions
diff --git a/src/ARM.cpp b/src/ARM.cpp
index 68cac59..f2b92b4 100644
--- a/src/ARM.cpp
+++ b/src/ARM.cpp
@@ -22,6 +22,7 @@
 #include "ARM.h"
 #include "ARMInterpreter.h"
 #include "AREngine.h"
+#include "ARMJIT.h"
 
 
 // instruction timing notes
@@ -524,7 +525,7 @@ void ARMv5::Execute()
 
     while (NDS::ARM9Timestamp < NDS::ARM9Target)
     {
-        if (CPSR & 0x20) // THUMB
+        /*if (CPSR & 0x20) // THUMB
         {
             // prefetch
             R[15] += 2;
@@ -557,7 +558,15 @@ void ARMv5::Execute()
             }
             else
                 AddCycles_C();
-        }
+        }*/
+
+        if (!ARMJIT::IsMapped(Num, R[15] - ((CPSR&0x20)?2:4)))
+            printf("aaarg ungempappter raum %x\n", R[15]);
+
+        ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(Num, R[15] - ((CPSR&0x20)?2:4));
+        if (block == NULL)
+            block = ARMJIT::CompileBlock(this);
+        Cycles += block();
 
         // TODO optimize this shit!!!
         if (Halted)
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
new file mode 100644
index 0000000..489cdcf
--- /dev/null
+++ b/src/ARMJIT.cpp
@@ -0,0 +1,177 @@
+#include "ARMJIT.h"
+
+#include "ARMJIT_x64/ARMJIT_Compiler.h"
+
+namespace ARMJIT
+{
+
+Compiler* compiler;
+BlockCache cache;
+
+
+#define DUP2(x) x, x
+
+static ptrdiff_t JIT_MEM[2][32] = { // [cpu][address bits 27..23]: byte offset of the backing table inside BlockCache for that 8 MB slice, or -1 when the slice has no table
+	//arm9
+	{
+		/* 0X*/	DUP2(offsetof(BlockCache, ARM9_ITCM)),
+		/* 1X*/	DUP2(offsetof(BlockCache, ARM9_ITCM)), // mirror
+		/* 2X*/	DUP2(offsetof(BlockCache, MainRAM)),
+		/* 3X*/	DUP2(offsetof(BlockCache, SWRAM)),
+		/* 4X*/	DUP2(-1),
+		/* 5X*/	DUP2(-1),
+		/* 6X*/		 -1, 
+					 offsetof(BlockCache, ARM9_LCDC),   // Plain ARM9-CPU Access (LCDC mode) (max 656KB)
+		/* 7X*/	DUP2(-1),
+		/* 8X*/	DUP2(-1),
+		/* 9X*/	DUP2(-1),
+		/* AX*/	DUP2(-1),
+		/* BX*/	DUP2(-1),
+		/* CX*/	DUP2(-1),
+		/* DX*/	DUP2(-1),
+		/* EX*/	DUP2(-1),
+		/* FX*/	DUP2(offsetof(BlockCache, ARM9_BIOS))
+	},
+	//arm7
+	{
+		/* 0X*/	DUP2(offsetof(BlockCache, ARM7_BIOS)),
+		/* 1X*/	DUP2(-1),
+		/* 2X*/	DUP2(offsetof(BlockCache, MainRAM)),
+		/* 3X*/	     offsetof(BlockCache, SWRAM),
+		             offsetof(BlockCache, ARM7_WRAM),
+		/* 4X*/	     -1,
+		             offsetof(BlockCache, ARM7_WIRAM),
+		/* 5X*/	DUP2(-1),
+		/* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, 
+														DeSmuME doesn't mirror the 64 MB region at 0x6800000 */
+		/* 7X*/	DUP2(-1),
+		/* 8X*/	DUP2(-1),
+		/* 9X*/	DUP2(-1),
+		/* AX*/	DUP2(-1),
+		/* BX*/	DUP2(-1),
+		/* CX*/	DUP2(-1),
+		/* DX*/	DUP2(-1),
+		/* EX*/	DUP2(-1),
+		/* FX*/	DUP2(-1)
+		}
+};
+
+static u32 JIT_MASK[2][32] = { // [cpu][address bits 27..23]: mask ANDed with a page's base address in Init() to fold mirrors onto the backing table; 0 wherever JIT_MEM is -1
+	//arm9
+	{
+		/* 0X*/	DUP2(0x00007FFF),
+		/* 1X*/	DUP2(0x00007FFF),
+		/* 2X*/	DUP2(0x003FFFFF),
+		/* 3X*/	DUP2(0x00007FFF),
+		/* 4X*/	DUP2(0x00000000),
+		/* 5X*/	DUP2(0x00000000),
+		/* 6X*/		 0x00000000,
+					 0x000FFFFF, // NOTE(review): 1 MB mask, but ARM9_LCDC only spans 0xA4000 bytes, so higher offsets land in the tables that follow it - confirm this is intended
+		/* 7X*/	DUP2(0x00000000),
+		/* 8X*/	DUP2(0x00000000),
+		/* 9X*/	DUP2(0x00000000),
+		/* AX*/	DUP2(0x00000000),
+		/* BX*/	DUP2(0x00000000),
+		/* CX*/	DUP2(0x00000000),
+		/* DX*/	DUP2(0x00000000),
+		/* EX*/	DUP2(0x00000000),
+		/* FX*/	DUP2(0x00007FFF)
+	},
+	//arm7
+	{
+		/* 0X*/	DUP2(0x00003FFF),
+		/* 1X*/	DUP2(0x00000000),
+		/* 2X*/	DUP2(0x003FFFFF),
+		/* 3X*/	     0x00007FFF,
+		             0x0000FFFF,
+		/* 4X*/	     0x00000000,
+		             0x0000FFFF,
+		/* 5X*/	DUP2(0x00000000),
+		/* 6X*/ DUP2(0x0003FFFF),
+		/* 7X*/	DUP2(0x00000000),
+		/* 8X*/	DUP2(0x00000000),
+		/* 9X*/	DUP2(0x00000000),
+		/* AX*/	DUP2(0x00000000),
+		/* BX*/	DUP2(0x00000000),
+		/* CX*/	DUP2(0x00000000),
+		/* DX*/	DUP2(0x00000000),
+		/* EX*/	DUP2(0x00000000),
+		/* FX*/	DUP2(0x00000000)
+		}
+};
+
+#undef DUP2
+
+
+void Init() // Clears the block cache, fills the per-page lookup table from JIT_MEM/JIT_MASK and creates the code emitter.
+{
+    memset(&cache, 0, sizeof(BlockCache));
+    for (int core = 0; core < 2; core++)
+        for (int page = 0; page < 0x4000; page++)
+        {
+            const int region = page >> 9; // 512 pages of 16 KB share one table entry
+            const u32 firstSlot = ((page << 14) & JIT_MASK[core][region]) >> 1; // mirrored byte offset -> halfword slot
+            cache.AddrMapping[core][page] = JIT_MEM[core][region] == -1 ? NULL : (CompiledBlock*)((u8*)&cache + JIT_MEM[core][region]) + firstSlot;
+        }
+	compiler = new Compiler();
+}
+
+void DeInit() // Destroys the code emitter created by Init(); calling it again is a no-op.
+{
+	delete compiler; compiler = NULL; // clear the global so a repeated DeInit() cannot delete the same object twice
+}
+
+CompiledBlock CompileBlock(ARM* cpu) // Fetches and decodes a run of instructions at the CPU's current position, compiles it and registers it in the block cache.
+{
+    bool thumb = cpu->CPSR & 0x20;
+
+    FetchedInstr instrs[12]; // the loop below stops after 10 entries
+    int i = 0;
+    u32 r15 = cpu->R[15];
+    u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; // local working copy of the prefetch queue
+    //printf("block %x %d\n", r15, thumb);
+    do
+    {
+        r15 += thumb ? 2 : 4;
+
+        instrs[i].Instr = nextInstr[0];
+        //printf("%x %x\n", instrs[i].Instr, r15);
+        instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1];
+
+        if (cpu->Num == 0)
+        {
+            ARMv5* cpuv5 = (ARMv5*)cpu;
+            if (thumb && r15 & 0x2)
+            {
+                nextInstr[1] >>= 16; // take the upper halfword of the word read last time instead of fetching again
+                instrs[i].CodeCycles = 0;
+            }
+            else
+            {
+                nextInstr[1] = cpuv5->CodeRead32(r15, false);
+                instrs[i].CodeCycles = cpu->CodeCycles; // presumably updated by the CodeRead32 call above - TODO confirm
+            }
+        }
+        else
+        {
+            ARMv4* cpuv4 = (ARMv4*)cpu;
+            if (thumb)
+                nextInstr[1] = cpuv4->CodeRead16(r15);
+            else
+                nextInstr[1] = cpuv4->CodeRead32(r15);
+            instrs[i].CodeCycles = cpu->CodeCycles; // presumably updated by the CodeRead call above - TODO confirm
+        }
+        instrs[i].NextInstr[1] = nextInstr[1];
+        instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr);
+
+        i++;
+    } while(!instrs[i - 1].Info.Branches() && i < 10); // stop after the first instruction whose Info.Branches() is true, or after 10 instructions
+
+    CompiledBlock block = compiler->CompileBlock(cpu, instrs, i);
+
+    InsertBlock(cpu->Num, cpu->R[15] - (thumb ? 2 : 4), block); // key: R[15] minus one fetch step, the same expression ARMv5::Execute passes to LookUpBlock
+
+    return block;
+}
+
+}
+\ No newline at end of file
diff --git a/src/ARMJIT.h b/src/ARMJIT.h
new file mode 100644
index 0000000..d718295
--- /dev/null
+++ b/src/ARMJIT.h
@@ -0,0 +1,140 @@
+#ifndef ARMJIT_H
+#define ARMJIT_H
+
+#include "types.h"
+
+#include <string.h>
+
+#include "ARM.h"
+#include "ARM_InstrInfo.h"
+
+namespace ARMJIT
+{
+
+typedef u32 (*CompiledBlock)();
+
+class RegCache // Currently only declares the host-register allocation order; it has no other members yet.
+{
+
+static const int NativeRegAllocOrder[]; // defined by the backend (ARMJIT_x64/ARMJIT_Compiler.cpp)
+static const int NativeRegsCount; // number of entries in NativeRegAllocOrder
+
+};
+
+struct FetchedInstr
+{
+    // One instruction as gathered by ARMJIT::CompileBlock, together with the
+    // prefetch and timing values that were current when it was fetched.
+
+    // Register number stored in the 4-bit field that starts at bit `pos`.
+    u32 A_Reg(int pos) const { return (Instr >> pos) & 0xF; }
+
+    // Register number stored in the 3-bit field that starts at bit `pos`.
+    u32 T_Reg(int pos) const { return (Instr >> pos) & 0x7; }
+
+    // Uppermost four bits of the opcode word.
+    u32 Cond() const { return Instr >> 28; }
+
+    // The opcode word itself.
+    u32 Instr;
+    // Prefetch queue contents after this instruction was taken from it.
+    u32 NextInstr[2];
+    // cpu->CodeCycles sampled after the fetch done for this entry (0 when no fetch was made).
+    u8 CodeCycles;
+
+    // Result of ARMInstrInfo::Decode for Instr.
+    ARMInstrInfo::Info Info;
+};
+
+/* 
+	Copied from DeSmuME
+	Some names were changed to match the nomenclature of melonDS
+
+	Since it isn't explained anywhere and it took me some time to understand it,
+	here's a summary of how it works:
+		more or less all memory locations from which code can be executed are
+		represented by an array of function pointers, each of which is either null or
+		points to a function which executes a block of instructions starting from there.
+
+		The most significant 4 bits of each address are ignored. This 28 bit space is
+		divided into 0x4000 16 KB blocks, each of which has a pointer to the relevant
+		place inside the aforementioned arrays. Only half of the bytes need to be
+		addressed (ARM addresses are aligned to 4, Thumb addresses to a 2 byte boundary).
+
+		In case a memory write hits mapped memory, the function block at this
+		address is set to null, so it's recompiled the next time it's executed.
+
+		This method has disadvantages, namely that only writing to the
+		first instruction of a block marks it as invalid and that memory remapping
+        (SWRAM and VRAM) isn't taken into account.
+*/
+
+struct BlockCache // Lookup structure from guest code addresses to compiled blocks; one slot per halfword of each covered region.
+{
+    CompiledBlock* AddrMapping[2][0x4000] = {0}; // [cpu][16 KB page of the 28-bit address] -> first slot for that page, NULL when the page has no table (filled by Init)
+    // Backing tables: each array size is the covered byte count divided by 2.
+    CompiledBlock MainRAM[16*1024*1024/2];
+	CompiledBlock SWRAM[0x8000/2]; // Shared working RAM
+	CompiledBlock ARM9_ITCM[0x8000/2];
+	CompiledBlock ARM9_LCDC[0xA4000/2];
+	CompiledBlock ARM9_BIOS[0x8000/2];
+	CompiledBlock ARM7_BIOS[0x4000/2];
+	CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM
+	CompiledBlock ARM7_WIRAM[0x10000/2]; // Wifi
+	CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM
+};
+
+extern BlockCache cache;
+
+inline bool IsMapped(u32 num, u32 addr)
+{   // True when the 16 KB page containing addr (top 4 address bits ignored) has a block table.
+	return cache.AddrMapping[num][(addr >> 14) & 0x3FFF] != NULL;
+}
+
+inline CompiledBlock LookUpBlock(u32 num, u32 addr)
+{   // Returns the block cached for addr, or NULL when none exists or the page has no table (no NULL-page dereference).
+	return IsMapped(num, addr) ? cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] : NULL;
+}
+
+inline void Invalidate16(u32 num, u32 addr)
+{   // Forgets the block, if any, registered for the halfword containing addr.
+	CompiledBlock* const page = cache.AddrMapping[num][(addr >> 14) & 0x3FFF];
+	if (page) page[(addr & 0x3FFF) >> 1] = NULL;
+}
+
+inline void Invalidate32(u32 num, u32 addr)
+{
+	// Forgets the blocks registered for addr and for addr + 2 (slot index wraps inside the page).
+	CompiledBlock* const page = cache.AddrMapping[num][(addr >> 14) & 0x3FFF];
+	if (page == NULL)
+		return; // this page has no block table
+	for (u32 off = 0; off <= 2; off += 2)
+		page[((addr + off) & 0x3FFF) >> 1] = NULL;
+}
+
+inline void InsertBlock(u32 num, u32 addr, CompiledBlock func)
+{   // Registers func as the block starting at addr; does nothing when the page has no table (avoids writing through NULL).
+	if (IsMapped(num, addr)) cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func;
+}
+
+inline void ResetBlocks()
+{
+	// Clears every backing table so no compiled block stays registered; AddrMapping is left as it is.
+	const struct { void* base; size_t bytes; } tables[] = {
+		{cache.MainRAM, sizeof(cache.MainRAM)}, {cache.SWRAM, sizeof(cache.SWRAM)},
+		{cache.ARM9_BIOS, sizeof(cache.ARM9_BIOS)}, {cache.ARM9_ITCM, sizeof(cache.ARM9_ITCM)},
+		{cache.ARM9_LCDC, sizeof(cache.ARM9_LCDC)}, {cache.ARM7_BIOS, sizeof(cache.ARM7_BIOS)},
+		{cache.ARM7_WIRAM, sizeof(cache.ARM7_WIRAM)}, {cache.ARM7_WRAM, sizeof(cache.ARM7_WRAM)},
+		{cache.ARM7_WVRAM, sizeof(cache.ARM7_WVRAM)} };
+	for (const auto& table : tables)
+		memset(table.base, 0, table.bytes);
+}
+
+void Init();
+void DeInit();
+
+CompiledBlock CompileBlock(ARM* cpu);
+
+}
+
+#endif
+\ No newline at end of file
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp
new file mode 100644
index 0000000..fb2fda8
--- /dev/null
+++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp
@@ -0,0 +1,332 @@
+#include "ARMJIT_Compiler.h"
+
+#include "../ARMInterpreter.h"
+
+#include <assert.h>
+
+using namespace Gen;
+
+namespace ARMJIT
+{
+
+const int RegCache::NativeRegAllocOrder[] = {(int)RBX, (int)RSI, (int)RDI, (int)R12, (int)R13}; // host registers listed for allocation; no code visible here reads the list yet
+const int RegCache::NativeRegsCount = 5; // must match the number of entries above
+
+Compiler::Compiler() // Requests the buffer that all generated blocks are written into.
+{
+    AllocCodeSpace(1024 * 1024 * 4); // 4 MB; presumably a byte count for the emitter's code buffer - TODO confirm against x64Emitter
+}
+
+typedef void (Compiler::*CompileFunc)();
+typedef void (*InterpretFunc)(ARM*);
+
+void Compiler::LoadCPSR() // Emits a 32-bit load of cpu->CPSR into RCPSR.
+{
+    assert(!CPSRDirty); // an unsaved value in RCPSR would be overwritten by the load below
+
+    MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR)));
+}
+
+void Compiler::SaveCPSR()
+{
+    if (!CPSRDirty)
+        return; // RCPSR was not marked as changed, so nothing is emitted
+    // Emit the store of RCPSR back to cpu->CPSR and mark the cached copy as clean.
+    MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR));
+    CPSRDirty = false;
+}
+
+CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) // Emits one host function that runs instrs[0..instrsCount) for `cpu` and returns a cycle count.
+{
+    if (IsAlmostFull()) // presumably: the code buffer is nearly exhausted - forget all cached blocks and start writing from the beginning again
+    {
+        ResetBlocks();
+        ResetCodePtr();
+    }
+
+    CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); // entry point of the block being emitted
+
+    ConstantCycles = 0;
+    Thumb = cpu->CPSR & 0x20;
+    Num = cpu->Num;
+    R15 = cpu->R[15];
+
+    ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0);
+
+    MOV(64, R(RCPU), ImmPtr(cpu)); // the cpu pointer is baked into the generated code
+    XOR(32, R(RCycles), R(RCycles)); // run-time cycle counter starts at 0
+
+    LoadCPSR();
+
+    for (int i = 0; i < instrsCount; i++)
+    {
+        R15 += Thumb ? 2 : 4;
+        CurrentInstr = instrs[i];
+
+        CompileFunc comp = NULL; // no native implementations are selected yet, so every instruction takes the comp == NULL paths
+
+        if (comp == NULL || i == instrsCount - 1)
+        {
+            // store the per-instruction state into the cpu object before calling out
+            MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15));
+            MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurrentInstr.CodeCycles));
+            MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurrentInstr.Instr));
+            if (i == instrsCount - 1) // the prefetch queue is only written back for the last instruction of the block
+            {
+                MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurrentInstr.NextInstr[0]));
+                MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurrentInstr.NextInstr[1]));
+            }
+
+            SaveCPSR();
+        }
+
+        if (Thumb)
+        {
+            if (comp == NULL)
+            {
+                MOV(64, R(ABI_PARAM1), R(RCPU));
+
+                u32 icode = (CurrentInstr.Instr >> 6) & 0x3FF; // opcode bits 6..15 index the Thumb handler table
+                ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]);
+            }
+            else
+            {
+            }
+        }
+        else
+        {
+            u32 cond = CurrentInstr.Cond();
+            if (CurrentInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) // handled before the condition check and always executed
+            {
+                MOV(64, R(ABI_PARAM1), R(RCPU));
+                ABI_CallFunction(ARMInterpreter::A_BLX_IMM);
+            }
+            else if (cond == 0xF) // any other 0xF-condition opcode is not executed; only its fetch cycles are counted
+                AddCycles_C();
+            else
+            {
+                FixupBranch skipExecute;
+                if (cond < 0xE) // 0xE runs unconditionally, so no check is emitted for it
+                {
+                    if (cond >= 0x8)
+                    {
+                        // test bit (CPSR >> 28) of ARM::ConditionTable[cond]; a clear bit skips the instruction
+                        static_assert(RSCRATCH3 == ECX);
+                        MOV(32, R(RSCRATCH3), R(RCPSR));
+                        SHR(32, R(RSCRATCH3), Imm8(28));
+                        MOV(32, R(RSCRATCH), Imm32(1));
+                        SHL(32, R(RSCRATCH), R(RSCRATCH3));
+                        TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond]));
+                    
+                        skipExecute = J_CC(CC_Z);
+                    }
+                    else
+                    {
+                        // could have used a LUT, but then where would be the fun?
+                        BT(32, R(RCPSR), Imm8(28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)))); // picks CPSR bit 30, 29, 31 or 28 for cond>>1 == 0, 1, 2, 3
+                        
+                        skipExecute = J_CC(cond & 1 ? CC_C : CC_NC); // odd conditions skip when the tested bit is set, even ones when it is clear
+                    }
+                    
+                }
+
+                if (comp == NULL)
+                {
+                    MOV(64, R(ABI_PARAM1), R(RCPU));
+
+                    u32 icode = ((CurrentInstr.Instr >> 4) & 0xF) | ((CurrentInstr.Instr >> 16) & 0xFF0); // opcode bits 4..7 and 20..27 index the ARM handler table
+                    ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]);
+                }
+                else
+                {
+                }
+
+                FixupBranch skipFailed;
+                if (CurrentInstr.Cond() < 0xE)
+                {
+                    skipFailed = J(); // executed path jumps over the failed-condition code
+                    SetJumpTarget(skipExecute);
+
+                    AddCycles_C(); // failed condition: only the fetch cycles are added
+
+                    SetJumpTarget(skipFailed);
+                }
+            }
+        }
+
+        /*
+            we don't need to collect the interpreted cycles,
+            since all functions only add to it, the dispatcher
+            can take care of it.
+        */
+
+        if (comp == NULL && i != instrsCount - 1)
+            LoadCPSR(); // reload RCPSR after the call-out, except after the last instruction
+    }
+
+    SaveCPSR();
+
+    LEA(32, RAX, MDisp(RCycles, ConstantCycles)); // return value = run-time counter + cycles summed at compile time
+
+    ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0);
+    RET();
+
+    return res;
+}
+
+void Compiler::Compile(RegCache& regs, const FetchedInstr& instr) // Placeholder: only declares two all-NULL tables; neither parameter nor table is used yet.
+{
+    const CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // one slot per ARM instruction kind - presumably indexed by Info.Kind once filled in
+    {
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+    };
+
+    const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { // same idea for the Thumb instruction kinds
+        NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL,
+        NULL, NULL
+    };
+}
+
+void Compiler::AddCycles_C()
+{   // Adds CurrentInstr's code-fetch cost: to RCycles at run time when Cond() < 0xE, otherwise to the compile-time ConstantCycles sum.
+    s32 cycles = (R15 & 0x2) ? 0 : CurrentInstr.CodeCycles; // value used when Num == 0
+    if (Num != 0)
+        cycles = NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 1 : 3];
+
+    if (CurrentInstr.Cond() >= 0xE)
+        ConstantCycles += cycles;
+    else
+        ADD(32, R(RCycles), Imm8(cycles));
+}
+
+// Emits code for the operand "rm shifted by #amount" and returns where its value is (rm itself, RSCRATCH or an immediate 0).
+// op: 0 = LSL, 1 = LSR, 2 = ASR, 3 = ROR; amount 0 is emitted as LSR #32, ASR #32 and rotate-through-carry by 1 respectively.
+// May use RSCRATCH for op2 and, when S is set, RSCRATCH2 for the carry value. carryUsed is cleared when the operand is rm unchanged.
+OpArg Compiler::Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed)
+{
+    carryUsed = true;
+
+    switch (op)
+    {
+        case 0: // LSL
+            if (amount > 0)
+            {
+                MOV(32, R(RSCRATCH), R(rm));
+                SHL(32, R(RSCRATCH), Imm8(amount));
+                if (S)
+                    SETcc(CC_C, R(RSCRATCH2)); // NOTE(review): SETcc writes only the low byte - confirm consumers ignore the rest
+                return R(RSCRATCH);
+            }
+            carryUsed = false; // LSL #0: operand is rm as is, no carry is produced
+            return R(rm);
+        case 1: // LSR
+            if (amount > 0)
+            {
+                MOV(32, R(RSCRATCH), R(rm));
+                SHR(32, R(RSCRATCH), Imm8(amount));
+                if (S)
+                    SETcc(CC_C, R(RSCRATCH2));
+                return R(RSCRATCH);
+            }
+            if (S) // amount 0: the result is 0 and the carry value is bit 31 of rm
+            {
+                MOV(32, R(RSCRATCH2), R(rm));
+                SHR(32, R(RSCRATCH2), Imm8(31));
+            }
+            return Imm32(0);
+        case 2: // ASR
+            MOV(32, R(RSCRATCH), R(rm));
+            SAR(32, R(RSCRATCH), Imm8(amount ? amount : 31));
+            if (S)
+            {
+                if (amount == 0)
+                {
+                    MOV(32, R(RSCRATCH2), R(rm));
+                    SHR(32, R(RSCRATCH2), Imm8(31));
+                }
+                else
+                    SETcc(CC_C, R(RSCRATCH2));
+            }
+            return R(RSCRATCH);
+        case 3: // ROR
+            if (amount > 0)
+            {
+                MOV(32, R(RSCRATCH), R(rm));
+                ROR_(32, R(RSCRATCH), Imm8(amount));
+            }
+            else
+            {
+                BT(32, R(RCPSR), Imm8(29)); // load the host carry flag from CPSR bit 29 for the RCR below
+                MOV(32, R(RSCRATCH), R(rm));
+                RCR(32, R(RSCRATCH), Imm8(1));
+            }
+            if (S)
+                SETcc(CC_C, R(RSCRATCH2));
+            return R(RSCRATCH);
+    }
+
+    // Callers are expected to pass op in 0..3; never fall off the end of a non-void function.
+    assert(!"Comp_ShiftRegImm: invalid shift op");
+    carryUsed = false;
+    return R(rm);
+}
+
+void Compiler::A_Comp_ALU(const FetchedInstr& instr) // Stub: emits nothing yet.
+{
+}
+
+}
+\ No newline at end of file
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h
new file mode 100644
index 0000000..8e1d100
--- /dev/null
+++ b/src/ARMJIT_x64/ARMJIT_Compiler.h
@@ -0,0 +1,54 @@
+#ifndef ARMJIT_COMPILER_H
+#define ARMJIT_COMPILER_H
+
+#include "../dolphin/x64Emitter.h"
+
+#include "../ARMJIT.h"
+
+
+namespace ARMJIT
+{
+
+const Gen::X64Reg RCPU = Gen::RBP; // holds the ARM* of the CPU the block was compiled for
+const Gen::X64Reg RCycles = Gen::R14; // run-time cycle counter, zeroed at block entry
+const Gen::X64Reg RCPSR = Gen::R15; // copy of cpu->CPSR maintained by LoadCPSR/SaveCPSR
+
+const Gen::X64Reg RSCRATCH = Gen::EAX; // scratch registers used by the emitted code
+const Gen::X64Reg RSCRATCH2 = Gen::EDX;
+const Gen::X64Reg RSCRATCH3 = Gen::ECX; // CompileBlock static_asserts this is ECX before using it as a shift count
+
+class Compiler : public Gen::X64CodeBlock // x64 backend: turns fetched ARM/Thumb instructions into host code
+{
+public:
+    Compiler();
+
+    CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); // emits one host function for the given instructions and returns its entry point
+
+    void StartBlock(ARM* cpu); // NOTE(review): no definition is visible for StartBlock/FinaliseBlock - confirm they exist or are unused
+    CompiledBlock FinaliseBlock();
+
+    void Compile(RegCache& regs, const FetchedInstr& instr);
+private:
+    void AddCycles_C(); // accounts for the fetch cycles of CurrentInstr
+
+    Gen::OpArg Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed);
+
+    void A_Comp_ALU(const FetchedInstr& instr);
+
+    void LoadCPSR(); // emit: RCPSR = cpu->CPSR
+    void SaveCPSR(); // emit: cpu->CPSR = RCPSR, only when CPSRDirty is set
+
+    bool CPSRDirty = false; // true while RCPSR holds a value not yet stored back
+
+    FetchedInstr CurrentInstr; // instruction currently being compiled
+
+    bool Thumb; // the following three are captured from the cpu at the start of CompileBlock
+    u32 Num;
+    u32 R15; // advanced by 2 or 4 per compiled instruction
+
+    u32 ConstantCycles; // cycles known at compile time, added to the block's return value
+};
+
+}
+
+#endif
+\ No newline at end of file
diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp
new file mode 100644
index 0000000..41c46e1
--- /dev/null
+++ b/src/ARM_InstrInfo.cpp
@@ -0,0 +1,376 @@
+#include "ARM_InstrInfo.h"
+
+#include <stdio.h>
+
+namespace ARMInstrInfo
+{
+
+#define ak(x) ((x) << 13)
+
+enum {
+    A_Read0             = 1 << 0,
+    A_Read16            = 1 << 1,
+    A_Read8             = 1 << 2,
+    A_Read12            = 1 << 3,
+
+    A_Write12           = 1 << 4,
+    A_Write16           = 1 << 5,
+    A_MemWriteback      = 1 << 6,
+
+    A_BranchAlways      = 1 << 7,
+
+    // for STRD/LDRD
+    A_Read12Double      = 1 << 8,
+    A_Write12Double     = 1 << 9,
+
+    A_Link              = 1 << 10,
+
+    A_LDMSTM            = 1 << 11,
+
+    A_ARM9Only          = 1 << 12,
+};
+
+#define A_BIOP A_Read16
+#define A_MONOOP 0
+
+#define A_IMPLEMENT_ALU_OP(x,k) \
+    const u32 A_##x##_IMM = A_Write12 | A_##k | ak(ak_##x##_IMM); \
+    const u32 A_##x##_REG_LSL_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \
+    const u32 A_##x##_REG_LSR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \
+    const u32 A_##x##_REG_ASR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \
+    const u32 A_##x##_REG_ROR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \
+    const u32 A_##x##_REG_LSL_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \
+    const u32 A_##x##_REG_LSR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \
+    const u32 A_##x##_REG_ASR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \
+    const u32 A_##x##_REG_ROR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \
+    \
+    const u32 A_##x##_IMM_S = A_Write12 | A_##k | ak(ak_##x##_IMM_S); \
+    const u32 A_##x##_REG_LSL_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \
+    const u32 A_##x##_REG_LSR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \
+    const u32 A_##x##_REG_ASR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \
+    const u32 A_##x##_REG_ROR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \
+    const u32 A_##x##_REG_LSL_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \
+    const u32 A_##x##_REG_LSR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \
+    const u32 A_##x##_REG_ASR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \
+    const u32 A_##x##_REG_ROR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S);
+
+A_IMPLEMENT_ALU_OP(AND,BIOP)
+A_IMPLEMENT_ALU_OP(EOR,BIOP)
+A_IMPLEMENT_ALU_OP(SUB,BIOP)
+A_IMPLEMENT_ALU_OP(RSB,BIOP)
+A_IMPLEMENT_ALU_OP(ADD,BIOP)
+A_IMPLEMENT_ALU_OP(ADC,BIOP)
+A_IMPLEMENT_ALU_OP(SBC,BIOP)
+A_IMPLEMENT_ALU_OP(RSC,BIOP)
+A_IMPLEMENT_ALU_OP(ORR,BIOP)
+A_IMPLEMENT_ALU_OP(MOV,MONOOP)
+A_IMPLEMENT_ALU_OP(BIC,BIOP)
+A_IMPLEMENT_ALU_OP(MVN,MONOOP)
+
+const u32 A_MOV_REG_LSL_IMM_DBG = A_MOV_REG_LSL_IMM;
+
+#define A_IMPLEMENT_ALU_TEST(x) /* compare/test ops: read Rn (bits 16-19) plus the operand registers, write no register */ \
+    const u32 A_##x##_IMM = A_Read16 | ak(ak_##x##_IMM); /* immediate operand: no register in bits 0-3, as in A_IMPLEMENT_ALU_OP */ \
+    const u32 A_##x##_REG_LSL_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \
+    const u32 A_##x##_REG_LSR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \
+    const u32 A_##x##_REG_ASR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \
+    const u32 A_##x##_REG_ROR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \
+    const u32 A_##x##_REG_LSL_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \
+    const u32 A_##x##_REG_LSR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \
+    const u32 A_##x##_REG_ASR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \
+    const u32 A_##x##_REG_ROR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG);
+
+A_IMPLEMENT_ALU_TEST(TST)
+A_IMPLEMENT_ALU_TEST(TEQ)
+A_IMPLEMENT_ALU_TEST(CMP)
+A_IMPLEMENT_ALU_TEST(CMN)
+
+const u32 A_MUL = A_Write16 | A_Read0 | A_Read8 | ak(ak_MUL); // multiply family: each descriptor carries the kind matching its own name
+const u32 A_MLA = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_MLA);
+const u32 A_UMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_UMULL);
+const u32 A_UMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL);
+const u32 A_SMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL);
+const u32 A_SMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL);
+const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAxy); // 32-bit accumulate form; must not share the kind of SMLALxy
+const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy);
+const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy);
+const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy);
+const u32 A_SMULxy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULxy);
+
+const u32 A_CLZ = A_Write12 | A_Read0 | A_ARM9Only | ak(ak_CLZ);
+
+const u32 A_QADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QADD);
+const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QSUB);
+const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDADD);
+const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDSUB);
+
+#define A_LDR A_Write12
+#define A_STR A_Read12
+
+#define A_IMPLEMENT_WB_LDRSTR(x,k) \
+    const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \
+    const u32 A_##x##_REG_LSL = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSL); \
+    const u32 A_##x##_REG_LSR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSR); \
+    const u32 A_##x##_REG_ASR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ASR); \
+    const u32 A_##x##_REG_ROR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ROR); \
+    \
+    const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \
+    const u32 A_##x##_POST_REG_LSL = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSL); \
+    const u32 A_##x##_POST_REG_LSR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSR); \
+    const u32 A_##x##_POST_REG_ASR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ASR); \
+    const u32 A_##x##_POST_REG_ROR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ROR);
+
+A_IMPLEMENT_WB_LDRSTR(STR,STR)
+A_IMPLEMENT_WB_LDRSTR(STRB,STR)
+A_IMPLEMENT_WB_LDRSTR(LDR,LDR)
+A_IMPLEMENT_WB_LDRSTR(LDRB,LDR)
+
+#define A_LDRD A_Write12Double
+#define A_STRD A_Read12Double
+
+#define A_IMPLEMENT_HD_LDRSTR(x,k) \
+    const u32 A_##x##_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_IMM); \
+    const u32 A_##x##_REG = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_REG); \
+    const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \
+    const u32 A_##x##_POST_REG = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG);
+
+A_IMPLEMENT_HD_LDRSTR(STRH,STR)
+A_IMPLEMENT_HD_LDRSTR(LDRD,LDRD)
+A_IMPLEMENT_HD_LDRSTR(STRD,STRD)
+A_IMPLEMENT_HD_LDRSTR(LDRH,LDR)
+A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR)
+A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR)
+
+const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWP);
+const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWPB);
+
+const u32 A_LDM = A_Read16 | A_LDMSTM | ak(ak_LDM);
+const u32 A_STM = A_Read16 | A_LDMSTM | ak(ak_STM);
+
+const u32 A_B = A_BranchAlways | ak(ak_B);
+const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL);
+const u32 A_BLX_IMM = A_BranchAlways | A_Link | ak(ak_BLX_IMM);
+const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX);
+const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG);
+
+const u32 A_UNK = A_BranchAlways | A_Link | ak(ak_UNK);
+const u32 A_MSR_IMM = A_ARM9Only | ak(ak_MSR_IMM);
+const u32 A_MSR_REG = A_Read0 | A_ARM9Only | ak(ak_MSR_REG);
+const u32 A_MRS = A_Write12 | A_ARM9Only | ak(ak_MRS);
+const u32 A_MCR = A_Read12 | A_ARM9Only | ak(ak_MCR);
+const u32 A_MRC = A_Write12 | A_ARM9Only | ak(ak_MRC);
+const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC);
+
+// THUMB
+
+#define tk(x) ((x) << 16)
+
+enum {
+    T_Read0         = 1 << 0,
+    T_Read3         = 1 << 1,
+    T_Read6         = 1 << 2,
+    T_Read8         = 1 << 3,
+
+    T_Write0        = 1 << 4,
+    T_Write8        = 1 << 5,
+
+    T_ReadHi0       = 1 << 6,
+    T_ReadHi3       = 1 << 7,
+    T_WriteHi0      = 1 << 8,
+
+    T_ReadR13       = 1 << 9,
+    T_WriteR13      = 1 << 10,
+    T_ReadR15       = 1 << 11,
+
+    T_BranchAlways  = 1 << 12,
+    T_ReadR14       = 1 << 13,
+    T_WriteR14      = 1 << 14,
+
+    T_PopPC         = 1 << 15
+};
+
+const u32 T_LSL_IMM = T_Write0 | T_Read3 | tk(tk_LSL_IMM);
+const u32 T_LSR_IMM = T_Write0 | T_Read3 | tk(tk_LSR_IMM);
+const u32 T_ASR_IMM = T_Write0 | T_Read3 | tk(tk_ASR_IMM);
+
+const u32 T_ADD_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_);
+const u32 T_SUB_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_);
+const u32 T_ADD_IMM_ = T_Write0 | T_Read3 | tk(tk_ADD_IMM_);
+const u32 T_SUB_IMM_ = T_Write0 | T_Read3 | tk(tk_SUB_IMM_);
+
+const u32 T_MOV_IMM = T_Write8 | tk(tk_MOV_IMM); // imm8 forms: the register field starts at bit 8
+const u32 T_CMP_IMM = T_Read8 | tk(tk_CMP_IMM); // CMP only reads the register and sets flags; it writes no register
+const u32 T_ADD_IMM = T_Write8 | T_Read8 | tk(tk_ADD_IMM);
+const u32 T_SUB_IMM = T_Write8 | T_Read8 | tk(tk_SUB_IMM);
+
+const u32 T_AND_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_AND_REG);
+const u32 T_EOR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_EOR_REG);
+const u32 T_LSL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSL_REG);
+const u32 T_LSR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSR_REG);
+const u32 T_ASR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ASR_REG);
+const u32 T_ADC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ADC_REG);
+const u32 T_SBC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_SBC_REG);
+const u32 T_ROR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ROR_REG);
+const u32 T_TST_REG = T_Read0 | T_Read3 | tk(tk_TST_REG);
+const u32 T_NEG_REG = T_Write0 | T_Read3 | tk(tk_NEG_REG);
+const u32 T_CMP_REG = T_Read0 | T_Read3 | tk(tk_CMP_REG);
+const u32 T_CMN_REG = T_Read0 | T_Read3 | tk(tk_CMN_REG);
+const u32 T_ORR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ORR_REG);
+const u32 T_MUL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_MUL_REG);
+const u32 T_BIC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_BIC_REG);
+const u32 T_MVN_REG = T_Write0 | T_Read3 | tk(tk_MVN_REG);
+
+const u32 T_ADD_HIREG = T_WriteHi0 | T_ReadHi0 | T_ReadHi3 | tk(tk_ADD_HIREG);
+const u32 T_CMP_HIREG = T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG);
+const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG);
+
+const u32 T_ADD_PCREL = T_Write8 | T_ReadR15 | tk(tk_ADD_PCREL);
+const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL);
+const u32 T_ADD_SP = T_WriteR13 | tk(tk_ADD_SP);
+
+const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL);
+
+const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STR_REG);
+const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRB_REG);
+const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG);
+const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRB_REG);
+const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRH_REG);
+const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG);
+const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG);
+const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG);
+
+const u32 T_STR_IMM = T_Read0 | T_Read3 | tk(tk_STR_IMM);
+const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM);
+const u32 T_STRB_IMM = T_Read0 | T_Read3 | tk(tk_STRB_IMM);
+const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM);
+const u32 T_STRH_IMM = T_Read0 | T_Read3 | tk(tk_STRH_IMM);
+const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM);
+
+const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL);
+const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL);
+
+const u32 T_PUSH = T_ReadR15 | T_ReadR13 | T_WriteR13 | tk(tk_PUSH);
+const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP);
+
+const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA);
+const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA);
+
+const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND);
+const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX);
+const u32 T_BLX_REG = T_BranchAlways | T_ReadR15 | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG);
+const u32 T_B = T_BranchAlways | tk(tk_B);
+const u32 T_BL_LONG_1 = T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_1);
+const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_2);
+
+const u32 T_UNK = T_BranchAlways | T_WriteR14 | tk(tk_UNK);
+const u32 T_SVC = T_BranchAlways | T_WriteR14 | T_ReadR15 | tk(tk_SVC);
+
+#define INSTRFUNC_PROTO(x) u32 x // each table entry is emitted as a plain u32 descriptor value
+#include "ARM_InstrTable.h" // presumably defines ARMInstrTable / THUMBInstrTable indexed by Decode() -- confirm
+#undef INSTRFUNC_PROTO
+// Decodes instr into source/destination register masks plus an instruction kind.
+// thumb selects the THUMB table; num is the CPU index (0 = ARM9, see A_ARM9Only).
+Info Decode(bool thumb, u32 num, u32 instr)
+{
+    Info res = {0};
+    if (thumb)
+    {
+        u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF];
+
+        if (data & T_Read0)
+            res.SrcRegs |= 1 << (instr & 0x7);
+        if (data & T_Read3)
+            res.SrcRegs |= 1 << ((instr >> 3) & 0x7);
+        if (data & T_Read6)
+            res.SrcRegs |= 1 << ((instr >> 6) & 0x7);
+        if (data & T_Read8)
+            res.SrcRegs |= 1 << ((instr >> 8) & 0x7);
+
+        if (data & T_Write0)
+            res.DstRegs |= 1 << (instr & 0x7);
+        if (data & T_Write8)
+            res.DstRegs |= 1 << ((instr >> 8) & 0x7);
+
+        if (data & T_ReadHi0) // instr bit 7 supplies bit 3 of the register number
+            res.SrcRegs |= 1 << ((instr & 0x7) | ((instr >> 4) & 0x8));
+        if (data & T_ReadHi3)
+            res.SrcRegs |= 1 << ((instr >> 3) & 0xF);
+        if (data & T_WriteHi0)
+            res.DstRegs |= 1 << ((instr & 0x7) | ((instr >> 4) & 0x8));
+
+        if (data & T_ReadR13)
+            res.SrcRegs |= (1 << 13);
+        if (data & T_WriteR13)
+            res.DstRegs |= (1 << 13);
+        if (data & T_ReadR14) // the R14 flags are set by the BL/BLX/UNK/SVC descriptors
+            res.SrcRegs |= (1 << 14);
+        if (data & T_WriteR14)
+            res.DstRegs |= (1 << 14);
+        if (data & T_ReadR15)
+            res.SrcRegs |= (1 << 15);
+
+        if (data & T_BranchAlways)
+            res.DstRegs |= (1 << 15);
+        if (data & T_PopPC && instr & (1 << 8)) // R15 is only written when instr bit 8 is set
+            res.DstRegs |= 1 << 15;
+
+        res.Kind = (data >> 16) & 0x3F;
+        return res;
+    }
+    else
+    {
+        u32 data = ARMInstrTable[((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0)];
+        if ((instr & 0xFE000000) == 0xFA000000)
+            data = A_BLX_IMM;
+        if (data & A_ARM9Only && num != 0) // on the other CPU, describe it as a linking branch
+            data |= A_BranchAlways | A_Link;
+
+        if (data & A_Read0)
+            res.SrcRegs |= 1 << (instr & 0xF);
+        if (data & A_Read16)
+            res.SrcRegs |= 1 << ((instr >> 16) & 0xF);
+        if (data & A_Read8)
+            res.SrcRegs |= 1 << ((instr >> 8) & 0xF);
+        if (data & A_Read12)
+            res.SrcRegs |= 1 << ((instr >> 12) & 0xF);
+
+        if (data & A_Write12)
+            res.DstRegs |= 1 << ((instr >> 12) & 0xF);
+        if (data & A_Write16)
+            res.DstRegs |= 1 << ((instr >> 16) & 0xF);
+        if (data & A_MemWriteback && instr & (1 << 21)) // base register (bits 16-19) is updated
+            res.DstRegs |= 1 << ((instr >> 16) & 0xF);
+
+        if (data & A_BranchAlways)
+            res.DstRegs |= 1 << 15;
+        if (data & A_Read12Double) // register in bits 12-15 and the one following it
+        {
+            res.SrcRegs |= 1 << ((instr >> 12) & 0xF);
+            res.SrcRegs |= 1 << (((instr >> 12) & 0xF) + 1);
+        }
+        if (data & A_Write12Double)
+        {
+            res.DstRegs |= 1 << ((instr >> 12) & 0xF);
+            res.DstRegs |= 1 << (((instr >> 12) & 0xF) + 1);
+        }
+
+        if (data & A_Link)
+        {
+            res.DstRegs |= 1 << 14;
+            res.SrcRegs |= 1 << 15;
+        }
+
+        if (data & A_LDMSTM)
+        {
+            // only bit 15 (PC) of the register list is recorded, and only when instr bit 20 is set
+            res.DstRegs |= instr & (!!(instr & (1 << 20)) << 15);
+            if (instr & (1 << 21))
+                res.DstRegs |= 1 << ((instr >> 16) & 0xF);
+        }
+
+        res.Kind = (data >> 13) & 0x1FF;
+        return res;
+    }
+}
+
+}
diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h
new file mode 100644
index 0000000..e717664
--- /dev/null
+++ b/src/ARM_InstrInfo.h
@@ -0,0 +1,232 @@
+#ifndef ARMINSTRINFO_H
+#define ARMINSTRINFO_H
+
+#include "types.h"
+
+namespace ARMInstrInfo
+{
+
+// Instruction kinds, for faster dispatch
+
+#define ak_ALU(n) /* 18 enumerators per ALU op: 9 operand forms, then the same 9 with _S */ \
+    ak_##n##_REG_LSL_IMM, \
+    ak_##n##_REG_LSR_IMM, \
+    ak_##n##_REG_ASR_IMM, \
+    ak_##n##_REG_ROR_IMM, \
+    \
+    ak_##n##_REG_LSL_REG, \
+    ak_##n##_REG_LSR_REG, \
+    ak_##n##_REG_ASR_REG, \
+    ak_##n##_REG_ROR_REG, \
+    \
+    ak_##n##_IMM, \
+    \
+    ak_##n##_REG_LSL_IMM_S, \
+    ak_##n##_REG_LSR_IMM_S, \
+    ak_##n##_REG_ASR_IMM_S, \
+    ak_##n##_REG_ROR_IMM_S, \
+    \
+    ak_##n##_REG_LSL_REG_S, \
+    ak_##n##_REG_LSR_REG_S, \
+    ak_##n##_REG_ASR_REG_S, \
+    ak_##n##_REG_ROR_REG_S, \
+    \
+    ak_##n##_IMM_S
+
+#define ak_Test(n) /* 9 enumerators: the operand forms of ak_ALU without the _S copies */ \
+    ak_##n##_REG_LSL_IMM, \
+    ak_##n##_REG_LSR_IMM, \
+    ak_##n##_REG_ASR_IMM, \
+    ak_##n##_REG_ROR_IMM, \
+    \
+    ak_##n##_REG_LSL_REG, \
+    ak_##n##_REG_LSR_REG, \
+    ak_##n##_REG_ASR_REG, \
+    ak_##n##_REG_ROR_REG, \
+    \
+    ak_##n##_IMM
+
+#define ak_WB_LDRSTR(n) /* 10 enumerators: 4 shifted-register forms + IMM, then their POST_ copies */ \
+    ak_##n##_REG_LSL, \
+    ak_##n##_REG_LSR, \
+    ak_##n##_REG_ASR, \
+    ak_##n##_REG_ROR, \
+    \
+    ak_##n##_IMM, \
+    \
+    ak_##n##_POST_REG_LSL, \
+    ak_##n##_POST_REG_LSR, \
+    ak_##n##_POST_REG_ASR, \
+    ak_##n##_POST_REG_ROR, \
+    \
+    ak_##n##_POST_IMM
+
+#define ak_HD_LDRSTR(n) /* 4 enumerators: REG and IMM forms, then their POST_ copies */ \
+    ak_##n##_REG, \
+    ak_##n##_IMM, \
+    \
+    ak_##n##_POST_REG, \
+    ak_##n##_POST_IMM
+
+enum // ARM kinds (ak_*) count up from 0; THUMB kinds (tk_*) restart at 0 further down
+{
+    ak_ALU(AND),
+    ak_ALU(EOR),
+    ak_ALU(SUB),
+    ak_ALU(RSB),
+    ak_ALU(ADD),
+    ak_ALU(ADC),
+    ak_ALU(SBC),
+    ak_ALU(RSC),
+    ak_ALU(ORR),
+    ak_ALU(MOV),
+    ak_ALU(BIC),
+    ak_ALU(MVN),
+
+    ak_ALU(TST), // NOTE(review): the four test ops use ak_ALU, so _S kinds are generated for them too;
+    ak_ALU(TEQ), // ak_Test (defined above) would not generate them -- confirm which is intended
+    ak_ALU(CMP),
+    ak_ALU(CMN),
+
+    ak_MUL,
+    ak_MLA,
+    ak_UMULL,
+    ak_UMLAL,
+    ak_SMULL,
+    ak_SMLAL,
+    ak_SMLAxy,
+    ak_SMLAWy,
+    ak_SMULWy,
+    ak_SMLALxy,
+    ak_SMULxy,
+
+    ak_CLZ,
+
+    ak_QADD,
+    ak_QSUB,
+    ak_QDADD,
+    ak_QDSUB,
+
+    ak_WB_LDRSTR(STR),
+    ak_WB_LDRSTR(STRB),
+    ak_WB_LDRSTR(LDR),
+    ak_WB_LDRSTR(LDRB),
+
+    ak_HD_LDRSTR(STRH),
+    ak_HD_LDRSTR(LDRD),
+    ak_HD_LDRSTR(STRD),
+    ak_HD_LDRSTR(LDRH),
+    ak_HD_LDRSTR(LDRSB),
+    ak_HD_LDRSTR(LDRSH),
+
+    ak_SWP,
+    ak_SWPB,
+
+    ak_LDM,
+    ak_STM,
+
+    ak_B,
+    ak_BL,
+    ak_BLX_IMM,
+    ak_BX,
+    ak_BLX_REG,
+
+    ak_UNK,
+    ak_MSR_IMM,
+    ak_MSR_REG,
+    ak_MRS,
+    ak_MCR,
+    ak_MRC,
+    ak_SVC,
+
+    ak_Count, // number of ARM kinds; Decode() in ARM_InstrInfo.cpp extracts the kind with a 9-bit mask (0x1FF)
+
+    tk_LSL_IMM = 0, // THUMB numbering restarts here, so ak_* and tk_* values overlap
+    tk_LSR_IMM,
+    tk_ASR_IMM,
+
+    tk_ADD_REG_,
+    tk_SUB_REG_,
+    tk_ADD_IMM_,
+    tk_SUB_IMM_,
+
+    tk_MOV_IMM,
+    tk_CMP_IMM,
+    tk_ADD_IMM,
+    tk_SUB_IMM,
+
+    tk_AND_REG,
+    tk_EOR_REG,
+    tk_LSL_REG,
+    tk_LSR_REG,
+    tk_ASR_REG,
+    tk_ADC_REG,
+    tk_SBC_REG,
+    tk_ROR_REG,
+    tk_TST_REG,
+    tk_NEG_REG,
+    tk_CMP_REG,
+    tk_CMN_REG,
+    tk_ORR_REG,
+    tk_MUL_REG,
+    tk_BIC_REG,
+    tk_MVN_REG,
+
+    tk_ADD_HIREG,
+    tk_CMP_HIREG,
+    tk_MOV_HIREG,
+
+    tk_ADD_PCREL,
+    tk_ADD_SPREL,
+    tk_ADD_SP,
+
+    tk_LDR_PCREL,
+    tk_STR_REG,
+    tk_STRB_REG,
+    tk_LDR_REG,
+    tk_LDRB_REG,
+    tk_STRH_REG,
+    tk_LDRSB_REG,
+    tk_LDRH_REG,
+    tk_LDRSH_REG,
+    tk_STR_IMM,
+    tk_LDR_IMM,
+    tk_STRB_IMM,
+    tk_LDRB_IMM,
+    tk_STRH_IMM,
+    tk_LDRH_IMM,
+    tk_STR_SPREL,
+    tk_LDR_SPREL,
+
+    tk_PUSH,
+    tk_POP,
+    tk_LDMIA,
+    tk_STMIA,
+    tk_BCOND,
+    tk_BX,
+    tk_BLX_REG,
+    tk_B,
+    tk_BL_LONG_1,
+    tk_BL_LONG_2,
+    tk_UNK,
+    tk_SVC,
+
+    tk_Count // number of THUMB kinds; Decode() extracts the kind with a 6-bit mask (0x3F)
+};
+
+struct Info // result of Decode(): register usage and kind of one instruction
+{
+    u16 DstRegs, SrcRegs; // bit n set = register Rn is written (DstRegs) / read (SrcRegs)
+    u16 Kind;             // ak_* value for ARM instructions, tk_* value for THUMB
+
+    bool Branches() const // true when R15 is among the written registers
+    {
+        return (DstRegs & (1 << 15)) != 0;
+    }
+};
+
+Info Decode(bool thumb, u32 num, u32 instr); // thumb: decode as THUMB; num: CPU index (0 = ARM9); instr: raw opcode
+
+}
+
+#endif
+\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 32fcac2..a6011e1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,7 @@
 project(core)
 
+set (CMAKE_CXX_STANDARD 14)
+
 add_library(core STATIC
 	ARCodeList.cpp
 	AREngine.cpp
@@ -9,6 +11,7 @@ add_library(core STATIC
 	ARMInterpreter_ALU.cpp
 	ARMInterpreter_Branch.cpp
 	ARMInterpreter_LoadStore.cpp
+	ARM_InstrInfo.cpp
 	Config.cpp
 	CP15.cpp
 	CRC32.cpp
@@ -46,6 +49,15 @@ add_library(core STATIC
 	WifiAP.cpp
 	
 	tiny-AES-c/aes.c
+
+	ARMJIT.cpp
+	ARMJIT_x64/ARMJIT_Compiler.cpp
+
+	dolphin/CommonFuncs.cpp
+	dolphin/x64ABI.cpp
+	dolphin/x64CPUDetect.cpp
+	dolphin/x64Emitter.cpp
+	dolphin/MemoryUtil.cpp
 )
 
 if (WIN32)
diff --git a/src/CP15.cpp b/src/CP15.cpp
index d340b9e..3e1c08b 100644
--- a/src/CP15.cpp
+++ b/src/CP15.cpp
@@ -21,6 +21,7 @@
 #include "NDS.h"
 #include "DSi.h"
 #include "ARM.h"
+#include "ARMJIT.h"
 
 
 // access timing for cached regions
@@ -812,6 +813,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val)
     {
         DataCycles = 1;
         *(u8*)&ITCM[addr & 0x7FFF] = val;
+        ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL;
         return;
     }
     if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize))
@@ -833,6 +835,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val)
     {
         DataCycles = 1;
         *(u16*)&ITCM[addr & 0x7FFF] = val;
+        ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL;
         return;
     }
     if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize))
@@ -854,6 +857,8 @@ void ARMv5::DataWrite32(u32 addr, u32 val)
     {
         DataCycles = 1;
         *(u32*)&ITCM[addr & 0x7FFF] = val;
+        ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL;
+        ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL;
         return;
     }
     if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize))
@@ -875,6 +880,8 @@ void ARMv5::DataWrite32S(u32 addr, u32 val)
     {
         DataCycles += 1;
         *(u32*)&ITCM[addr & 0x7FFF] = val;
+        ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2] = NULL;
+        ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2 + 1] = NULL;
         return;
     }
     if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize))
diff --git a/src/NDS.cpp b/src/NDS.cpp
index 22368ae..2a7edfd 100644
--- a/src/NDS.cpp
+++ b/src/NDS.cpp
@@ -32,6 +32,7 @@
 #include "Wifi.h"
 #include "AREngine.h"
 #include "Platform.h"
+#include "ARMJIT.h"
 
 #include "DSi.h"
 #include "DSi_SPI_TSC.h"
@@ -168,6 +169,8 @@ bool Init()
     ARM9 = new ARMv5();
     ARM7 = new ARMv4();
 
+    ARMJIT::Init();
+
     DMAs[0] = new DMA(0, 0);
     DMAs[1] = new DMA(0, 1);
     DMAs[2] = new DMA(0, 2);
@@ -200,6 +203,8 @@ void DeInit()
     delete ARM9;
     delete ARM7;
 
+    ARMJIT::DeInit();
+
     for (int i = 0; i < 8; i++)
         delete DMAs[i];
 
@@ -1971,6 +1976,8 @@ u32 ARM9Read32(u32 addr)
 
 void ARM9Write8(u32 addr, u8 val)
 {
+    ARMJIT::Invalidate16(0, addr);
+
     switch (addr & 0xFF000000)
     {
     case 0x02000000:
@@ -2021,6 +2028,8 @@ void ARM9Write8(u32 addr, u8 val)
 
 void ARM9Write16(u32 addr, u16 val)
 {
+    ARMJIT::Invalidate16(0, addr);
+
     switch (addr & 0xFF000000)
     {
     case 0x02000000:
@@ -2087,6 +2096,8 @@ void ARM9Write16(u32 addr, u16 val)
 
 void ARM9Write32(u32 addr, u32 val)
 {
+    ARMJIT::Invalidate32(0, addr);
+
     switch (addr & 0xFF000000)
     {
     case 0x02000000:
@@ -2381,6 +2392,8 @@ u32 ARM7Read32(u32 addr)
 
 void ARM7Write8(u32 addr, u8 val)
 {
+    ARMJIT::Invalidate16(1, addr);
+
     switch (addr & 0xFF800000)
     {
     case 0x02000000:
@@ -2440,6 +2453,8 @@ void ARM7Write8(u32 addr, u8 val)
 
 void ARM7Write16(u32 addr, u16 val)
 {
+    ARMJIT::Invalidate16(1, addr);
+
     switch (addr & 0xFF800000)
     {
     case 0x02000000:
@@ -2509,6 +2524,8 @@ void ARM7Write16(u32 addr, u16 val)
 
 void ARM7Write32(u32 addr, u32 val)
 {
+    ARMJIT::Invalidate32(1, addr);
+
     switch (addr & 0xFF800000)
     {
     case 0x02000000:
diff --git a/src/dolphin/Assert.h b/src/dolphin/Assert.h
new file mode 100644
index 0000000..4eb16e0
--- /dev/null
+++ b/src/dolphin/Assert.h
@@ -0,0 +1,47 @@
+// Copyright 2015 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#include <assert.h>
+// ASSERT_MSG: reduced to assert(_a_); the log type, format string and arguments are ignored.
+#define ASSERT_MSG(_t_, _a_, _fmt_, ...)                                                           \
+  assert(_a_) \
+  /*do                                                                                               \
+  {                                                                                                \
+    if (!(_a_))                                                                                    \
+    {                                                                                              \
+      if (!PanicYesNo(_fmt_, ##__VA_ARGS__))                                                       \
+        Crash();                                                                                   \
+    }                                                                                              \
+  } while (0)*/
+// DEBUG_ASSERT_MSG: reduced to assert(_a_); expands to an expression, the caller supplies the ';'.
+#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...)                                                     \
+  assert(_a_) \
+  /*do                                                                                               \
+  {                                                                                                \
+    if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_))                                    \
+    {                                                                                              \
+      ERROR_LOG(_t_, _msg_, ##__VA_ARGS__);                                                        \
+      if (!PanicYesNo(_msg_, ##__VA_ARGS__))                                                       \
+        Crash();                                                                                   \
+    }                                                                                              \
+  } while (0)*/
+// ASSERT: reduced to assert(_a_); the original Dolphin body is kept below as a comment.
+#define ASSERT(_a_)                                                                                \
+  assert(_a_) \
+  /*do                                                                                               \
+  {                                                                                                \
+    ASSERT_MSG(MASTER_LOG, _a_,                                                                    \
+               _trans("An error occurred.\n\n  Line: %d\n  File: %s\n\nIgnore and continue?"),     \
+               __LINE__, __FILE__);                                                                \
+  } while (0)*/
+// DEBUG_ASSERT: reduced to assert(_a_); the log-level check of the original is commented out.
+#define DEBUG_ASSERT(_a_)                                                                          \
+  assert(_a_) \
+  /*do                                                                                               \
+  {                                                                                                \
+    if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG)                                              \
+      ASSERT(_a_);                                                                                 \
+  } while (0)*/
diff --git a/src/dolphin/BitSet.h b/src/dolphin/BitSet.h
new file mode 100644
index 0000000..d32b020
--- /dev/null
+++ b/src/dolphin/BitSet.h
@@ -0,0 +1,218 @@
+// This file is under the public domain.
+
+#pragma once
+
+#include <cstddef>
+#include <initializer_list>
+#include <type_traits>
+#include "../types.h"
+
+#ifdef _WIN32
+
+#include <intrin.h>
+
+namespace Common
+{
+template <typename T> // Windows build: one generic bit-parallel population count for every width
+constexpr int CountSetBits(T v)
+{
+  // from https://graphics.stanford.edu/~seander/bithacks.html
+  // GCC has this built in, but MSVC's intrinsic will only emit the actual
+  // POPCNT instruction, which we're not depending on
+  v = v - ((v >> 1) & (T) ~(T)0 / 3);
+  v = (v & (T) ~(T)0 / 15 * 3) + ((v >> 2) & (T) ~(T)0 / 15 * 3);
+  v = (v + (v >> 4)) & (T) ~(T)0 / 255 * 15;
+  return (T)(v * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * 8;
+}
+inline int LeastSignificantSetBit(u8 val) // index of the lowest set bit; val must be non-zero
+{
+  unsigned long index; // left unset by _BitScanForward when val == 0, so the result is then indeterminate
+  _BitScanForward(&index, val);
+  return (int)index;
+}
+inline int LeastSignificantSetBit(u16 val) // same contract as the u8 overload
+{
+  unsigned long index;
+  _BitScanForward(&index, val);
+  return (int)index;
+}
+inline int LeastSignificantSetBit(u32 val) // same contract as the u8 overload
+{
+  unsigned long index;
+  _BitScanForward(&index, val);
+  return (int)index;
+}
+inline int LeastSignificantSetBit(u64 val) // same contract, using the 64-bit intrinsic
+{
+  unsigned long index;
+  _BitScanForward64(&index, val);
+  return (int)index;
+}
+#else
+namespace Common
+{
+constexpr int CountSetBits(u8 val) // number of 1 bits in val; narrow types are promoted for the builtin
+{
+  return __builtin_popcount(val);
+}
+constexpr int CountSetBits(u16 val) // number of 1 bits in val
+{
+  return __builtin_popcount(val);
+}
+constexpr int CountSetBits(u32 val) // number of 1 bits in val
+{
+  return __builtin_popcount(val);
+}
+constexpr int CountSetBits(u64 val) // number of 1 bits in val, using the long long builtin
+{
+  return __builtin_popcountll(val);
+}
+inline int LeastSignificantSetBit(u8 val) // index of the lowest set bit; val must be non-zero
+{
+  return __builtin_ctz(val); // the builtin's result is undefined for 0
+}
+inline int LeastSignificantSetBit(u16 val) // same contract as the u8 overload
+{
+  return __builtin_ctz(val);
+}
+inline int LeastSignificantSetBit(u32 val) // same contract as the u8 overload
+{
+  return __builtin_ctz(val);
+}
+inline int LeastSignificantSetBit(u64 val) // same contract, using the long long builtin
+{
+  return __builtin_ctzll(val);
+}
+#endif
+
+// Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
+// using the set bits of an integer to represent a set of integers.  Like that
+// class, it acts like an array of bools:
+//     BitSet32 bs;
+//     bs[1] = true;
+// but also like the underlying integer ([0] = least significant bit):
+//     BitSet32 bs2 = ...;
+//     bs = (bs ^ bs2) & BitSet32(0xffff);
+// The following additional functionality is provided:
+// - Construction using an initializer list.
+//     BitSet bs { 1, 2, 4, 8 };
+// - Efficiently iterating through the set bits:
+//     for (int i : bs)
+//         [i is the *index* of a set bit]
+//   (This uses the appropriate CPU instruction to find the next set bit in one
+//   operation.)
+// - Counting set bits using .Count() - see comment on that method.
+
+// TODO: use constexpr when MSVC gets out of the Dark Ages
+
+template <typename IntTy> // IntTy: unsigned integer type whose bits form the set
+class BitSet
+{
+  static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types");
+
+public:
+  // A reference to a particular bit, returned from operator[].
+  class Ref
+  {
+  public:
+    constexpr Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {}
+    constexpr Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {}
+    constexpr operator bool() const { return (m_bs->m_val & m_mask) != 0; }
+    bool operator=(bool set)
+    {
+      m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0);
+      return set;
+    }
+
+  private:
+    BitSet* m_bs;
+    IntTy m_mask;
+  };
+
+  // A STL-like iterator is required to be able to use range-based for loops.
+  class Iterator
+  {
+  public:
+    constexpr Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
+    constexpr Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
+    Iterator& operator=(Iterator other)
+    {
+      new (this) Iterator(other);
+      return *this;
+    }
+    Iterator& operator++()
+    {
+      if (m_val == 0)
+      {
+        m_bit = -1;  // no bits left: becomes equal to end()
+      }
+      else
+      {
+        int bit = LeastSignificantSetBit(m_val);
+        m_val &= ~((IntTy)1 << bit);  // shift in IntTy: an int 1 is too narrow for BitSet64
+        m_bit = bit;
+      }
+      return *this;
+    }
+    Iterator operator++(int)
+    {
+      Iterator other(*this);
+      ++*this;
+      return other;
+    }
+    constexpr int operator*() const { return m_bit; }
+    constexpr bool operator==(Iterator other) const { return m_bit == other.m_bit; }
+    constexpr bool operator!=(Iterator other) const { return m_bit != other.m_bit; }
+
+  private:
+    IntTy m_val;  // bits not yet visited
+    int m_bit;    // index of the current bit, -1 at the end
+  };
+
+  constexpr BitSet() : m_val(0) {}
+  constexpr explicit BitSet(IntTy val) : m_val(val) {}
+  BitSet(std::initializer_list<int> init)
+  {
+    m_val = 0;
+    for (int bit : init)
+      m_val |= (IntTy)1 << bit;
+  }
+
+  constexpr static BitSet AllTrue(size_t count)
+  {
+    return BitSet(count == sizeof(IntTy) * 8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1));
+  }
+
+  Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
+  constexpr const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }
+  constexpr bool operator==(BitSet other) const { return m_val == other.m_val; }
+  constexpr bool operator!=(BitSet other) const { return m_val != other.m_val; }
+  constexpr bool operator<(BitSet other) const { return m_val < other.m_val; }
+  constexpr bool operator>(BitSet other) const { return m_val > other.m_val; }
+  constexpr BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); }
+  constexpr BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); }
+  constexpr BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); }
+  constexpr BitSet operator~() const { return BitSet(~m_val); }
+  constexpr BitSet operator<<(IntTy shift) const { return BitSet(m_val << shift); }
+  constexpr BitSet operator>>(IntTy shift) const { return BitSet(m_val >> shift); }
+  constexpr explicit operator bool() const { return m_val != 0; }
+  BitSet& operator|=(BitSet other) { return *this = *this | other; }
+  BitSet& operator&=(BitSet other) { return *this = *this & other; }
+  BitSet& operator^=(BitSet other) { return *this = *this ^ other; }
+  BitSet& operator<<=(IntTy shift) { return *this = *this << shift; }
+  BitSet& operator>>=(IntTy shift) { return *this = *this >> shift; }
+  // Warning: Even though on modern CPUs this is a single fast instruction,
+  // Dolphin's official builds do not currently assume POPCNT support on x86,
+  // so slower explicit bit twiddling is generated.  Still should generally
+  // be faster than a loop.
+  constexpr unsigned int Count() const { return CountSetBits(m_val); }
+  constexpr Iterator begin() const { return ++Iterator(m_val, 0); }  // one step to the first set bit
+  constexpr Iterator end() const { return Iterator(m_val, -1); }
+  IntTy m_val;
+};
+}  // namespace Common
+
+using BitSet8 = Common::BitSet<u8>;    // global-namespace aliases, one per underlying integer width
+using BitSet16 = Common::BitSet<u16>;
+using BitSet32 = Common::BitSet<u32>;
+using BitSet64 = Common::BitSet<u64>;
diff --git a/src/dolphin/CPUDetect.h b/src/dolphin/CPUDetect.h
new file mode 100644
index 0000000..bd4fd8d
--- /dev/null
+++ b/src/dolphin/CPUDetect.h
@@ -0,0 +1,76 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+// Detect the CPU, so we'll know which optimizations to use
+#pragma once
+
+#include <string>
+
+enum class CPUVendor // value type of CPUInfo::vendor
+{
+  Intel,
+  AMD,
+  ARM,
+  Other,
+};
+
+struct CPUInfo // host CPU description; every field below has an in-class default
+{
+  CPUVendor vendor = CPUVendor::Intel;
+
+  char cpu_string[0x41] = {};    // zero-initialised character buffers
+  char brand_string[0x21] = {};
+  bool OS64bit = false;
+  bool CPU64bit = false;
+  bool Mode64bit = false;
+
+  bool HTT = false;
+  int num_cores = 0;
+  int logical_cpu_count = 0;
+
+  bool bSSE = false;  // feature flags; presumably filled in by Detect(), which is defined elsewhere -- confirm
+  bool bSSE2 = false;
+  bool bSSE3 = false;
+  bool bSSSE3 = false;
+  bool bPOPCNT = false;
+  bool bSSE4_1 = false;
+  bool bSSE4_2 = false;
+  bool bLZCNT = false;
+  bool bSSE4A = false;
+  bool bAVX = false;
+  bool bAVX2 = false;
+  bool bBMI1 = false;
+  bool bBMI2 = false;
+  bool bFMA = false;
+  bool bFMA4 = false;
+  bool bAES = false;
+  // FXSAVE/FXRSTOR
+  bool bFXSR = false;
+  bool bMOVBE = false;
+  // This flag indicates that the hardware supports some mode
+  // in which denormal inputs _and_ outputs are automatically set to (signed) zero.
+  bool bFlushToZero = false;
+  bool bLAHFSAHF64 = false;
+  bool bLongMode = false;
+  bool bAtom = false;
+
+  // ARMv8 specific
+  bool bFP = false;
+  bool bASIMD = false;
+  bool bCRC32 = false;
+  bool bSHA1 = false;
+  bool bSHA2 = false;
+
+  // Call Detect()
+  explicit CPUInfo();
+
+  // Turn the CPU info into a string we can show
+  std::string Summarize();
+
+private:
+  // Detects the various CPU features
+  void Detect();
+};
+
+extern CPUInfo cpu_info; // declaration only; the object is defined outside this header
diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h
new file mode 100644
index 0000000..1434297
--- /dev/null
+++ b/src/dolphin/CodeBlock.h
@@ -0,0 +1,121 @@
+// Copyright 2014 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+#include "Assert.h"
+#include "../types.h"
+#include "MemoryUtil.h"
+
+namespace Common
+{
+// Everything that needs to generate code should inherit from this.
+// You get memory management for free, plus, you can use all emitter functions without
+// having to prefix them with gen-> or something similar.
+// Example implementation:
+// class JIT : public CodeBlock<ARMXEmitter> {}
+template <class T> // T: the emitter base class; must provide SetCodePtr() and GetCodePtr()
+class CodeBlock : public T
+{
+private:
+  // A privately used function to set the executable RAM space to something invalid.
+  // For debugging usefulness it should be used to set the RAM to a host specific breakpoint
+  // instruction
+  virtual void PoisonMemory() = 0;
+
+protected:
+  u8* region = nullptr;
+  // Size of region we can use.
+  size_t region_size = 0;
+  // Original size of the region we allocated.
+  size_t total_region_size = 0;
+
+  bool m_is_child = false;
+  std::vector<CodeBlock*> m_children;
+
+public:
+  CodeBlock() = default;
+  virtual ~CodeBlock()
+  {
+    if (region)
+      FreeCodeSpace();
+  }
+  CodeBlock(const CodeBlock&) = delete;
+  CodeBlock& operator=(const CodeBlock&) = delete;
+  CodeBlock(CodeBlock&&) = delete;
+  CodeBlock& operator=(CodeBlock&&) = delete;
+
+  // Call this before you generate any code.
+  void AllocCodeSpace(size_t size)
+  {
+    region_size = size;
+    total_region_size = size;
+    region = static_cast<u8*>(Common::AllocateExecutableMemory(total_region_size));  // NOTE(review): result is not checked here
+    T::SetCodePtr(region);
+  }
+
+  // Always clear code space with breakpoints, so that if someone accidentally executes
+  // uninitialized, it just breaks into the debugger.
+  void ClearCodeSpace()
+  {
+    PoisonMemory();
+    ResetCodePtr();
+  }
+
+  // Call this when shutting down. Don't rely on the destructor, even though it'll do the job.
+  void FreeCodeSpace()
+  {
+    ASSERT(!m_is_child);
+    Common::FreeMemoryPages(region, total_region_size);
+    region = nullptr;
+    region_size = 0;
+    total_region_size = 0;
+    for (CodeBlock* child : m_children)  // children point into this region, so only their bookkeeping is reset
+    {
+      child->region = nullptr;
+      child->region_size = 0;
+      child->total_region_size = 0;
+    }
+  }
+
+  bool IsInSpace(const u8* ptr) const { return ptr >= region && ptr < (region + region_size); }
+  // Cannot currently be undone. Will write protect the entire code region.
+  // Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()).
+  void WriteProtect() { Common::WriteProtectMemory(region, region_size, true); }
+  void ResetCodePtr() { T::SetCodePtr(region); }
+  size_t GetSpaceLeft() const
+  {
+    ASSERT(static_cast<size_t>(T::GetCodePtr() - region) < region_size);
+    return region_size - (T::GetCodePtr() - region);
+  }
+
+  bool IsAlmostFull() const
+  {
+    // This should be bigger than the biggest block ever.
+    return GetSpaceLeft() < 0x10000;
+  }
+
+  bool HasChildren() const { return region_size != total_region_size; }
+  u8* AllocChildCodeSpace(size_t child_size)
+  {
+    ASSERT_MSG(DYNA_REC, child_size < GetSpaceLeft(), "Insufficient space for child allocation.");
+    u8* child_region = region + region_size - child_size;  // carved from the end of the usable region
+    region_size -= child_size;
+    return child_region;
+  }
+  void AddChildCodeSpace(CodeBlock* child, size_t child_size)
+  {
+    u8* child_region = AllocChildCodeSpace(child_size);
+    child->m_is_child = true;
+    child->region = child_region;
+    child->region_size = child_size;
+    child->total_region_size = child_size;
+    child->ResetCodePtr();
+    m_children.emplace_back(child);
+  }
+};
+}  // namespace Common
diff --git a/src/dolphin/CommonFuncs.cpp b/src/dolphin/CommonFuncs.cpp
new file mode 100644
index 0000000..f85051d
--- /dev/null
+++ b/src/dolphin/CommonFuncs.cpp
@@ -0,0 +1,52 @@
+// Copyright 2009 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include <cstddef>
+#include <cstring>
+#include <errno.h>
+#include <type_traits>
+
+#include "CommonFuncs.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#define strerror_r(err, buf, len) strerror_s(buf, len, err)
+#endif
+
+constexpr size_t BUFFER_SIZE = 256; // size of the stack buffers that receive error messages in this file
+
+// Wrapper function to get last strerror(errno) string.
+// This function might change the error code.
+std::string LastStrerrorString()
+{
+  char error_message[BUFFER_SIZE];  // scratch buffer handed to strerror_r
+
+  // There are two variants of strerror_r. The XSI version stores the message to the passed-in
+  // buffer and returns an int (0 on success). The GNU version returns a pointer to the message,
+  // which might have been stored in the passed-in buffer or might be a static string.
+
+  // We check defines in order to figure out variant is in use, and we store the returned value
+  // to a variable so that we'll get a compile-time check that our assumption was correct.
+
+#if defined(__GLIBC__) && (_GNU_SOURCE || (_POSIX_C_SOURCE < 200112L && _XOPEN_SOURCE < 600))
+  const char* str = strerror_r(errno, error_message, BUFFER_SIZE);
+  return std::string(str);
+#else
+  int error_code = strerror_r(errno, error_message, BUFFER_SIZE);  // on _WIN32 this is the strerror_s macro above
+  return error_code == 0 ? std::string(error_message) : "";  // empty string when the lookup fails
+#endif
+}
+
+#ifdef _WIN32
+// Wrapper function to get GetLastError() string.
+// This function might change the error code. Returns "" when no message can be retrieved.
+std::string GetLastErrorString()
+{
+  char error_message[BUFFER_SIZE] = {};  // zeroed, so a failed lookup yields an empty string
+  // IGNORE_INSERTS: no argument array is supplied, so insert sequences must not be expanded.
+  FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr,
+                 GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), error_message,
+                 BUFFER_SIZE, nullptr);
+  return std::string(error_message);
+}
+#endif
diff --git a/src/dolphin/CommonFuncs.h b/src/dolphin/CommonFuncs.h
new file mode 100644
index 0000000..708fbc3
--- /dev/null
+++ b/src/dolphin/CommonFuncs.h
@@ -0,0 +1,58 @@
+// Copyright 2009 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include "../types.h"
+
+// Returns the element count N of a built-in array. Will fail to compile on a non-array:
+template <typename T, size_t N>
+constexpr size_t ArraySize(T (&arr)[N])
+{
+  return N;
+}
+
+#ifndef _WIN32
+
+// go to debugger mode
+#define Crash()                                                                                    \
+  {                                                                                                \
+    __builtin_trap();                                                                              \
+  }
+
+#else  // WIN32
+// Function Cross-Compatibility
+#define strcasecmp _stricmp
+#define strncasecmp _strnicmp
+#define unlink _unlink
+#define vscprintf _vscprintf
+
+// 64 bit offsets for Windows
+#define fseeko _fseeki64
+#define ftello _ftelli64
+#define atoll _atoi64
+#define stat _stat64
+#define fstat _fstat64
+#define fileno _fileno
+
+extern "C" {
+__declspec(dllimport) void __stdcall DebugBreak(void);
+}
+#define Crash()                                                                                    \
+  {                                                                                                \
+    DebugBreak();                                                                                  \
+  }
+#endif  // WIN32 ndef
+
+// Wrapper function to get last strerror(errno) string.
+// This function might change the error code.
+std::string LastStrerrorString();
+
+#ifdef _WIN32
+// Wrapper function to get GetLastError() string.
+// This function might change the error code.
+std::string GetLastErrorString();
+#endif
diff --git a/src/dolphin/Intrinsics.h b/src/dolphin/Intrinsics.h
new file mode 100644
index 0000000..483f219
--- /dev/null
+++ b/src/dolphin/Intrinsics.h
@@ -0,0 +1,72 @@
+// Copyright 2015 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#if defined(_M_X86)
+
+/**
+ * It is assumed that all compilers used to build Dolphin support intrinsics up to and including
+ * SSE 4.2 on x86/x64.
+ */
+
+#if defined(__GNUC__) || defined(__clang__)
+
+/**
+ * Due to limitations in GCC, SSE intrinsics are only available when compiling with the
+ * corresponding instruction set enabled. However, using the target attribute, we can compile
+ * single functions with a different target instruction set, while still creating a generic build.
+ *
+ * Since this instruction set is enabled per-function, any callers should verify that the
+ * instruction set is supported at runtime before calling it, and provide a fallback implementation
+ * when not supported.
+ *
+ * When building with -march=native, or enabling the instruction sets in the compile flags, permit
+ * usage of the intrinsics without any function attributes. If the command-line architecture does
+ * not support this instruction set, enable it via function targeting.
+ */
+
+#include <x86intrin.h>
+#ifndef __SSE4_2__
+#define FUNCTION_TARGET_SSE42 [[gnu::target("sse4.2")]]
+#endif
+#ifndef __SSE4_1__
+#define FUNCTION_TARGET_SSR41 [[gnu::target("sse4.1")]]
+#endif
+#ifndef __SSSE3__
+#define FUNCTION_TARGET_SSSE3 [[gnu::target("ssse3")]]
+#endif
+#ifndef __SSE3__
+#define FUNCTION_TARGET_SSE3 [[gnu::target("sse3")]]
+#endif
+
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
+
+/**
+ * MSVC and ICC support intrinsics for any instruction set without any function attributes.
+ */
+#include <intrin.h>
+
+#endif  // defined(_MSC_VER) || defined(__INTEL_COMPILER)
+
+#endif  // _M_X86
+
+/**
+ * Define the FUNCTION_TARGET macros to nothing if they are not needed, or not on an X86 platform.
+ * This way when a function is defined with FUNCTION_TARGET you don't need to define a second
+ * version without the macro around a #ifdef guard. Be careful when using intrinsics, as all use
+ * should still be placed around a #ifdef _M_X86 if the file is compiled on all architectures.
+ */
+#ifndef FUNCTION_TARGET_SSE42
+#define FUNCTION_TARGET_SSE42
+#endif
+#ifndef FUNCTION_TARGET_SSR41
+#define FUNCTION_TARGET_SSR41
+#endif
+#ifndef FUNCTION_TARGET_SSSE3
+#define FUNCTION_TARGET_SSSE3
+#endif
+#ifndef FUNCTION_TARGET_SSE3
+#define FUNCTION_TARGET_SSE3
+#endif
diff --git a/src/dolphin/Log.h b/src/dolphin/Log.h
new file mode 100644
index 0000000..21e69a5
--- /dev/null
+++ b/src/dolphin/Log.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "CommonFuncs.h"
+
+#include <stdio.h>
+
+#define PanicAlert(msg) \
+    do \
+    { \
+        printf("%s\n", msg); \
+        Crash(); \
+    } while (false)
+
+#define DYNA_REC 0
+
+#define ERROR_LOG(which, fmt, ...) \
+    do \
+    { \
+        printf(fmt "\n", ## __VA_ARGS__); \
+    } while (false)
diff --git a/src/dolphin/MemoryUtil.cpp b/src/dolphin/MemoryUtil.cpp
new file mode 100644
index 0000000..01cb897
--- /dev/null
+++ b/src/dolphin/MemoryUtil.cpp
@@ -0,0 +1,193 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include <cstddef>
+#include <cstdlib>
+#include <string>
+
+#define PanicAlert(fmt, ...) \
+  do \
+  { \
+    printf(fmt "\n", ## __VA_ARGS__); \
+    abort(); \
+  } while (false)
+
+#include "../types.h"
+#include "CommonFuncs.h"
+
+#include <stdio.h>
+#ifdef _WIN32
+#include <windows.h>
+//#include "Common/StringUtil.h"
+#else
+#include <sys/mman.h>
+#include <sys/types.h>
+#if defined __APPLE__ || defined __FreeBSD__ || defined __OpenBSD__
+#include <sys/sysctl.h>
+#elif defined __HAIKU__
+#include <OS.h>
+#else
+#include <sys/sysinfo.h>
+#endif
+#endif
+
+namespace Common
+{
+// This is purposely not a full wrapper for virtualalloc/mmap, but it
+// provides exactly the primitive operations that Dolphin needs.
+
+void* AllocateExecutableMemory(size_t size)
+{
+  // Allocates `size` bytes of anonymous memory that is readable, writable and executable
+  // (VirtualAlloc on Windows, mmap elsewhere).
+  // Never returns nullptr: an allocation failure ends in PanicAlert.
+#if defined(_WIN32)
+  void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+#else
+  void* ptr =
+      mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE, -1, 0);
+
+  // mmap signals failure with MAP_FAILED, not nullptr; fold both platforms into one check.
+  if (ptr == MAP_FAILED)
+    ptr = nullptr;
+#endif
+
+  // PanicAlert (macro at the top of this file) prints the message and calls abort().
+  if (ptr == nullptr)
+    PanicAlert("Failed to allocate executable memory");
+
+  return ptr;
+}
+
+void* AllocateMemoryPages(size_t size)
+{
+#ifdef _WIN32
+  void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_READWRITE);
+#else
+  void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
+  // mmap signals failure with MAP_FAILED rather than nullptr; normalise it for the check below.
+  if (ptr == MAP_FAILED)
+    ptr = nullptr;
+#endif
+  // Read/write (non-executable) pages; a failed allocation ends in PanicAlert, not a null return.
+  if (ptr == nullptr)
+    PanicAlert("Failed to allocate raw memory");
+
+  return ptr;
+}
+
+void* AllocateAlignedMemory(size_t size, size_t alignment)
+{
+#ifdef _WIN32
+  void* ptr = _aligned_malloc(size, alignment);
+#else
+  void* ptr = nullptr;
+  if (posix_memalign(&ptr, alignment, size) != 0)
+    ptr = nullptr;  // do not trust ptr after a failed call; the check below reports the failure
+#endif
+  // Returns `size` bytes aligned to `alignment`; a failed allocation ends in PanicAlert.
+  if (ptr == nullptr)
+    PanicAlert("Failed to allocate aligned memory");
+
+  return ptr;
+}
+
+void FreeMemoryPages(void* ptr, size_t size)
+{
+  if (ptr)  // a null pointer is silently ignored
+  {
+#ifdef _WIN32
+    if (!VirtualFree(ptr, 0, MEM_RELEASE))  // `size` is not used on this branch
+      PanicAlert("FreeMemoryPages failed!\nVirtualFree: %s", GetLastErrorString().c_str());
+#else
+    if (munmap(ptr, size) != 0)  // `size` is passed straight through to munmap
+      PanicAlert("FreeMemoryPages failed!\nmunmap: %s", LastStrerrorString().c_str());
+#endif
+  }
+}
+
+void FreeAlignedMemory(void* ptr)
+{
+  // Releases memory with _aligned_free (Windows) or free (elsewhere); null is ignored.
+  if (!ptr)
+    return;
+#ifdef _WIN32
+  _aligned_free(ptr);
+#else
+  free(ptr);
+#endif
+}
+
+void ReadProtectMemory(void* ptr, size_t size)
+{
+#ifdef _WIN32
+  DWORD oldValue;  // filled in by VirtualProtect; not used afterwards
+  if (!VirtualProtect(ptr, size, PAGE_NOACCESS, &oldValue))
+    PanicAlert("ReadProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str());
+#else
+  if (mprotect(ptr, size, PROT_NONE) != 0)  // PROT_NONE: no access at all, not only no reads
+    PanicAlert("ReadProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str());
+#endif
+}
+
+void WriteProtectMemory(void* ptr, size_t size, bool allowExecute)  // make range read-only
+{
+#ifdef _WIN32
+  DWORD oldValue;  // filled in by VirtualProtect; not used afterwards
+  if (!VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READ : PAGE_READONLY, &oldValue))
+    PanicAlert("WriteProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str());
+#else
+  if (mprotect(ptr, size, allowExecute ? (PROT_READ | PROT_EXEC) : PROT_READ) != 0)
+    PanicAlert("WriteProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str());
+#endif
+}
+
+void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute)  // make range writable again
+{
+#ifdef _WIN32
+  DWORD oldValue;  // filled in by VirtualProtect; not used afterwards
+  if (!VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldValue))
+    PanicAlert("UnWriteProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str());
+#else
+  if (mprotect(ptr, size,
+               allowExecute ? (PROT_READ | PROT_WRITE | PROT_EXEC) : PROT_WRITE | PROT_READ) != 0)
+  {
+    PanicAlert("UnWriteProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str());
+  }
+#endif
+}
+
+size_t MemPhysical()  // total physical memory in bytes as reported by the OS
+{
+#ifdef _WIN32
+  MEMORYSTATUSEX memInfo = {};  // zeroed so a failed query yields 0 instead of garbage
+  memInfo.dwLength = sizeof(MEMORYSTATUSEX);
+  GlobalMemoryStatusEx(&memInfo);
+  return memInfo.ullTotalPhys;
+#elif defined __APPLE__ || defined __FreeBSD__ || defined __OpenBSD__
+  int mib[2];
+  size_t physical_memory = 0;  // stays 0 on failure; also clears bytes sysctl does not write
+  mib[0] = CTL_HW;
+#ifdef __APPLE__
+  mib[1] = HW_MEMSIZE;
+#elif defined __FreeBSD__
+  mib[1] = HW_REALMEM;
+#elif defined __OpenBSD__
+  mib[1] = HW_PHYSMEM;
+#endif
+  size_t length = sizeof(size_t);
+  sysctl(mib, 2, &physical_memory, &length, NULL, 0);
+  return physical_memory;
+#elif defined __HAIKU__
+  system_info sysinfo;
+  get_system_info(&sysinfo);
+  return static_cast<size_t>(sysinfo.max_pages * B_PAGE_SIZE);
+#else
+  struct sysinfo memInfo = {};  // zeroed so a failed sysinfo() yields 0 instead of garbage
+  sysinfo(&memInfo);
+  return (size_t)memInfo.totalram * memInfo.mem_unit;
+#endif
+}
+
+}  // namespace Common
diff --git a/src/dolphin/MemoryUtil.h b/src/dolphin/MemoryUtil.h
new file mode 100644
index 0000000..607b7a8
--- /dev/null
+++ b/src/dolphin/MemoryUtil.h
@@ -0,0 +1,22 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+
+namespace Common
+{
+void* AllocateExecutableMemory(size_t size);  // read/write/execute pages
+void* AllocateMemoryPages(size_t size);       // read/write pages
+void FreeMemoryPages(void* ptr, size_t size);
+void* AllocateAlignedMemory(size_t size, size_t alignment);
+void FreeAlignedMemory(void* ptr);
+void ReadProtectMemory(void* ptr, size_t size);
+void WriteProtectMemory(void* ptr, size_t size, bool allowExecute = false);
+void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute = false);
+size_t MemPhysical();
+
+}  // namespace Common
diff --git a/src/dolphin/license_dolphin.txt b/src/dolphin/license_dolphin.txt
new file mode 100644
index 0000000..d511905
--- /dev/null
+++ b/src/dolphin/license_dolphin.txt
@@ -0,0 +1,339 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/src/dolphin/x64ABI.cpp b/src/dolphin/x64ABI.cpp
new file mode 100644
index 0000000..d86a158
--- /dev/null
+++ b/src/dolphin/x64ABI.cpp
@@ -0,0 +1,119 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include "../types.h"
+#include "x64ABI.h"
+#include "x64Emitter.h"
+
+using namespace Gen;
+
+// Shared code between Win64 and Unix64
+
+void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size,
+                                      size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp)
+{
+  size_t shadow = 0;
+#if defined(_WIN32)
+  shadow = 0x20;
+#endif
+  // Account for 8 bytes per general-purpose register selected in the mask.
+  int count = (mask & ABI_ALL_GPRS).Count();
+  rsp_alignment -= count * 8;
+  size_t subtraction = 0;
+  int fpr_count = (mask & ABI_ALL_FPRS).Count();
+  if (fpr_count)
+  {
+    // If we have any XMMs to save, we must align the stack here.
+    subtraction = rsp_alignment & 0xf;
+  }
+  subtraction += 16 * fpr_count;  // one 16-byte slot per XMM register in the mask
+  size_t xmm_base_subtraction = subtraction;
+  subtraction += needed_frame_size;
+  subtraction += shadow;
+  // Final alignment.
+  rsp_alignment -= subtraction;
+  subtraction += rsp_alignment & 0xf;
+  // Outputs: shadow size, total RSP adjustment, and RSP-relative offset of the first XMM slot.
+  *shadowp = shadow;
+  *subtractionp = subtraction;
+  *xmm_offsetp = subtraction - xmm_base_subtraction;
+}
+
+size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+                                                 size_t needed_frame_size)
+{
+  size_t shadow, subtraction, xmm_offset;
+  ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction,
+                         &xmm_offset);
+  // Emission order: push the GPRs, lower RSP, then store the XMMs into the reserved area.
+  for (int r : mask& ABI_ALL_GPRS)
+    PUSH((X64Reg)r);
+
+  if (subtraction)
+    SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));
+  // Mask bits 16..31 select the XMM registers, hence the `x - 16` register index.
+  for (int x : mask& ABI_ALL_FPRS)
+  {
+    MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg)(x - 16));
+    xmm_offset += 16;
+  }
+
+  return shadow;  // shadow size computed by ABI_CalculateFrameSize (0x20 on _WIN32, else 0)
+}
+
+void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+                                              size_t needed_frame_size)
+{
+  size_t shadow, subtraction, xmm_offset;
+  ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction,
+                         &xmm_offset);
+  // Undoes ABI_PushRegistersAndAdjustStack in reverse: reload XMMs, raise RSP, pop the GPRs.
+  for (int x : mask& ABI_ALL_FPRS)
+  {
+    MOVAPD((X64Reg)(x - 16), MDisp(RSP, (int)xmm_offset));
+    xmm_offset += 16;
+  }
+
+  if (subtraction)
+    ADD(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));
+  // Descending register order; presumably the reverse of the BitSet iteration used for the pushes.
+  for (int r = 15; r >= 0; r--)
+  {
+    if (mask[r])
+      POP((X64Reg)r);
+  }
+}
+
+void XEmitter::MOVTwo(int bits, Gen::X64Reg dst1, Gen::X64Reg src1, s32 offset1, Gen::X64Reg dst2,
+                      Gen::X64Reg src2)
+{
+  if (dst1 == src2 && dst2 == src1)  // the two moves form a swap
+  {
+    XCHG(bits, R(src1), R(src2));
+    if (offset1)
+      ADD(bits, R(dst1), Imm32(offset1));
+  }
+  else if (src2 != dst1)  // writing dst1 first cannot clobber src2
+  {
+    if (dst1 != src1 && offset1)
+      LEA(bits, dst1, MDisp(src1, offset1));  // dst1 = src1 + offset1 in one instruction
+    else if (dst1 != src1)
+      MOV(bits, R(dst1), R(src1));
+    else if (offset1)
+      ADD(bits, R(dst1), Imm32(offset1));
+    if (dst2 != src2)
+      MOV(bits, R(dst2), R(src2));
+  }
+  else  // dst1 == src2: copy src2 into dst2 before dst1 is overwritten
+  {
+    if (dst2 != src2)
+      MOV(bits, R(dst2), R(src2));
+    if (dst1 != src1 && offset1)
+      LEA(bits, dst1, MDisp(src1, offset1));
+    else if (dst1 != src1)
+      MOV(bits, R(dst1), R(src1));
+    else if (offset1)
+      ADD(bits, R(dst1), Imm32(offset1));
+  }
+}
diff --git a/src/dolphin/x64ABI.h b/src/dolphin/x64ABI.h
new file mode 100644
index 0000000..997782e
--- /dev/null
+++ b/src/dolphin/x64ABI.h
@@ -0,0 +1,57 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#include "BitSet.h"
+#include "x64Reg.h"
+
+// x64 ABIs, and helpers to help follow them when JIT-ing code.
+// All conventions return values in EAX (+ possibly EDX).
+
+// Windows 64-bit
+// * 4-reg "fastcall" variant, very new-skool stack handling
+// * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself
+// calls_
+// * Parameters passed in RCX, RDX, ... further parameters are MOVed into the allocated stack space.
+// Scratch:      RAX RCX RDX R8 R9 R10 R11
+// Callee-save:  RBX RSI RDI RBP R12 R13 R14 R15
+// Parameters:   RCX RDX R8 R9, further MOV-ed
+
+// Linux 64-bit
+// * 6-reg "fastcall" variant, old skool stack handling (parameters are pushed)
+// Scratch:      RAX RCX RDX RSI RDI R8 R9 R10 R11
+// Callee-save:  RBX RBP R12 R13 R14 R15
+// Parameters:   RDI RSI RDX RCX R8 R9
+
+// Register sets are BitSet32 masks: bits 0-15 select general-purpose registers and bits 16-31
+// select the XMM registers (XMM index + 16).
+#define ABI_ALL_FPRS BitSet32(0xffff0000)
+#define ABI_ALL_GPRS BitSet32(0x0000ffff)
+
+#ifdef _WIN32  // 64-bit Windows - the really exotic calling convention
+
+// Integer argument registers, in parameter order.
+#define ABI_PARAM1 RCX
+#define ABI_PARAM2 RDX
+#define ABI_PARAM3 R8
+#define ABI_PARAM4 R9
+
+// xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers.
+// NOTE(review): no XMM bits are listed in this set, so ABI_ALL_CALLEE_SAVED (its complement)
+// contains every XMM register on Windows - confirm that this is intended.
+#define ABI_ALL_CALLER_SAVED                                                                       \
+  (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11})
+#else  // 64-bit Unix / OS X
+
+// Integer argument registers, in parameter order.
+#define ABI_PARAM1 RDI
+#define ABI_PARAM2 RSI
+#define ABI_PARAM3 RDX
+#define ABI_PARAM4 RCX
+#define ABI_PARAM5 R8
+#define ABI_PARAM6 R9
+
+// FIXME: avoid pushing all 16 XMM registers when possible? most functions we call probably
+// don't actually clobber them.
+#define ABI_ALL_CALLER_SAVED (BitSet32{RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11} | ABI_ALL_FPRS)
+#endif  // WIN32
+
+// Everything that is not caller-saved, expressed as the bitwise complement of that set.
+#define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED)
+
+// Register holding integer return values.
+#define ABI_RETURN RAX
diff --git a/src/dolphin/x64CPUDetect.cpp b/src/dolphin/x64CPUDetect.cpp
new file mode 100644
index 0000000..05ee11c
--- /dev/null
+++ b/src/dolphin/x64CPUDetect.cpp
@@ -0,0 +1,274 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include <cstring>
+#include <string>
+
+#include "CPUDetect.h"
+#include "../types.h"
+#include "Intrinsics.h"
+
+// MSVC is identified by _MSC_VER and is expected to supply __cpuid/__cpuidex/_xgetbv as
+// intrinsics (presumably via Intrinsics.h); every other toolchain uses the inline-assembly
+// fallbacks below.
+#ifndef _MSC_VER
+
+#ifdef __FreeBSD__
+#include <unistd.h>
+
+#include <machine/cpufunc.h>
+#include <sys/types.h>
+#endif
+
+// Executes CPUID with EAX = function_id and ECX = subfunction_id and stores the resulting
+// EAX, EBX, ECX, EDX in info[0..3].
+static inline void __cpuidex(int info[4], int function_id, int subfunction_id)
+{
+#ifdef __FreeBSD__
+  // Despite the name, this is just do_cpuid() with ECX as second input.
+  cpuid_count((u_int)function_id, (u_int)subfunction_id, (u_int*)info);
+#else
+  info[0] = function_id;     // eax
+  info[2] = subfunction_id;  // ecx
+  __asm__("cpuid"
+          : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
+          : "a"(function_id), "c"(subfunction_id));
+#endif
+}
+
+// CPUID with sub-leaf 0.
+static inline void __cpuid(int info[4], int function_id)
+{
+  return __cpuidex(info, function_id, 0);
+}
+
+#endif  // ifndef _MSC_VER
+
+#ifdef _MSC_VER
+
+// Reads the extended control register selected by index through the compiler intrinsic.
+static u64 xgetbv(u32 index)
+{
+  return _xgetbv(index);
+}
+constexpr u32 XCR_XFEATURE_ENABLED_MASK = _XCR_XFEATURE_ENABLED_MASK;
+
+#else
+
+// Reads the extended control register selected by index; XGETBV returns it in EDX:EAX.
+static u64 xgetbv(u32 index)
+{
+  u32 eax, edx;
+  __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
+  return ((u64)edx << 32) | eax;
+}
+constexpr u32 XCR_XFEATURE_ENABLED_MASK = 0;
+#endif  // ifdef _MSC_VER
+
+CPUInfo cpu_info;
+
+// Runs feature detection immediately, so an instance is populated as soon as it is constructed.
+CPUInfo::CPUInfo()
+{
+  Detect();
+}
+
+// Detects the various CPU features
+//
+// Queries CPUID (and XGETBV for AVX) and fills in the vendor, the brand/model strings, the
+// feature flags (bSSE*, bAVX*, bBMI*, ...), HTT, logical_cpu_count and num_cores.
+void CPUInfo::Detect()
+{
+#ifdef _M_X86_64
+  Mode64bit = true;
+  OS64bit = true;
+#endif
+  num_cores = 1;
+
+  // Set obvious defaults, for extra safety
+  if (Mode64bit)
+  {
+    bSSE = true;
+    bSSE2 = true;
+    bLongMode = true;
+  }
+
+  // Assume CPU supports the CPUID instruction. Those that don't can barely
+  // boot modern OS:es anyway.
+  int cpu_id[4];
+
+  // Detect CPU's CPUID capabilities, and grab CPU string
+  // Leaf 0: EAX is the highest standard leaf; the 12-character vendor id is spread over
+  // EBX, EDX, ECX (in that order).
+  __cpuid(cpu_id, 0x00000000);
+  u32 max_std_fn = cpu_id[0];  // EAX
+  std::memcpy(&brand_string[0], &cpu_id[1], sizeof(int));
+  std::memcpy(&brand_string[4], &cpu_id[3], sizeof(int));
+  std::memcpy(&brand_string[8], &cpu_id[2], sizeof(int));
+  // Leaf 0x80000000: EAX is the highest extended leaf.
+  __cpuid(cpu_id, 0x80000000);
+  u32 max_ex_fn = cpu_id[0];
+  // NOTE(review): strcmp/strcpy rely on brand_string being NUL-terminated after the 12 copied
+  // bytes - presumably the member is zero-initialised in its declaration; confirm.
+  if (!strcmp(brand_string, "GenuineIntel"))
+    vendor = CPUVendor::Intel;
+  else if (!strcmp(brand_string, "AuthenticAMD"))
+    vendor = CPUVendor::AMD;
+  else
+    vendor = CPUVendor::Other;
+
+  // Set reasonable default brand string even if brand string not available.
+  strcpy(cpu_string, brand_string);
+
+  // Detect family and other misc stuff.
+  bool ht = false;
+  // HTT starts out false here and is only reassigned in the Intel branch further down.
+  HTT = ht;
+  logical_cpu_count = 1;
+  if (max_std_fn >= 1)
+  {
+    // Leaf 1: EAX carries family/model, EBX[23:16] the logical processor count, and ECX/EDX the
+    // feature bits tested below.
+    __cpuid(cpu_id, 0x00000001);
+    int family = ((cpu_id[0] >> 8) & 0xf) + ((cpu_id[0] >> 20) & 0xff);
+    int model = ((cpu_id[0] >> 4) & 0xf) + ((cpu_id[0] >> 12) & 0xf0);
+    // Detect people unfortunate enough to be running Dolphin on an Atom
+    if (family == 6 &&
+        (model == 0x1C || model == 0x26 || model == 0x27 || model == 0x35 || model == 0x36 ||
+         model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D))
+      bAtom = true;
+    logical_cpu_count = (cpu_id[1] >> 16) & 0xFF;
+    ht = (cpu_id[3] >> 28) & 1;
+
+    if ((cpu_id[3] >> 25) & 1)
+      bSSE = true;
+    if ((cpu_id[3] >> 26) & 1)
+      bSSE2 = true;
+    if ((cpu_id[2]) & 1)
+      bSSE3 = true;
+    if ((cpu_id[2] >> 9) & 1)
+      bSSSE3 = true;
+    if ((cpu_id[2] >> 19) & 1)
+      bSSE4_1 = true;
+    if ((cpu_id[2] >> 20) & 1)
+      bSSE4_2 = true;
+    if ((cpu_id[2] >> 22) & 1)
+      bMOVBE = true;
+    if ((cpu_id[2] >> 25) & 1)
+      bAES = true;
+
+    if ((cpu_id[3] >> 24) & 1)
+    {
+      // We can use FXSAVE.
+      bFXSR = true;
+    }
+
+    // AVX support requires 3 separate checks:
+    //  - Is the AVX bit set in CPUID?
+    //  - Is the XSAVE bit set in CPUID?
+    //  - XGETBV result has the XCR bit set.
+    if (((cpu_id[2] >> 28) & 1) && ((cpu_id[2] >> 27) & 1))
+    {
+      if ((xgetbv(XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6)
+      {
+        bAVX = true;
+        // FMA is only reported when AVX itself is usable.
+        if ((cpu_id[2] >> 12) & 1)
+          bFMA = true;
+      }
+    }
+
+    if (max_std_fn >= 7)
+    {
+      // Leaf 7, sub-leaf 0: feature bits in EBX.
+      __cpuidex(cpu_id, 0x00000007, 0x00000000);
+      // careful; we can't enable AVX2 unless the XSAVE/XGETBV checks above passed
+      if ((cpu_id[1] >> 5) & 1)
+        bAVX2 = bAVX;
+      if ((cpu_id[1] >> 3) & 1)
+        bBMI1 = true;
+      if ((cpu_id[1] >> 8) & 1)
+        bBMI2 = true;
+    }
+  }
+
+  bFlushToZero = bSSE;
+
+  if (max_ex_fn >= 0x80000004)
+  {
+    // Extract CPU model string
+    // Three leaves of 16 bytes each, written back to back into cpu_string (48 bytes in total).
+    __cpuid(cpu_id, 0x80000002);
+    memcpy(cpu_string, cpu_id, sizeof(cpu_id));
+    __cpuid(cpu_id, 0x80000003);
+    memcpy(cpu_string + 16, cpu_id, sizeof(cpu_id));
+    __cpuid(cpu_id, 0x80000004);
+    memcpy(cpu_string + 32, cpu_id, sizeof(cpu_id));
+  }
+  if (max_ex_fn >= 0x80000001)
+  {
+    // Check for more features.
+    __cpuid(cpu_id, 0x80000001);
+    if (cpu_id[2] & 1)
+      bLAHFSAHF64 = true;
+    if ((cpu_id[2] >> 5) & 1)
+      bLZCNT = true;
+    if ((cpu_id[2] >> 16) & 1)
+      bFMA4 = true;
+    if ((cpu_id[3] >> 29) & 1)
+      bLongMode = true;
+  }
+
+  // Default: one core per logical processor reported by leaf 1 (never fewer than one).
+  num_cores = (logical_cpu_count == 0) ? 1 : logical_cpu_count;
+
+  if (max_ex_fn >= 0x80000008)
+  {
+    // Get number of cores. This is a bit complicated. Following AMD manual here.
+    __cpuid(cpu_id, 0x80000008);
+    int apic_id_core_id_size = (cpu_id[2] >> 12) & 0xF;
+    if (apic_id_core_id_size == 0)
+    {
+      if (ht)
+      {
+        // New mechanism for modern Intel CPUs.
+        if (vendor == CPUVendor::Intel)
+        {
+          __cpuidex(cpu_id, 0x00000004, 0x00000000);
+          int cores_x_package = ((cpu_id[0] >> 26) & 0x3F) + 1;
+          HTT = (cores_x_package < logical_cpu_count);
+          // Only trust the core count when it evenly divides the logical processor count.
+          cores_x_package = ((logical_cpu_count % cores_x_package) == 0) ? cores_x_package : 1;
+          num_cores = (cores_x_package > 1) ? cores_x_package : num_cores;
+          logical_cpu_count /= cores_x_package;
+        }
+      }
+    }
+    else
+    {
+      // Use AMD's new method.
+      num_cores = (cpu_id[2] & 0xFF) + 1;
+    }
+  }
+}
+
+// Builds a human-readable description: "<cpu_string> (<brand_string>)" followed by a
+// comma-separated list of the detected features.
+std::string CPUInfo::Summarize()
+{
+  std::string sum(cpu_string);
+  sum += " (";
+  sum += brand_string;
+  sum += ")";
+
+  // Appends label only when the corresponding feature was detected.
+  const auto append_if = [&sum](bool present, const char* label) {
+    if (present)
+      sum += label;
+  };
+
+  append_if(bSSE, ", SSE");
+  append_if(bSSE2, ", SSE2");
+  // The DAZ remark is only attached to an SSE2 entry.
+  append_if(bSSE2 && !bFlushToZero, " (but not DAZ!)");
+  append_if(bSSE3, ", SSE3");
+  append_if(bSSSE3, ", SSSE3");
+  append_if(bSSE4_1, ", SSE4.1");
+  append_if(bSSE4_2, ", SSE4.2");
+  append_if(HTT, ", HTT");
+  append_if(bAVX, ", AVX");
+  append_if(bAVX2, ", AVX2");
+  append_if(bBMI1, ", BMI1");
+  append_if(bBMI2, ", BMI2");
+  append_if(bFMA, ", FMA");
+  append_if(bAES, ", AES");
+  append_if(bMOVBE, ", MOVBE");
+  append_if(bLongMode, ", 64-bit support");
+  return sum;
+}
diff --git a/src/dolphin/x64Emitter.cpp b/src/dolphin/x64Emitter.cpp
new file mode 100644
index 0000000..7849624
--- /dev/null
+++ b/src/dolphin/x64Emitter.cpp
@@ -0,0 +1,3398 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include <cinttypes>
+#include <cstring>
+
+#include "CPUDetect.h"
+#include "../types.h"
+#include "Log.h"
+#include "x64Emitter.h"
+#include "x64Reg.h"
+
+namespace Gen
+{
+// TODO(ector): Add EAX special casing, for ever so slightly smaller code.
+// One row of opcode bytes for a two-operand integer instruction. Judging by the field names,
+// the entries are the opcodes for the to-r/m, from-r/m, immediate and EAX-immediate forms in
+// 8-bit and wider variants, and ext is the ModRM /digit used with the immediate forms -
+// confirm against the code that consumes this table.
+struct NormalOpDef
+{
+  u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, eaximm8, eaximm32, ext;
+};
+
+// 0xCC is code for invalid combination of immediates
+// The rows appear in the same order as the enumerators of NormalOp, so the table is presumably
+// indexed by that enum; keep the two in sync.
+static const NormalOpDef normalops[11] = {
+    {0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0x04, 0x05, 0},  // ADD
+    {0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 0x14, 0x15, 2},  // ADC
+
+    {0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 0x2C, 0x2D, 5},  // SUB
+    {0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 0x1C, 0x1D, 3},  // SBB
+
+    {0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 0x24, 0x25, 4},  // AND
+    {0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 0x0C, 0x0D, 1},  // OR
+
+    {0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 0x34, 0x35, 6},  // XOR
+    {0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0xCC, 0xCC, 0},  // MOV
+
+    {0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0xA8, 0xA9, 0},  // TEST (to == from)
+    {0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 0x3C, 0x3D, 7},  // CMP
+
+    {0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 7},  // XCHG
+};
+
+// Opcode byte for each SSE instruction family (presumably the byte that follows the 0x0F
+// escape; the prefix selecting the ps/pd/ss/sd variant is supplied elsewhere).
+// Some values repeat on purpose: sseMOVHLPS shares 0x12 with sseMOVLPfromRM and sseMOVLHPS
+// shares 0x16 with sseMOVHPfromRM.
+enum NormalSSEOps
+{
+  sseCMP = 0xC2,
+  sseADD = 0x58,   // ADD
+  sseSUB = 0x5C,   // SUB
+  sseAND = 0x54,   // AND
+  sseANDN = 0x55,  // ANDN
+  sseOR = 0x56,
+  sseXOR = 0x57,
+  sseMUL = 0x59,          // MUL
+  sseDIV = 0x5E,          // DIV
+  sseMIN = 0x5D,          // MIN
+  sseMAX = 0x5F,          // MAX
+  sseCOMIS = 0x2F,        // COMIS
+  sseUCOMIS = 0x2E,       // UCOMIS
+  sseSQRT = 0x51,         // SQRT
+  sseRCP = 0x53,          // RCP
+  sseRSQRT = 0x52,        // RSQRT (NO DOUBLE PRECISION!!!)
+  sseMOVAPfromRM = 0x28,  // MOVAP from RM
+  sseMOVAPtoRM = 0x29,    // MOVAP to RM
+  sseMOVUPfromRM = 0x10,  // MOVUP from RM
+  sseMOVUPtoRM = 0x11,    // MOVUP to RM
+  sseMOVLPfromRM = 0x12,
+  sseMOVLPtoRM = 0x13,
+  sseMOVHPfromRM = 0x16,
+  sseMOVHPtoRM = 0x17,
+  sseMOVHLPS = 0x12,
+  sseMOVLHPS = 0x16,
+  sseMOVDQfromRM = 0x6F,
+  sseMOVDQtoRM = 0x7F,
+  sseMASKMOVDQU = 0xF7,
+  sseLDDQU = 0xF0,
+  sseSHUF = 0xC6,
+  sseMOVNTDQ = 0xE7,
+  sseMOVNTP = 0x2B,
+};
+
+// Identifies a two-operand integer instruction. The enumerators are listed in the same order
+// as the rows of the normalops table, so they presumably serve as its index.
+enum class NormalOp
+{
+  ADD,
+  ADC,
+  SUB,
+  SBB,
+  AND,
+  OR,
+  XOR,
+  MOV,
+  TEST,
+  CMP,
+  XCHG,
+};
+
+// x87 load/store operation selector; the numeric values are presumably the ModRM /digit of the
+// corresponding instruction - confirm against the code that emits them.
+enum class FloatOp
+{
+  LD = 0,
+  ST = 2,
+  STP = 3,
+  LD80 = 5,
+  STP80 = 7,
+
+  Invalid = -1,
+};
+
+// Sets the position at which the next emitted byte will be written.
+void XEmitter::SetCodePtr(u8* ptr)
+{
+  code = ptr;
+}
+
+// Returns the current write position as a read-only pointer.
+const u8* XEmitter::GetCodePtr() const
+{
+  return code;
+}
+
+// Returns the current write position as a writable pointer.
+u8* XEmitter::GetWritableCodePtr()
+{
+  return code;
+}
+
+// Raw emitters: store value at the current write position in host byte order and advance the
+// position by the size of the value. memcpy is used so the destination may be unaligned.
+void XEmitter::Write8(u8 value)
+{
+  *code = value;
+  ++code;
+}
+
+void XEmitter::Write16(u16 value)
+{
+  std::memcpy(code, &value, sizeof(value));
+  code += sizeof(value);
+}
+
+void XEmitter::Write32(u32 value)
+{
+  std::memcpy(code, &value, sizeof(value));
+  code += sizeof(value);
+}
+
+void XEmitter::Write64(u64 value)
+{
+  std::memcpy(code, &value, sizeof(value));
+  code += sizeof(value);
+}
+
+// Fills the next bytes positions with 0xCC (the INT3 opcode) and advances past them.
+// A non-positive count emits nothing.
+void XEmitter::ReserveCodeSpace(int bytes)
+{
+  if (bytes <= 0)
+    return;
+
+  std::memset(code, 0xCC, static_cast<size_t>(bytes));
+  code += bytes;
+}
+
+// Pads with 0xCC bytes until the write position is a multiple of alignment (which must be a
+// power of two) and returns the aligned position.
+u8* XEmitter::AlignCodeTo(size_t alignment)
+{
+  ASSERT_MSG(DYNA_REC, alignment != 0 && (alignment & (alignment - 1)) == 0,
+             "Alignment must be power of two");
+  const u64 misalignment = reinterpret_cast<u64>(code) & (alignment - 1);
+  if (misalignment != 0)
+  {
+    const int padding = static_cast<int>(alignment - misalignment);
+    ReserveCodeSpace(padding);
+  }
+  return code;
+}
+
+// Convenience wrappers for common alignments.
+u8* XEmitter::AlignCode4()
+{
+  return AlignCodeTo(4);
+}
+
+u8* XEmitter::AlignCode16()
+{
+  return AlignCodeTo(16);
+}
+
+// Aligns to a 4096-byte boundary.
+u8* XEmitter::AlignCodePage()
+{
+  return AlignCodeTo(4096);
+}
+
+// Called by emitters of instructions that modify the CPU flags: asserts that the flags are not
+// currently marked as locked (flags_locked).
+// If the flags are locked, we should immediately and loudly fail before
+// causing a subtle JIT bug.
+void XEmitter::CheckFlags()
+{
+  ASSERT_MSG(DYNA_REC, !flags_locked, "Attempt to modify flags while flags locked!");
+}
+
+// Emits a ModRM byte: mod in bits 7-6, the low three bits of reg in bits 5-3 and the low three
+// bits of rm in bits 2-0.
+void XEmitter::WriteModRM(int mod, int reg, int rm)
+{
+  const int reg_field = (reg & 7) << 3;
+  const int rm_field = rm & 7;
+  Write8((u8)((mod << 6) | reg_field | rm_field));
+}
+
+// Emits a SIB byte: scale in bits 7-6, the low three bits of index in bits 5-3 and the low
+// three bits of base in bits 2-0.
+void XEmitter::WriteSIB(int scale, int index, int base)
+{
+  const int index_field = (index & 7) << 3;
+  const int base_field = base & 7;
+  Write8((u8)((scale << 6) | index_field | base_field));
+}
+
+// Emits a REX prefix for this operand when one is required.
+//   opBits   - operation width; 64 sets REX.W, 8 takes part in the byte-register check.
+//   bits     - width of this operand; 8 takes part in the byte-register check.
+//   customOp - register for the ModRM reg field; -1 means "use operandReg".
+// Nothing is written when no REX bit is set and no SPL/BPL/SIL/DIL access is involved.
+void OpArg::WriteREX(XEmitter* emit, int opBits, int bits, int customOp) const
+{
+  if (customOp == -1)
+    customOp = operandReg;
+  u8 op = 0x40;
+  // REX.W (whether operation is a 64-bit operation)
+  if (opBits == 64)
+    op |= 8;
+  // REX.R (whether ModR/M reg field refers to R8-R15.
+  if (customOp & 8)
+    op |= 4;
+  // REX.X (whether ModR/M SIB index field refers to R8-R15)
+  if (indexReg & 8)
+    op |= 2;
+  // REX.B (whether ModR/M rm or SIB base or opcode reg field refers to R8-R15)
+  if (offsetOrBaseReg & 8)
+    op |= 1;
+  // Write REX if we have REX bits to write, or if the operation accesses
+  // SIL, DIL, BPL, or SPL.
+  // (x & 0x10c) == 4 selects register numbers 4-7 that are neither R8-R15 (bit 3) nor flagged
+  // as a high-byte register (bit 8).
+  if (op != 0x40 || (scale == SCALE_NONE && bits == 8 && (offsetOrBaseReg & 0x10c) == 4) ||
+      (opBits == 8 && (customOp & 0x10c) == 4))
+  {
+    emit->Write8(op);
+    // Check the operation doesn't access AH, BH, CH, or DH.
+    DEBUG_ASSERT((offsetOrBaseReg & 0x100) == 0);
+    DEBUG_ASSERT((customOp & 0x100) == 0);
+  }
+}
+
+// Emits a VEX prefix for this operand, choosing the two-byte (0xC5) form when possible and the
+// three-byte (0xC4) form otherwise.
+//   regOp1 - register for the ModRM reg field (supplies R).
+//   regOp2 - register for the vvvv field, or INVALID_REG when unused.
+//   L, pp, mmmmm, W - the correspondingly named VEX fields, written as given.
+void OpArg::WriteVEX(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm,
+                     int W) const
+{
+  // R, X and B are stored inverted: 1 means the register is NOT one of the upper eight.
+  int R = !(regOp1 & 8);
+  int X = !(indexReg & 8);
+  int B = !(offsetOrBaseReg & 8);
+
+  // vvvv is stored inverted as well; all ones when there is no second register operand.
+  int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf);
+
+  // do we need any VEX fields that only appear in the three-byte form?
+  if (X == 1 && B == 1 && W == 0 && mmmmm == 1)
+  {
+    u8 RvvvvLpp = (R << 7) | (vvvv << 3) | (L << 2) | pp;
+    emit->Write8(0xC5);
+    emit->Write8(RvvvvLpp);
+  }
+  else
+  {
+    u8 RXBmmmmm = (R << 7) | (X << 6) | (B << 5) | mmmmm;
+    u8 WvvvvLpp = (W << 7) | (vvvv << 3) | (L << 2) | pp;
+    emit->Write8(0xC4);
+    emit->Write8(RXBmmmmm);
+    emit->Write8(WvvvvLpp);
+  }
+}
+
+// Emits everything that follows the opcode for this operand: the ModRM byte, an optional SIB
+// byte and an optional 8- or 32-bit displacement.
+//   extraBytes        - number of bytes (e.g. an immediate) still to be emitted after this
+//                       operand; needed to compute RIP-relative displacements.
+//   _operandReg       - value for the ModRM reg field; INVALID_REG means "use operandReg".
+//   warn_64bit_offset - when false, the RIP-relative range assertion is suppressed.
+void OpArg::WriteRest(XEmitter* emit, int extraBytes, X64Reg _operandReg,
+                      bool warn_64bit_offset) const
+{
+  if (_operandReg == INVALID_REG)
+    _operandReg = (X64Reg)this->operandReg;
+  int mod = 0;
+  int ireg = indexReg;
+  bool SIB = false;
+  int _offsetOrBaseReg = this->offsetOrBaseReg;
+
+  if (scale == SCALE_RIP)  // Also, on 32-bit, just an immediate address
+  {
+    // Oh, RIP addressing.
+    _offsetOrBaseReg = 5;
+    emit->WriteModRM(0, _operandReg, _offsetOrBaseReg);
+    // TODO : add some checks
+    // The displacement is relative to the end of the instruction: the 4 displacement bytes
+    // plus whatever follows them.
+    u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes;
+    s64 distance = (s64)offset - (s64)ripAddr;
+    ASSERT_MSG(DYNA_REC,
+               (distance < 0x80000000LL && distance >= -0x80000000LL) || !warn_64bit_offset,
+               "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")", ripAddr, offset);
+    s32 offs = (s32)distance;
+    emit->Write32((u32)offs);
+    return;
+  }
+
+  if (scale == 0)
+  {
+    // Oh, no memory, Just a reg.
+    mod = 3;  // 11
+  }
+  else
+  {
+    // Ah good, no scaling.
+    // Base registers whose low three bits are 4 or 5 cannot use this plain encoding and are
+    // handled by the final else branch.
+    if (scale == SCALE_ATREG && !((_offsetOrBaseReg & 7) == 4 || (_offsetOrBaseReg & 7) == 5))
+    {
+      // Okay, we're good. No SIB necessary.
+      int ioff = (int)offset;
+      if (ioff == 0)
+      {
+        mod = 0;
+      }
+      else if (ioff < -128 || ioff > 127)
+      {
+        mod = 2;  // 32-bit displacement
+      }
+      else
+      {
+        mod = 1;  // 8-bit displacement
+      }
+    }
+    else if (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)
+    {
+      // Scaled index without a base: mod 0 with base field 5, always followed by a 32-bit
+      // displacement (written at the end of this function).
+      SIB = true;
+      mod = 0;
+      _offsetOrBaseReg = 5;
+    }
+    else
+    {
+      if ((_offsetOrBaseReg & 7) == 4)  // this would occupy the SIB encoding :(
+      {
+        // So we have to fake it with SIB encoding :(
+        SIB = true;
+      }
+
+      if (scale >= SCALE_1 && scale < SCALE_ATREG)
+      {
+        SIB = true;
+      }
+
+      if (scale == SCALE_ATREG && ((_offsetOrBaseReg & 7) == 4))
+      {
+        SIB = true;
+        ireg = _offsetOrBaseReg;
+      }
+
+      // Okay, we're fine. Just disp encoding.
+      // We need displacement. Which size?
+      // This branch always emits a displacement, even when the offset is zero.
+      int ioff = (int)(s64)offset;
+      if (ioff < -128 || ioff > 127)
+      {
+        mod = 2;  // 32-bit displacement
+      }
+      else
+      {
+        mod = 1;  // 8-bit displacement
+      }
+    }
+  }
+
+  // Okay. Time to do the actual writing
+  // ModRM byte:
+  // An rm field of 4 announces that a SIB byte follows.
+  int oreg = _offsetOrBaseReg;
+  if (SIB)
+    oreg = 4;
+
+  emit->WriteModRM(mod, _operandReg & 7, oreg & 7);
+
+  if (SIB)
+  {
+    // SIB byte
+    int ss;
+    switch (scale)
+    {
+    case SCALE_NONE:
+      _offsetOrBaseReg = 4;
+      ss = 0;
+      break;  // RSP
+    case SCALE_1:
+      ss = 0;
+      break;
+    case SCALE_2:
+      ss = 1;
+      break;
+    case SCALE_4:
+      ss = 2;
+      break;
+    case SCALE_8:
+      ss = 3;
+      break;
+    case SCALE_NOBASE_2:
+      ss = 1;
+      break;
+    case SCALE_NOBASE_4:
+      ss = 2;
+      break;
+    case SCALE_NOBASE_8:
+      ss = 3;
+      break;
+    case SCALE_ATREG:
+      ss = 0;
+      break;
+    default:
+      ASSERT_MSG(DYNA_REC, 0, "Invalid scale for SIB byte");
+      ss = 0;
+      break;
+    }
+    emit->Write8((u8)((ss << 6) | ((ireg & 7) << 3) | (_offsetOrBaseReg & 7)));
+  }
+
+  if (mod == 1)  // 8-bit disp
+  {
+    emit->Write8((u8)(s8)(s32)offset);
+  }
+  else if (mod == 2 || (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8))  // 32-bit disp
+  {
+    emit->Write32((u32)offset);
+  }
+}
+
+// Emits a REX prefix built from four flags (any non-zero value counts as set):
+//   w - 64-bit operand size
+//   r - upper bit of the ModRM reg field
+//   x - upper bit of the SIB index register
+//   b - upper bit of the base / rm register
+// Nothing is emitted when all four are clear.
+void XEmitter::Rex(int w, int r, int x, int b)
+{
+  u8 prefix = 0x40;
+  if (w)
+    prefix |= 0x08;
+  if (r)
+    prefix |= 0x04;
+  if (x)
+    prefix |= 0x02;
+  if (b)
+    prefix |= 0x01;
+
+  if (prefix == 0x40)
+    return;
+  Write8(prefix);
+}
+
+// Emits an unconditional relative jump to addr. With force5Bytes the 5-byte rel32 form (0xE9)
+// is used; otherwise the 2-byte rel8 form (0xEB), which asserts that the target is in range.
+void XEmitter::JMP(const u8* addr, bool force5Bytes)
+{
+  const u64 fn = (u64)addr;
+
+  if (force5Bytes)
+  {
+    // Displacements are measured from the end of the instruction.
+    const s64 distance = (s64)(fn - ((u64)code + 5));
+
+    ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL,
+               "Jump target too far away, needs indirect register");
+    Write8(0xE9);
+    Write32((u32)(s32)distance);
+    return;
+  }
+
+  const s64 distance = (s64)(fn - ((u64)code + 2));
+  ASSERT_MSG(DYNA_REC, distance >= -0x80 && distance < 0x80,
+             "Jump target too far away, needs force5Bytes = true");
+  // 8 bits will do
+  Write8(0xEB);
+  Write8((u8)(s8)distance);
+}
+
+// Emits an indirect jump through a register or memory operand (opcode 0xFF with 4 in the
+// ModRM reg field). Immediate operands are rejected.
+void XEmitter::JMPptr(const OpArg& arg2)
+{
+  // Work on a copy because operandReg is overwritten with the opcode extension.
+  OpArg arg = arg2;
+  if (arg.IsImm())
+    ASSERT_MSG(DYNA_REC, 0, "JMPptr - Imm argument");
+  arg.operandReg = 4;
+  arg.WriteREX(this, 0, 0);
+  Write8(0xFF);
+  arg.WriteRest(this);
+}
+
+// Can be used to trap other processors, before overwriting their code
+// not used in Dolphin
+// Emits a short jump with displacement -2, i.e. a jump to itself.
+void XEmitter::JMPself()
+{
+  Write8(0xEB);
+  Write8(0xFE);
+}
+
+// Emits an indirect call through a register or memory operand (opcode 0xFF with 2 in the
+// ModRM reg field). Immediate operands are rejected.
+void XEmitter::CALLptr(OpArg arg)
+{
+  if (arg.IsImm())
+    ASSERT_MSG(DYNA_REC, 0, "CALLptr - Imm argument");
+  arg.operandReg = 2;
+  arg.WriteREX(this, 0, 0);
+  Write8(0xFF);
+  arg.WriteRest(this);
+}
+
+// Emits a 5-byte relative call (0xE8 rel32) to fnptr, asserting that the target lies within
+// the signed 32-bit range of the end of the instruction.
+void XEmitter::CALL(const void* fnptr)
+{
+  // Unsigned arithmetic: in-range forward distances are below 2^31, in-range backward
+  // distances wrap around to at least 2^64 - 2^31.
+  u64 distance = u64(fnptr) - (u64(code) + 5);
+  ASSERT_MSG(DYNA_REC, distance < 0x0000000080000000ULL || distance >= 0xFFFFFFFF80000000ULL,
+             "CALL out of range (%p calls %p)", code, fnptr);
+  Write8(0xE8);
+  Write32(u32(distance));
+}
+
+// Emits a relative call with a zero placeholder displacement and returns a fixup record whose
+// ptr is the address just past the instruction; resolve it with SetJumpTarget.
+FixupBranch XEmitter::CALL()
+{
+  FixupBranch branch;
+  branch.type = FixupBranch::Type::Branch32Bit;
+  branch.ptr = code + 5;
+  Write8(0xE8);
+  Write32(0);
+  return branch;
+}
+
+// Emits an unconditional forward jump with a zero placeholder displacement and returns the
+// fixup record for it (ptr is the address just past the instruction). The 2-byte rel8 form is
+// used unless force5bytes requests the 5-byte rel32 form.
+FixupBranch XEmitter::J(bool force5bytes)
+{
+  FixupBranch branch;
+  if (force5bytes)
+  {
+    branch.type = FixupBranch::Type::Branch32Bit;
+    branch.ptr = code + 5;
+    Write8(0xE9);
+    Write32(0);
+  }
+  else
+  {
+    // 8 bits will do
+    branch.type = FixupBranch::Type::Branch8Bit;
+    branch.ptr = code + 2;
+    Write8(0xEB);
+    Write8(0);
+  }
+  return branch;
+}
+
+// Emits a conditional forward jump with a zero placeholder displacement and returns the fixup
+// record for it (ptr is the address just past the instruction). The 2-byte rel8 form is used
+// unless force5bytes requests the 6-byte rel32 form.
+FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes)
+{
+  FixupBranch branch;
+  if (force5bytes)
+  {
+    branch.type = FixupBranch::Type::Branch32Bit;
+    branch.ptr = code + 6;
+    Write8(0x0F);
+    Write8(0x80 + conditionCode);
+    Write32(0);
+  }
+  else
+  {
+    // 8 bits will do
+    branch.type = FixupBranch::Type::Branch8Bit;
+    branch.ptr = code + 2;
+    Write8(0x70 + conditionCode);
+    Write8(0);
+  }
+  return branch;
+}
+
+// Emits a conditional jump to a known address, using the 2-byte rel8 form when the target is
+// close enough and the 6-byte rel32 form otherwise.
+void XEmitter::J_CC(CCFlags conditionCode, const u8* addr)
+{
+  const u64 fn = (u64)addr;
+
+  // Try the short form first; its displacement is relative to the end of a 2-byte instruction.
+  s64 distance = (s64)(fn - ((u64)code + 2));
+  const bool fits_rel8 = distance >= -0x80 && distance < 0x80;
+  if (fits_rel8)
+  {
+    Write8(0x70 + conditionCode);
+    Write8((u8)(s8)distance);
+    return;
+  }
+
+  // Recompute relative to the end of the 6-byte form.
+  distance = (s64)(fn - ((u64)code + 6));
+  ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL,
+             "Jump target too far away, needs indirect register");
+  Write8(0x0F);
+  Write8(0x80 + conditionCode);
+  Write32((u32)(s32)distance);
+}
+
+// Resolves a branch created by J(), J_CC() or CALL() so that it lands on the current write
+// position. branch.ptr points just past the branch instruction, so the displacement is stored
+// in the byte(s) immediately before it.
+void XEmitter::SetJumpTarget(const FixupBranch& branch)
+{
+  if (branch.type == FixupBranch::Type::Branch8Bit)
+  {
+    s64 distance = (s64)(code - branch.ptr);
+    if (!(distance >= -0x80 && distance < 0x80))
+    {
+      // Reported explicitly as well, in case ASSERT_MSG is compiled out of this build.
+      printf("SetJumpTarget: 8-bit branch displacement %lld out of range, needs force5Bytes = true\n",
+             (long long)distance);
+    }
+    ASSERT_MSG(DYNA_REC, distance >= -0x80 && distance < 0x80,
+               "Jump target too far away, needs force5Bytes = true");
+    branch.ptr[-1] = (u8)(s8)distance;
+  }
+  else if (branch.type == FixupBranch::Type::Branch32Bit)
+  {
+    s64 distance = (s64)(code - branch.ptr);
+    ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL,
+               "Jump target too far away, needs indirect register");
+
+    // memcpy because the displacement slot is not necessarily aligned.
+    s32 valid_distance = static_cast<s32>(distance);
+    std::memcpy(&branch.ptr[-4], &valid_distance, sizeof(s32));
+  }
+}
+
+// Single byte opcodes
+// There is no PUSHAD/POPAD in 64-bit mode.
+
+// Breakpoint instruction (0xCC).
+void XEmitter::INT3()
+{
+  Write8(0xCC);
+}
+// Near return (0xC3).
+void XEmitter::RET()
+{
+  Write8(0xC3);
+}
+// Near return preceded by a 0xF3 prefix ("rep ret").
+void XEmitter::RET_FAST()
+{
+  Write8(0xF3);
+  Write8(0xC3);
+}  // two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to
+   // a ret
+
+// The first sign of decadence: optimized NOPs.
+// Emits exactly size bytes of padding, using as few multi-byte NOP instructions as possible.
+void XEmitter::NOP(size_t size)
+{
+  DEBUG_ASSERT((int)size > 0);
+
+  // nop_encodings[n - 1] holds the n-byte NOP; the unused tail of each row is zero padding.
+  // Even though x86 instructions are allowed to be up to 15 bytes long,
+  // AMD advises against using NOPs longer than 11 bytes because they
+  // carry a performance penalty on CPUs older than AMD family 16h.
+  constexpr size_t longest_nop = 11;
+  static const u8 nop_encodings[longest_nop][longest_nop] = {
+      {0x90},
+      {0x66, 0x90},
+      {0x0F, 0x1F, 0x00},
+      {0x0F, 0x1F, 0x40, 0x00},
+      {0x0F, 0x1F, 0x44, 0x00, 0x00},
+      {0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
+      {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
+      {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+      {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+      {0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+      {0x66, 0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+  };
+
+  // Consume the request in chunks of the longest encoding first...
+  while (size >= longest_nop)
+  {
+    for (size_t i = 0; i < longest_nop; i++)
+      Write8(nop_encodings[longest_nop - 1][i]);
+    size -= longest_nop;
+  }
+
+  // ...then emit one instruction of exactly the remaining length (nothing when it is zero).
+  for (size_t i = 0; i < size; i++)
+    Write8(nop_encodings[size - 1][i]);
+}
+
+// Emits a 0xF3 prefix followed by a NOP() with its default size.
+void XEmitter::PAUSE()
+{
+  Write8(0xF3);
+  NOP();
+}  // use in tight spinloops for energy saving on some CPU
+
+// The three carry-flag instructions below modify the flags, so each one first checks that the
+// flags are not locked.
+void XEmitter::CLC()
+{
+  CheckFlags();
+  Write8(0xF8);
+}  // clear carry
+void XEmitter::CMC()
+{
+  CheckFlags();
+  Write8(0xF5);
+}  // flip carry
+void XEmitter::STC()
+{
+  CheckFlags();
+  Write8(0xF9);
+}  // set carry
+
+// TODO: xchg ah, al ???
+// Emits the two bytes 0x86 0xE0: the 8-bit XCHG opcode with a register-direct ModRM byte
+// (reg field 4, rm field 0), i.e. an exchange of AH and AL.
+void XEmitter::XCHG_AHAL()
+{
+  Write8(0x86);
+  Write8(0xe0);
+  // alt. 86 c4
+}
+
+// These two can not be executed on early Intel 64-bit CPU:s, only on AMD!
+// LAHF (0x9F) only reads the flags, so no flag-lock check is needed.
+void XEmitter::LAHF()
+{
+  Write8(0x9F);
+}
+// SAHF (0x9E) writes the flags, so it checks that they are not locked first.
+void XEmitter::SAHF()
+{
+  CheckFlags();
+  Write8(0x9E);
+}
+
+// PUSHF (0x9C) only reads the flags.
+void XEmitter::PUSHF()
+{
+  Write8(0x9C);
+}
+// POPF (0x9D) overwrites the flags, so it checks that they are not locked first.
+void XEmitter::POPF()
+{
+  CheckFlags();
+  Write8(0x9D);
+}
+
+// Memory fences: all three share the 0x0F 0xAE opcode and differ only in the final byte.
+
+// Load fence (0F AE E8).
+void XEmitter::LFENCE()
+{
+  Write8(0x0F);
+  Write8(0xAE);
+  Write8(0xE8);
+}
+// Full memory fence (0F AE F0).
+void XEmitter::MFENCE()
+{
+  Write8(0x0F);
+  Write8(0xAE);
+  Write8(0xF0);
+}
+// Store fence (0F AE F8).
+void XEmitter::SFENCE()
+{
+  Write8(0x0F);
+  Write8(0xAE);
+  Write8(0xF8);
+}
+
+// Emits an instruction whose register is encoded in the low three bits of a one-byte opcode:
+// optional 0x66 prefix for 16-bit, optional REX (W for 64-bit, B for R8-R15), then byte + reg.
+void XEmitter::WriteSimple1Byte(int bits, u8 byte, X64Reg reg)
+{
+  const int reg_index = (int)reg;
+  if (bits == 16)
+    Write8(0x66);
+  Rex(bits == 64, 0, 0, reg_index >> 3);
+  Write8(byte + (reg_index & 7));
+}
+
+// Same as WriteSimple1Byte, but for two-byte opcodes: the register is added to the second
+// opcode byte.
+void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg)
+{
+  const int reg_index = (int)reg;
+  if (bits == 16)
+    Write8(0x66);
+  Rex(bits == 64, 0, 0, reg_index >> 3);
+  Write8(byte1);
+  Write8(byte2 + (reg_index & 7));
+}
+
+// Emits opcode 0x99 (the CWD/CDQ/CQO family): a 0x66 prefix for bits == 16, REX.W for
+// bits == 64, no prefix otherwise.
+void XEmitter::CWD(int bits)
+{
+  if (bits == 16)
+    Write8(0x66);
+  Rex(bits == 64, 0, 0, 0);
+  Write8(0x99);
+}
+
+// Emits opcode 0x98 (the CBW/CWDE/CDQE family). Unlike CWD, the prefixes are keyed on 8 and
+// 32: bits == 8 gets the 0x66 prefix and bits == 32 gets REX.W, so bits presumably names the
+// source width being extended - confirm against callers.
+void XEmitter::CBW(int bits)
+{
+  if (bits == 8)
+    Write8(0x66);
+  Rex(bits == 32, 0, 0, 0);
+  Write8(0x98);
+}
+
+// Simple opcodes
+
+// push/pop do not need wide to be 64-bit
+// Both pass 32 as the width so that no REX.W or 0x66 prefix is emitted; only REX.B is added
+// for R8-R15.
+void XEmitter::PUSH(X64Reg reg)
+{
+  WriteSimple1Byte(32, 0x50, reg);
+}
+void XEmitter::POP(X64Reg reg)
+{
+  WriteSimple1Byte(32, 0x58, reg);
+}
+
+// Emits a PUSH of an arbitrary operand: a plain register, an 8/16/32-bit immediate, or a
+// memory operand (0xFF with 6 in the ModRM reg field). bits only matters for memory operands.
+void XEmitter::PUSH(int bits, const OpArg& reg)
+{
+  if (reg.IsSimpleReg())
+  {
+    PUSH(reg.GetSimpleReg());
+    return;
+  }
+
+  if (!reg.IsImm())
+  {
+    // Memory operand.
+    if (bits == 16)
+      Write8(0x66);
+    reg.WriteREX(this, bits, bits);
+    Write8(0xFF);
+    reg.WriteRest(this, 0, (X64Reg)6);
+    return;
+  }
+
+  // Immediate operand: the encoding depends on the immediate's own width.
+  const auto imm_bits = reg.GetImmBits();
+  if (imm_bits == 8)
+  {
+    Write8(0x6A);
+    Write8((u8)(s8)reg.offset);
+  }
+  else if (imm_bits == 16)
+  {
+    Write8(0x66);
+    Write8(0x68);
+    Write16((u16)(s16)(s32)reg.offset);
+  }
+  else if (imm_bits == 32)
+  {
+    Write8(0x68);
+    Write32((u32)reg.offset);
+  }
+  else
+  {
+    ASSERT_MSG(DYNA_REC, 0, "PUSH - Bad imm bits");
+  }
+}
+
+// Emits a POP into a plain register operand. Any other operand kind is unsupported and trips
+// an assertion; the bits argument is ignored.
+void XEmitter::POP(int /*bits*/, const OpArg& reg)
+{
+  if (reg.IsSimpleReg())
+    POP(reg.GetSimpleReg());
+  else
+    ASSERT_MSG(DYNA_REC, 0, "POP - Unsupported encoding");
+}
+
+// Emits a byte swap of reg at the given width: a real BSWAP for 32 bits and up, a rotate by 8
+// for 16 bits, nothing for 8 bits. Any other width trips an assertion.
+void XEmitter::BSWAP(int bits, X64Reg reg)
+{
+  if (bits == 8)
+  {
+    // Do nothing - can't bswap a single byte...
+    return;
+  }
+
+  if (bits == 16)
+  {
+    // Rotating a 16-bit value by 8 exchanges its two bytes.
+    ROL(16, R(reg), Imm8(8));
+    return;
+  }
+
+  if (bits >= 32)
+  {
+    WriteSimple2Byte(bits, 0x0F, 0xC8, reg);
+    return;
+  }
+
+  ASSERT_MSG(DYNA_REC, 0, "BSWAP - Wrong number of bits");
+}
+
+// Undefined opcode - reserved
+// If we ever need a way to always cause a non-breakpoint hard exception...
+// Emits the two-byte sequence 0x0F 0x0B.
+void XEmitter::UD2()
+{
+  Write8(0x0F);
+  Write8(0x0B);
+}
+
+// Emits a prefetch (0x0F 0x18) of the memory operand arg; level is placed in the ModRM reg
+// field. Immediate operands are rejected.
+void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg)
+{
+  ASSERT_MSG(DYNA_REC, !arg.IsImm(), "PREFETCH - Imm argument");
+  arg.operandReg = (u8)level;
+  arg.WriteREX(this, 0, 0);
+  Write8(0x0F);
+  Write8(0x18);
+  arg.WriteRest(this);
+}
+
+// Emits SETcc (0x0F 0x90+cc) into the byte-sized operand dest. Immediate operands are
+// rejected.
+void XEmitter::SETcc(CCFlags flag, OpArg dest)
+{
+  ASSERT_MSG(DYNA_REC, !dest.IsImm(), "SETcc - Imm argument");
+  dest.operandReg = 0;
+  // Operand width 8 so that a REX prefix is emitted for SPL/BPL/SIL/DIL.
+  dest.WriteREX(this, 0, 8);
+  Write8(0x0F);
+  Write8(0x90 + (u8)flag);
+  dest.WriteRest(this);
+}
+
+// Emits CMOVcc (0x0F 0x40+cc): dest = src when the condition holds. 8-bit operation and
+// immediate sources are rejected.
+void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag)
+{
+  ASSERT_MSG(DYNA_REC, !src.IsImm(), "CMOVcc - Imm argument");
+  ASSERT_MSG(DYNA_REC, bits != 8, "CMOVcc - 8 bits unsupported");
+  if (bits == 16)
+    Write8(0x66);
+  src.operandReg = dest;
+  src.WriteREX(this, bits, bits);
+  Write8(0x0F);
+  Write8(0x40 + (u8)flag);
+  src.WriteRest(this);
+}
+
+// Shared encoder for the one-operand 0xF6/0xF7 group (MUL, DIV, IMUL, IDIV, NEG, NOT): ext is
+// the ModRM /digit selecting the instruction. Immediate operands are rejected and the flags
+// must not be locked.
+void XEmitter::WriteMulDivType(int bits, OpArg src, int ext)
+{
+  ASSERT_MSG(DYNA_REC, !src.IsImm(), "WriteMulDivType - Imm argument");
+  CheckFlags();
+  src.operandReg = ext;
+  if (bits == 16)
+    Write8(0x66);
+  src.WriteREX(this, bits, bits, 0);
+  // 0xF6 is the byte-sized form, 0xF7 covers every wider size.
+  const u8 opcode = (bits == 8) ? 0xF6 : 0xF7;
+  Write8(opcode);
+  src.WriteRest(this);
+}
+
+// One-operand arithmetic instructions; each forwards to WriteMulDivType with the ModRM /digit
+// that selects it within the 0xF6/0xF7 opcode group.
+void XEmitter::MUL(int bits, const OpArg& src)
+{
+  WriteMulDivType(bits, src, 4);
+}
+void XEmitter::DIV(int bits, const OpArg& src)
+{
+  WriteMulDivType(bits, src, 6);
+}
+void XEmitter::IMUL(int bits, const OpArg& src)
+{
+  WriteMulDivType(bits, src, 5);
+}
+void XEmitter::IDIV(int bits, const OpArg& src)
+{
+  WriteMulDivType(bits, src, 7);
+}
+void XEmitter::NEG(int bits, const OpArg& src)
+{
+  WriteMulDivType(bits, src, 3);
+}
+void XEmitter::NOT(int bits, const OpArg& src)
+{
+  WriteMulDivType(bits, src, 2);
+}
+
+// Shared encoder for two-byte "0x0F byte2" instructions taking a register and an r/m operand
+// (BSF, BSR, TZCNT, LZCNT, MOVNTI). dest goes into the ModRM reg field; rep adds a 0xF3
+// prefix. Immediate operands are rejected and the flags must not be locked.
+void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep)
+{
+  ASSERT_MSG(DYNA_REC, !src.IsImm(), "WriteBitSearchType - Imm argument");
+  CheckFlags();
+  src.operandReg = (u8)dest;
+  // Prefix order: operand-size override, then 0xF3, then REX, then the opcode.
+  if (bits == 16)
+    Write8(0x66);
+  if (rep)
+    Write8(0xF3);
+  src.WriteREX(this, bits, bits);
+  Write8(0x0F);
+  Write8(byte2);
+  src.WriteRest(this);
+}
+
+// Emits MOVNTI (0x0F 0xC3) storing register src to dest. Reuses WriteBitSearchType with the
+// source register in the reg field and dest as the r/m operand, so the flag-lock check in that
+// helper applies here too. Widths of 16 bits or less are rejected.
+void XEmitter::MOVNTI(int bits, const OpArg& dest, X64Reg src)
+{
+  if (bits <= 16)
+    ASSERT_MSG(DYNA_REC, 0, "MOVNTI - bits<=16");
+  WriteBitSearchType(bits, src, dest, 0xC3);
+}
+
+// Bit scans: 0x0F 0xBC (BSF) and 0x0F 0xBD (BSR).
+void XEmitter::BSF(int bits, X64Reg dest, const OpArg& src)
+{
+  WriteBitSearchType(bits, dest, src, 0xBC);
+}  // Bottom bit to top bit
+void XEmitter::BSR(int bits, X64Reg dest, const OpArg& src)
+{
+  WriteBitSearchType(bits, dest, src, 0xBD);
+}  // Top bit to bottom bit
+
+// TZCNT and LZCNT use the BSF/BSR opcodes with a 0xF3 prefix. Each raises a PanicAlert when
+// the detected CPU lacks the required feature flag, and then emits the instruction.
+void XEmitter::TZCNT(int bits, X64Reg dest, const OpArg& src)
+{
+  CheckFlags();
+  if (!cpu_info.bBMI1)
+    PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer.");
+  WriteBitSearchType(bits, dest, src, 0xBC, true);
+}
+void XEmitter::LZCNT(int bits, X64Reg dest, const OpArg& src)
+{
+  CheckFlags();
+  if (!cpu_info.bLZCNT)
+    PanicAlert("Trying to use LZCNT on a system that doesn't support it. Bad programmer.");
+  WriteBitSearchType(bits, dest, src, 0xBD, true);
+}
+
+// Sign-extending move of an `sbits`-wide source into a `dbits`-wide register.
+// Supported sources: 8-bit (0x0F 0xBE), 16-bit (0x0F 0xBF) and 32-bit into a
+// 64-bit destination (0x63). Equal widths fall back to a plain MOV; any other
+// combination calls Crash(). `src` must not be an immediate.
+void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src)
+{
+  ASSERT_MSG(DYNA_REC, !src.IsImm(), "MOVSX - Imm argument");
+
+  // Nothing to extend when both widths match.
+  if (dbits == sbits)
+  {
+    MOV(dbits, R(dest), src);
+    return;
+  }
+
+  src.operandReg = (u8)dest;
+  if (dbits == 16)
+    Write8(0x66);
+  src.WriteREX(this, dbits, sbits);
+
+  switch (sbits)
+  {
+  case 8:
+    Write8(0x0F);
+    Write8(0xBE);
+    break;
+  case 16:
+    Write8(0x0F);
+    Write8(0xBF);
+    break;
+  case 32:
+    // A 32-bit source is only accepted together with a 64-bit destination.
+    if (dbits == 64)
+      Write8(0x63);
+    else
+      Crash();
+    break;
+  default:
+    Crash();
+    break;
+  }
+  src.WriteRest(this);
+}
+
+// Zero-extending move of an `sbits`-wide source into a `dbits`-wide register.
+// Supported sources: 8-bit (0x0F 0xB6), 16-bit (0x0F 0xB7) and 32-bit into a
+// 64-bit destination (plain 0x8B load). Equal widths fall back to MOV; any
+// other combination asserts. `src` must not be an immediate.
+void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
+{
+  ASSERT_MSG(DYNA_REC, !src.IsImm(), "MOVZX - Imm argument");
+
+  // Nothing to extend when both widths match.
+  if (dbits == sbits)
+  {
+    MOV(dbits, R(dest), src);
+    return;
+  }
+
+  src.operandReg = (u8)dest;
+  if (dbits == 16)
+    Write8(0x66);
+
+  // the 32bit result is automatically zero extended to 64bit
+  const int rex_dest_bits = (dbits == 64) ? 32 : dbits;
+  src.WriteREX(this, rex_dest_bits, sbits);
+
+  const bool from_byte = (sbits == 8);
+  const bool from_word = (sbits == 16);
+  if (from_byte || from_word)
+  {
+    Write8(0x0F);
+    Write8(from_byte ? 0xB6 : 0xB7);
+  }
+  else if (sbits == 32 && dbits == 64)
+  {
+    Write8(0x8B);
+  }
+  else
+  {
+    ASSERT_MSG(DYNA_REC, 0, "MOVZX - Invalid size");
+  }
+  src.WriteRest(this);
+}
+
+// Shared encoder for MOVBE (0x0F 0x38 `op`). Bit 0 of `op` gives the
+// direction: clear loads `reg` from `arg`, set stores `reg` to `arg`.
+// An 8-bit request is emitted as an ordinary MOV in that direction.
+// `arg` must be a memory operand for the wider forms.
+void XEmitter::WriteMOVBE(int bits, u8 op, X64Reg reg, const OpArg& arg)
+{
+  ASSERT_MSG(DYNA_REC, cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it.");
+
+  if (bits == 8)
+  {
+    const bool is_store = (op & 1) != 0;
+    if (is_store)
+      MOV(8, arg, R(reg));
+    else
+      MOV(8, R(reg), arg);
+    return;
+  }
+
+  if (bits == 16)
+    Write8(0x66);
+  ASSERT_MSG(DYNA_REC, !arg.IsSimpleReg() && !arg.IsImm(), "MOVBE: need r<-m or m<-r!");
+  arg.WriteREX(this, bits, bits, reg);
+
+  // Three-byte opcode: 0x0F 0x38 followed by the direction byte.
+  Write8(0x0F);
+  Write8(0x38);
+  Write8(op);
+  arg.WriteRest(this, 0, reg);
+}
+// MOVBE load: register `dest` <- `src` (opcode byte 0xF0, direction bit clear).
+void XEmitter::MOVBE(int bits, X64Reg dest, const OpArg& src)
+{
+  WriteMOVBE(bits, 0xF0, dest, src);
+}
+// MOVBE store: `dest` <- register `src` (opcode byte 0xF1, direction bit set).
+void XEmitter::MOVBE(int bits, const OpArg& dest, X64Reg src)
+{
+  WriteMOVBE(bits, 0xF1, src, dest);
+}
+
+// Loads `size` bits from `src` into `dst` with the bytes swapped.
+//   size        - 8, 16, 32 or 64; any other value emits nothing.
+//   sign_extend - only consulted for the 8- and 16-bit cases.
+//   info        - optional; receives the code address at which the load
+//                 sequence starts and has nonAtomicSwapStore cleared.
+void XEmitter::LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend, MovInfo* info)
+{
+  if (info != nullptr)
+  {
+    info->address = GetWritableCodePtr();
+    info->nonAtomicSwapStore = false;
+  }
+
+  if (size == 8)
+  {
+    // A single byte has nothing to swap; just extend it to 32 bits.
+    if (sign_extend)
+      MOVSX(32, 8, dst, src);
+    else
+      MOVZX(32, 8, dst, src);
+  }
+  else if (size == 16)
+  {
+    MOVZX(32, 16, dst, src);
+    if (sign_extend)
+    {
+      // Swap all four bytes, then arithmetic-shift the halfword back down.
+      BSWAP(32, dst);
+      SAR(32, R(dst), Imm8(16));
+    }
+    else
+    {
+      // Rotating the low 16 bits by 8 exchanges their two bytes.
+      ROL(16, R(dst), Imm8(8));
+    }
+  }
+  else if (size == 32 || size == 64)
+  {
+    if (cpu_info.bMOVBE)
+    {
+      MOVBE(size, dst, src);
+    }
+    else
+    {
+      MOV(size, R(dst), src);
+      BSWAP(size, dst);
+    }
+  }
+}
+
+// Stores register `src` to `dst` with the bytes swapped. With MOVBE available
+// a single MOVBE is emitted; otherwise `src` is byte-swapped in place (it is
+// left swapped) and then stored with MOV.
+//   info - optional; receives the address of the store instruction itself and
+//          whether the BSWAP+MOV pair was used (plus the swapped register).
+void XEmitter::SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info)
+{
+  const bool have_movbe = cpu_info.bMOVBE;
+
+  // Without MOVBE the swap happens first, so that the recorded address below
+  // points at the MOV and not at the BSWAP.
+  if (!have_movbe)
+    BSWAP(size, src);
+
+  if (info != nullptr)
+  {
+    info->address = GetWritableCodePtr();
+    if (have_movbe)
+    {
+      info->nonAtomicSwapStore = false;
+    }
+    else
+    {
+      info->nonAtomicSwapStore = true;
+      info->nonAtomicSwapStoreSrc = src;
+    }
+  }
+
+  if (have_movbe)
+    MOVBE(size, dst, src);
+  else
+    MOV(size, dst, R(src));
+}
+
+// LEA dest, src (opcode 0x8D). `src` must not be an immediate; `dest` is
+// carried in the operand-register slot of the by-value copy of `src`.
+void XEmitter::LEA(int bits, X64Reg dest, OpArg src)
+{
+  ASSERT_MSG(DYNA_REC, !src.IsImm(), "LEA - Imm argument");
+  src.operandReg = (u8)dest;
+  if (bits == 16)
+    Write8(0x66);  // TODO: performance warning
+  src.WriteREX(this, bits, bits);
+  Write8(0x8D);
+  // The last argument is true only for 64-bit LEA; its meaning is defined by
+  // OpArg::WriteRest, which is not shown here.
+  src.WriteRest(this, 0, INVALID_REG, bits == 64);
+}
+
+// Shared encoder for the rotate/shift group; `ext` selects the operation.
+// shift can be either imm8 or cl
+// Opcode choice: count of exactly 1 -> 0xD0/0xD1, other imm8 -> 0xC0/0xC1
+// (followed by the count byte), CL -> 0xD2/0xD3. The first opcode of each
+// pair is the 8-bit form.
+void XEmitter::WriteShift(int bits, OpArg dest, const OpArg& shift, int ext)
+{
+  CheckFlags();
+  if (dest.IsImm())
+  {
+    ASSERT_MSG(DYNA_REC, 0, "WriteShift - can't shift imms");
+  }
+  if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) ||
+      (shift.IsImm() && shift.GetImmBits() != 8))
+  {
+    ASSERT_MSG(DYNA_REC, 0, "WriteShift - illegal argument");
+  }
+
+  dest.operandReg = ext;
+  if (bits == 16)
+    Write8(0x66);
+  dest.WriteREX(this, bits, bits, 0);
+
+  const bool byte_sized = (bits == 8);
+  bool emit_count = false;
+  u8 opcode;
+  if (shift.GetImmBits() != 8)
+  {
+    // Count comes from CL.
+    opcode = byte_sized ? 0xD2 : 0xD3;
+  }
+  else if ((u8)shift.offset == 1)
+  {
+    // Shift-by-one has its own, shorter encoding without a count byte.
+    opcode = byte_sized ? 0xD0 : 0xD1;
+  }
+  else
+  {
+    emit_count = true;
+    opcode = byte_sized ? 0xC0 : 0xC1;
+  }
+  Write8(opcode);
+
+  dest.WriteRest(this, emit_count ? 1 : 0);
+  if (emit_count)
+    Write8((u8)shift.offset);
+}
+
+// Rotate/shift wrappers around WriteShift(); the last argument is the opcode
+// extension selecting the operation. `shift` must be an 8-bit immediate or CL.
+// large rotates and shift are slower on Intel than AMD
+// Intel likes to rotate by 1, and the op is smaller too
+// ROL: extension 0.
+void XEmitter::ROL(int bits, const OpArg& dest, const OpArg& shift)
+{
+  WriteShift(bits, dest, shift, 0);
+}
+// ROR: extension 1 (the method carries a trailing underscore in its name).
+void XEmitter::ROR_(int bits, const OpArg& dest, const OpArg& shift)
+{
+  WriteShift(bits, dest, shift, 1);
+}
+// RCL: extension 2.
+void XEmitter::RCL(int bits, const OpArg& dest, const OpArg& shift)
+{
+  WriteShift(bits, dest, shift, 2);
+}
+// RCR: extension 3.
+void XEmitter::RCR(int bits, const OpArg& dest, const OpArg& shift)
+{
+  WriteShift(bits, dest, shift, 3);
+}
+// SHL: extension 4.
+void XEmitter::SHL(int bits, const OpArg& dest, const OpArg& shift)
+{
+  WriteShift(bits, dest, shift, 4);
+}
+// SHR: extension 5.
+void XEmitter::SHR(int bits, const OpArg& dest, const OpArg& shift)
+{
+  WriteShift(bits, dest, shift, 5);
+}
+// SAR: extension 7.
+void XEmitter::SAR(int bits, const OpArg& dest, const OpArg& shift)
+{
+  WriteShift(bits, dest, shift, 7);
+}
+
+// Shared encoder for BT/BTS/BTR/BTC.
+// index can be either imm8 or register, don't use memory destination because it's slow
+//   bits  - operand width; 16 adds the 0x66 prefix.
+//   dest  - operand whose bit is tested; must not be an immediate.
+//   index - bit number: an 8-bit immediate or a simple register.
+//   ext   - 4..7, selecting BT/BTS/BTR/BTC. With an immediate index it is the
+//           opcode extension of 0x0F 0xBA; with a register index it selects
+//           the opcode 0x0F (0x83 + 8 * ext).
+void XEmitter::WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext)
+{
+  CheckFlags();
+  if (dest.IsImm())
+  {
+    ASSERT_MSG(DYNA_REC, 0, "WriteBitTest - can't test imms");
+  }
+  if ((index.IsImm() && index.GetImmBits() != 8))
+  {
+    ASSERT_MSG(DYNA_REC, 0, "WriteBitTest - illegal argument");
+  }
+  if (bits == 16)
+    Write8(0x66);
+  if (index.IsImm())
+  {
+    dest.WriteREX(this, bits, bits);
+    Write8(0x0F);
+    Write8(0xBA);
+    // One immediate byte follows the operand encoding.
+    dest.WriteRest(this, 1, (X64Reg)ext);
+    Write8((u8)index.offset);
+  }
+  else
+  {
+    X64Reg operand = index.GetSimpleReg();
+    dest.WriteREX(this, bits, bits, operand);
+    Write8(0x0F);
+    Write8(0x83 + 8 * ext);
+    // The register-index form carries no immediate, so nothing trails the
+    // operand encoding. This matches the CL forms of SHRD/SHLD, which pass 0.
+    dest.WriteRest(this, 0, operand);
+  }
+}
+
+// Bit-test wrappers around WriteBitTest(); the last argument selects the
+// operation (4 = BT, 5 = BTS, 6 = BTR, 7 = BTC).
+void XEmitter::BT(int bits, const OpArg& dest, const OpArg& index)
+{
+  WriteBitTest(bits, dest, index, 4);
+}
+void XEmitter::BTS(int bits, const OpArg& dest, const OpArg& index)
+{
+  WriteBitTest(bits, dest, index, 5);
+}
+void XEmitter::BTR(int bits, const OpArg& dest, const OpArg& index)
+{
+  WriteBitTest(bits, dest, index, 6);
+}
+void XEmitter::BTC(int bits, const OpArg& dest, const OpArg& index)
+{
+  WriteBitTest(bits, dest, index, 7);
+}
+
+// SHRD dest, src, shift.
+// shift can be either imm8 or cl
+// `dest` must not be an immediate and `src` must be a simple register.
+// Encoded as 0x0F 0xAC + imm8 for an immediate count, 0x0F 0xAD for CL.
+void XEmitter::SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift)
+{
+  CheckFlags();
+  if (dest.IsImm())
+    ASSERT_MSG(DYNA_REC, 0, "SHRD - can't use imms as destination");
+  if (!src.IsSimpleReg())
+    ASSERT_MSG(DYNA_REC, 0, "SHRD - must use simple register as source");
+  if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) ||
+      (shift.IsImm() && shift.GetImmBits() != 8))
+    ASSERT_MSG(DYNA_REC, 0, "SHRD - illegal shift");
+
+  if (bits == 16)
+    Write8(0x66);
+  X64Reg source_reg = src.GetSimpleReg();
+  dest.WriteREX(this, bits, bits, source_reg);
+
+  const bool count_is_imm8 = (shift.GetImmBits() == 8);
+  Write8(0x0F);
+  Write8(count_is_imm8 ? 0xAC : 0xAD);
+  // Only the immediate form is followed by an extra (count) byte.
+  dest.WriteRest(this, count_is_imm8 ? 1 : 0, source_reg);
+  if (count_is_imm8)
+    Write8((u8)shift.offset);
+}
+
+// SHLD dest, src, shift. `shift` is either an 8-bit immediate or CL, `dest`
+// must not be an immediate and `src` must be a simple register.
+// Encoded as 0x0F 0xA4 + imm8 for an immediate count, 0x0F 0xA5 for CL.
+void XEmitter::SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift)
+{
+  CheckFlags();
+  if (dest.IsImm())
+    ASSERT_MSG(DYNA_REC, 0, "SHLD - can't use imms as destination");
+  if (!src.IsSimpleReg())
+    ASSERT_MSG(DYNA_REC, 0, "SHLD - must use simple register as source");
+  if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) ||
+      (shift.IsImm() && shift.GetImmBits() != 8))
+    ASSERT_MSG(DYNA_REC, 0, "SHLD - illegal shift");
+
+  if (bits == 16)
+    Write8(0x66);
+  X64Reg source_reg = src.GetSimpleReg();
+  dest.WriteREX(this, bits, bits, source_reg);
+
+  const bool count_is_imm8 = (shift.GetImmBits() == 8);
+  Write8(0x0F);
+  Write8(count_is_imm8 ? 0xA4 : 0xA5);
+  // Only the immediate form is followed by an extra (count) byte.
+  dest.WriteRest(this, count_is_imm8 ? 1 : 0, source_reg);
+  if (count_is_imm8)
+    Write8((u8)shift.offset);
+}
+
+// Emits a one-byte-opcode instruction with this operand as r/m and
+// `_operandReg` in the register field. Writes, in order: the 0x66 prefix for
+// 16-bit operands, REX, `op`, then the operand encoding.
+// Note: mutates this->operandReg.
+void OpArg::WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg _operandReg, int bits)
+{
+  if (bits == 16)
+    emit->Write8(0x66);
+
+  this->operandReg = (u8)_operandReg;
+  WriteREX(emit, bits, bits);
+  emit->Write8(op);
+  WriteRest(emit);
+}
+
+// Emits one two-operand integer instruction described by normalops[op], with
+// this OpArg as the r/m side.
+// operand can either be immediate or register
+//   emit    - emitter receiving the bytes.
+//   toRM    - true for "op r/m, reg" / "op r/m, imm" (this is the destination),
+//             false for "op reg, r/m". Must be true when `operand` is an
+//             immediate.
+//   op      - index into the normalops table.
+//   operand - the other operand: an immediate or a register.
+//   bits    - operand width (8, 16, 32 or 64).
+// Several short encodings return early, before the generic WriteRest() path.
+// NOTE(review): the table entries are compared against 0xCC before use, which
+// looks like the marker for "this encoding does not exist" - confirm against
+// the normalops definition.
+void OpArg::WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand,
+                          int bits) const
+{
+  X64Reg _operandReg;
+  if (IsImm())
+  {
+    ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Imm argument, wrong order");
+  }
+
+  if (bits == 16)
+    emit->Write8(0x66);
+
+  // Width in bits of the immediate emitted after the operand encoding (0 = none).
+  int immToWrite = 0;
+  const NormalOpDef& op_def = normalops[static_cast<int>(op)];
+
+  if (operand.IsImm())
+  {
+    WriteREX(emit, bits, bits);
+
+    if (!toRM)
+    {
+      ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Writing to Imm (!toRM)");
+    }
+
+    if (operand.scale == SCALE_IMM8 && bits == 8)
+    {
+      // op al, imm8
+      if (!scale && offsetOrBaseReg == AL && op_def.eaximm8 != 0xCC)
+      {
+        emit->Write8(op_def.eaximm8);
+        emit->Write8((u8)operand.offset);
+        return;
+      }
+      // mov reg, imm8
+      if (!scale && op == NormalOp::MOV)
+      {
+        emit->Write8(0xB0 + (offsetOrBaseReg & 7));
+        emit->Write8((u8)operand.offset);
+        return;
+      }
+      // op r/m8, imm8
+      emit->Write8(op_def.imm8);
+      immToWrite = 8;
+    }
+    else if ((operand.scale == SCALE_IMM16 && bits == 16) ||
+             (operand.scale == SCALE_IMM32 && bits == 32) ||
+             (operand.scale == SCALE_IMM32 && bits == 64))
+    {
+      // Try to save immediate size if we can, but first check to see
+      // if the instruction supports simm8.
+      // op r/m, imm8
+      if (op_def.simm8 != 0xCC &&
+          ((operand.scale == SCALE_IMM16 && (s16)operand.offset == (s8)operand.offset) ||
+           (operand.scale == SCALE_IMM32 && (s32)operand.offset == (s8)operand.offset)))
+      {
+        emit->Write8(op_def.simm8);
+        immToWrite = 8;
+      }
+      else
+      {
+        // mov reg, imm
+        if (!scale && op == NormalOp::MOV && bits != 64)
+        {
+          emit->Write8(0xB8 + (offsetOrBaseReg & 7));
+          if (bits == 16)
+            emit->Write16((u16)operand.offset);
+          else
+            emit->Write32((u32)operand.offset);
+          return;
+        }
+        // op eax, imm
+        if (!scale && offsetOrBaseReg == EAX && op_def.eaximm32 != 0xCC)
+        {
+          emit->Write8(op_def.eaximm32);
+          if (bits == 16)
+            emit->Write16((u16)operand.offset);
+          else
+            emit->Write32((u32)operand.offset);
+          return;
+        }
+        // op r/m, imm
+        emit->Write8(op_def.imm32);
+        immToWrite = bits == 16 ? 16 : 32;
+      }
+    }
+    else if ((operand.scale == SCALE_IMM8 && bits == 16) ||
+             (operand.scale == SCALE_IMM8 && bits == 32) ||
+             (operand.scale == SCALE_IMM8 && bits == 64))
+    {
+      // op r/m, imm8
+      // NOTE(review): op_def.simm8 is not checked against 0xCC on this path.
+      emit->Write8(op_def.simm8);
+      immToWrite = 8;
+    }
+    else if (operand.scale == SCALE_IMM64 && bits == 64)
+    {
+      if (scale)
+      {
+        ASSERT_MSG(DYNA_REC, 0,
+                   "WriteNormalOp - MOV with 64-bit imm requires register destination");
+      }
+      // mov reg64, imm64
+      else if (op == NormalOp::MOV)
+      {
+        // movabs reg64, imm64 (10 bytes)
+        if (static_cast<s64>(operand.offset) != static_cast<s32>(operand.offset))
+        {
+          emit->Write8(0xB8 + (offsetOrBaseReg & 7));
+          emit->Write64(operand.offset);
+          return;
+        }
+        // mov reg64, simm32 (7 bytes)
+        emit->Write8(op_def.imm32);
+        immToWrite = 32;
+      }
+      else
+      {
+        ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Only MOV can take 64-bit imm");
+      }
+    }
+    else
+    {
+      ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Unhandled case %d %d", operand.scale, bits);
+    }
+
+    // pass extension in REG of ModRM
+    _operandReg = static_cast<X64Reg>(op_def.ext);
+  }
+  else
+  {
+    // Register operand: it goes into the register field, this OpArg is r/m.
+    _operandReg = (X64Reg)operand.offsetOrBaseReg;
+    WriteREX(emit, bits, bits, _operandReg);
+    // op r/m, reg
+    if (toRM)
+    {
+      emit->Write8(bits == 8 ? op_def.toRm8 : op_def.toRm32);
+    }
+    // op reg, r/m
+    else
+    {
+      emit->Write8(bits == 8 ? op_def.fromRm8 : op_def.fromRm32);
+    }
+  }
+  // immToWrite >> 3 converts the pending immediate width from bits to bytes.
+  WriteRest(emit, immToWrite >> 3, _operandReg);
+  switch (immToWrite)
+  {
+  case 0:
+    break;
+  case 8:
+    emit->Write8((u8)operand.offset);
+    break;
+  case 16:
+    emit->Write16((u16)operand.offset);
+    break;
+  case 32:
+    emit->Write32((u32)operand.offset);
+    break;
+  default:
+    ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Unhandled case");
+  }
+}
+
+// Dispatches "op a1, a2" to OpArg::WriteNormalOp with the operands in the
+// roles that helper expects:
+//   - a1 immediate           -> rejected (asserts, emits nothing)
+//   - a1 register, a2 non-imm -> a2 is the r/m side, a1 the register (toRM = false)
+//   - everything else        -> a1 is the r/m side written to (toRM = true)
+void XEmitter::WriteNormalOp(int bits, NormalOp op, const OpArg& a1, const OpArg& a2)
+{
+  if (a1.IsImm())
+  {
+    // Booh! Can't write to an imm
+    ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - a1 cannot be imm");
+    return;
+  }
+
+  const bool source_is_imm = a2.IsImm();
+
+  // reg <- r/m
+  if (!source_is_imm && a1.IsSimpleReg())
+  {
+    a2.WriteNormalOp(this, false, op, a1, bits);
+    return;
+  }
+
+  // a1 is memory here unless the source is an immediate; a memory destination
+  // needs a register source.
+  if (!source_is_imm)
+  {
+    ASSERT_MSG(DYNA_REC, a2.IsSimpleReg() || a2.IsImm(),
+               "WriteNormalOp - a1 and a2 cannot both be memory");
+  }
+  a1.WriteNormalOp(this, true, op, a2, bits);
+}
+
+// Two-operand arithmetic/logic wrappers. Each one calls CheckFlags() and then
+// forwards to WriteNormalOp() with the matching NormalOp; a1 is the
+// destination and must not be an immediate.
+void XEmitter::ADD(int bits, const OpArg& a1, const OpArg& a2)
+{
+  CheckFlags();
+  WriteNormalOp(bits, NormalOp::ADD, a1, a2);
+}
+void XEmitter::ADC(int bits, const OpArg& a1, const OpArg& a2)
+{
+  CheckFlags();
+  WriteNormalOp(bits, NormalOp::ADC, a1, a2);
+}
+void XEmitter::SUB(int bits, const OpArg& a1, const OpArg& a2)
+{
+  CheckFlags();
+  WriteNormalOp(bits, NormalOp::SUB, a1, a2);
+}
+void XEmitter::SBB(int bits, const OpArg& a1, const OpArg& a2)
+{
+  CheckFlags();
+  WriteNormalOp(bits, NormalOp::SBB, a1, a2);
+}
+void XEmitter::AND(int bits, const OpArg& a1, const OpArg& a2)
+{
+  CheckFlags();
+  WriteNormalOp(bits, NormalOp::AND, a1, a2);
+}
+void XEmitter::OR(int bits, const OpArg& a1, const OpArg& a2)
+{
+  CheckFlags();
+  WriteNormalOp(bits, NormalOp::OR, a1, a2);
+}
+void XEmitter::XOR(int bits, const OpArg& a1, const OpArg& a2)
+{
+  CheckFlags();
+  WriteNormalOp(bits, NormalOp::XOR, a1, a2);
+}
+// MOV a1, a2. Does not call CheckFlags().
+// Two special cases are handled before the generic encoding:
+//   - a 64-bit move of a 64-bit immediate into a register is emitted as a
+//     32-bit MOV when the value is unchanged by truncation to u32;
+//   - a register-to-same-register move is reported through ERROR_LOG but is
+//     still emitted.
+void XEmitter::MOV(int bits, const OpArg& a1, const OpArg& a2)
+{
+  if (bits == 64 && a1.IsSimpleReg() && a2.scale == SCALE_IMM64 &&
+      a2.offset == static_cast<u32>(a2.offset))
+  {
+    // Shorter encoding: write the value as a 32-bit immediate instead.
+    WriteNormalOp(32, NormalOp::MOV, a1, a2.AsImm32());
+    return;
+  }
+  if (a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg())
+  {
+    ERROR_LOG(DYNA_REC, "Redundant MOV @ %p - bug in JIT?", code);
+  }
+  WriteNormalOp(bits, NormalOp::MOV, a1, a2);
+}
+// TEST a1, a2: calls CheckFlags(), then emits through WriteNormalOp().
+void XEmitter::TEST(int bits, const OpArg& a1, const OpArg& a2)
+{
+  CheckFlags();
+  WriteNormalOp(bits, NormalOp::TEST, a1, a2);
+}
+// CMP a1, a2: calls CheckFlags(), then emits through WriteNormalOp().
+void XEmitter::CMP(int bits, const OpArg& a1, const OpArg& a2)
+{
+  CheckFlags();
+  WriteNormalOp(bits, NormalOp::CMP, a1, a2);
+}
+// XCHG a1, a2: emitted without a CheckFlags() call.
+void XEmitter::XCHG(int bits, const OpArg& a1, const OpArg& a2)
+{
+  WriteNormalOp(bits, NormalOp::XCHG, a1, a2);
+}
+// Compares a1 with a2, choosing the shorter instruction when it applies:
+// turn 'CMP reg, 0' into shorter 'TEST reg, reg'; everything else is a CMP.
+void XEmitter::CMP_or_TEST(int bits, const OpArg& a1, const OpArg& a2)
+{
+  CheckFlags();
+
+  const bool reg_against_zero = a1.IsSimpleReg() && a2.IsZero();
+  const NormalOp chosen_op = reg_against_zero ? NormalOp::TEST : NormalOp::CMP;
+  const OpArg& rhs = reg_against_zero ? a1 : a2;
+  WriteNormalOp(bits, chosen_op, a1, rhs);
+}
+
+// Sets `dest` to a1 + a2 using the cheapest sequence available:
+// nothing or a single MOV when one side is zero, a single ADD when one side
+// already is `dest`, a constant MOV or an LEA for 32-bit immediate/register
+// combinations, and MOV + ADD otherwise.
+void XEmitter::MOV_sum(int bits, X64Reg dest, const OpArg& a1, const OpArg& a2)
+{
+  // This stomps on flags, so ensure they aren't locked
+  DEBUG_ASSERT(!flags_locked);
+
+  const auto is_dest = [dest](const OpArg& candidate) {
+    return candidate.IsSimpleReg() && candidate.GetSimpleReg() == dest;
+  };
+
+  // Zero shortcuts (note that this can generate no code in the case where a1 == dest && a2 == zero
+  // or a2 == dest && a1 == zero)
+  if (a1.IsZero())
+  {
+    if (!is_dest(a2))
+      MOV(bits, R(dest), a2);
+    return;
+  }
+  if (a2.IsZero())
+  {
+    if (!is_dest(a1))
+      MOV(bits, R(dest), a1);
+    return;
+  }
+
+  // If dest == a1 or dest == a2 we can simplify this
+  if (is_dest(a1))
+  {
+    ADD(bits, R(dest), a2);
+    return;
+  }
+  if (is_dest(a2))
+  {
+    ADD(bits, R(dest), a1);
+    return;
+  }
+
+  // TODO: 32-bit optimizations may apply to other bit sizes (confirm)
+  if (bits == 32)
+  {
+    const bool imm1 = a1.IsImm();
+    const bool imm2 = a2.IsImm();
+    const bool reg1 = a1.IsSimpleReg();
+    const bool reg2 = a2.IsSimpleReg();
+
+    if (imm1 && imm2)
+    {
+      // Both addends are known: fold them into one constant.
+      MOV(32, R(dest), Imm32(a1.Imm32() + a2.Imm32()));
+      return;
+    }
+    if (reg1 && reg2)
+    {
+      LEA(32, dest, MRegSum(a1.GetSimpleReg(), a2.GetSimpleReg()));
+      return;
+    }
+    if (reg1 && imm2)
+    {
+      LEA(32, dest, MDisp(a1.GetSimpleReg(), a2.Imm32()));
+      return;
+    }
+    if (imm1 && reg2)
+    {
+      LEA(32, dest, MDisp(a2.GetSimpleReg(), a1.Imm32()));
+      return;
+    }
+  }
+
+  // Fallback
+  MOV(bits, R(dest), a1);
+  ADD(bits, R(dest), a2);
+}
+
+// Three-operand IMUL: regOp = a1 * a2, where a2 is an immediate.
+//   bits - 16, 32 or 64 (8 is rejected).
+//   a1   - register or memory operand; must not be an immediate.
+//   a2   - immediate. Values that survive truncation to s8 use the short
+//          0x6B form; otherwise 0x69 with a 16- or 32-bit immediate.
+// Every rejected argument combination asserts; the early ones emit nothing.
+void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a1, const OpArg& a2)
+{
+  CheckFlags();
+  if (bits == 8)
+  {
+    ASSERT_MSG(DYNA_REC, 0, "IMUL - illegal bit size!");
+    return;
+  }
+  if (a1.IsImm())
+  {
+    ASSERT_MSG(DYNA_REC, 0, "IMUL - second arg cannot be imm!");
+    return;
+  }
+  if (!a2.IsImm())
+  {
+    ASSERT_MSG(DYNA_REC, 0, "IMUL - third arg must be imm!");
+    return;
+  }
+
+  if (bits == 16)
+    Write8(0x66);
+  a1.WriteREX(this, bits, bits, regOp);
+
+  const auto imm_bits = a2.GetImmBits();
+  const bool fits_in_s8 = imm_bits == 8 ||
+                          (imm_bits == 16 && (s8)a2.offset == (s16)a2.offset) ||
+                          (imm_bits == 32 && (s8)a2.offset == (s32)a2.offset);
+  if (fits_in_s8)
+  {
+    // Short form with a sign-extended 8-bit immediate.
+    Write8(0x6B);
+    a1.WriteRest(this, 1, regOp);
+    Write8((u8)a2.offset);
+    return;
+  }
+
+  // Long form: the immediate has the operand's width (32 bits for 64-bit ops).
+  Write8(0x69);
+  if (imm_bits == 16 && bits == 16)
+  {
+    a1.WriteRest(this, 2, regOp);
+    Write16((u16)a2.offset);
+  }
+  else if (imm_bits == 32 && (bits == 32 || bits == 64))
+  {
+    a1.WriteRest(this, 4, regOp);
+    Write32((u32)a2.offset);
+  }
+  else
+  {
+    ASSERT_MSG(DYNA_REC, 0, "IMUL - unhandled case!");
+  }
+}
+
+// Two-operand IMUL: regOp *= a.
+//   bits - 16, 32 or 64 (8 asserts and emits nothing).
+//   a    - an immediate is forwarded to the three-operand form with regOp as
+//          both destination and source; anything else is encoded as
+//          0x0F 0xAF.
+void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a)
+{
+  CheckFlags();
+  if (bits == 8)
+  {
+    ASSERT_MSG(DYNA_REC, 0, "IMUL - illegal bit size!");
+    return;
+  }
+
+  if (a.IsImm())
+  {
+    IMUL(bits, regOp, R(regOp), a);
+    return;
+  }
+
+  if (bits == 16)
+    Write8(0x66);
+  a.WriteREX(this, bits, bits, regOp);
+  Write8(0x0F);
+  Write8(0xAF);
+  a.WriteRest(this, 0, regOp);
+}
+
+// Emits a legacy-encoded SSE instruction.
+//   opPrefix   - prefix byte (0x66 / 0xF2 / 0xF3); 0 means no prefix byte.
+//   op         - opcode after the 0x0F escape; if it exceeds 0xFF the high
+//                byte is written first, then the low byte.
+//   regOp      - register placed in the register field.
+//   arg        - r/m operand (taken by value because operandReg is modified).
+//   extrabytes - passed through to WriteRest(); callers pass 1 when they
+//                append an immediate byte afterwards.
+// The prefix is written before REX.
+void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
+{
+  if (opPrefix)
+    Write8(opPrefix);
+  arg.operandReg = regOp;
+  arg.WriteREX(this, 0, 0);
+  Write8(0x0F);
+  if (op > 0xFF)
+    Write8((op >> 8) & 0xFF);
+  Write8(op & 0xFF);
+  arg.WriteRest(this, extrabytes);
+}
+
+// Maps the high byte of `op` to the VEX mmmmm value:
+// 0x3A -> 3, 0x38 -> 2, anything else -> 1.
+static int GetVEXmmmmm(u16 op)
+{
+  // Currently, only 0x38 and 0x3A are used as secondary escape byte.
+  switch (op >> 8)
+  {
+  case 0x3A:
+    return 3;
+  case 0x38:
+    return 2;
+  default:
+    return 1;
+  }
+}
+
+// Maps a legacy prefix byte to the VEX pp value:
+// 0x66 -> 1, 0xF3 -> 2, 0xF2 -> 3, anything else -> 0.
+static int GetVEXpp(u8 opPrefix)
+{
+  switch (opPrefix)
+  {
+  case 0x66:
+    return 1;
+  case 0xF3:
+    return 2;
+  case 0xF2:
+    return 3;
+  default:
+    return 0;
+  }
+}
+
+// Emits a VEX-encoded instruction.
+//   opPrefix   - legacy prefix byte; only used to derive the VEX pp field.
+//   op         - high byte selects the VEX mmmmm field, low byte is the
+//                opcode byte written after the VEX prefix.
+//   regOp1     - register passed to WriteRest() for the register field.
+//   regOp2     - second register, handed to WriteVEX().
+//   arg        - r/m operand.
+//   W          - VEX.W value.
+//   extrabytes - number of bytes the caller appends after the operand.
+void XEmitter::WriteVEXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                          int W, int extrabytes)
+{
+  int mmmmm = GetVEXmmmmm(op);
+  int pp = GetVEXpp(opPrefix);
+  // FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here
+  arg.WriteVEX(this, regOp1, regOp2, 0, pp, mmmmm, W);
+  Write8(op & 0xFF);
+  arg.WriteRest(this, extrabytes, regOp1);
+}
+
+// Four-operand VEX form: same as WriteVEXOp() with one trailing byte that
+// carries regOp3 in its upper four bits.
+void XEmitter::WriteVEXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                           X64Reg regOp3, int W)
+{
+  WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, 1);
+  Write8((u8)regOp3 << 4);
+}
+
+// Feature-gated front ends for the VEX writers. Each one raises a PanicAlert
+// when the corresponding cpu_info flag is not set and then emits regardless.
+
+// AVX: checks cpu_info.bAVX, then forwards unchanged to WriteVEXOp().
+void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                          int W, int extrabytes)
+{
+  if (!cpu_info.bAVX)
+    PanicAlert("Trying to use AVX on a system that doesn't support it. Bad programmer.");
+  WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, extrabytes);
+}
+
+// AVX, four-operand form: checks cpu_info.bAVX, then forwards to WriteVEXOp4().
+void XEmitter::WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                           X64Reg regOp3, int W)
+{
+  if (!cpu_info.bAVX)
+    PanicAlert("Trying to use AVX on a system that doesn't support it. Bad programmer.");
+  WriteVEXOp4(opPrefix, op, regOp1, regOp2, arg, regOp3, W);
+}
+
+// FMA3: checks cpu_info.bFMA; always uses prefix 0x66 and places `op` under
+// the 0x38 escape (0x3800 | op).
+void XEmitter::WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W)
+{
+  if (!cpu_info.bFMA)
+    PanicAlert("Trying to use FMA3 on a system that doesn't support it. Computer is v. f'n madd.");
+  WriteVEXOp(0x66, 0x3800 | op, regOp1, regOp2, arg, W);
+}
+
+// FMA4: checks cpu_info.bFMA4; always uses prefix 0x66 and places `op` under
+// the 0x3A escape (0x3A00 | op). Note the argument order handed on:
+// regOp2 becomes the trailing-byte register of WriteVEXOp4().
+void XEmitter::WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                           int W)
+{
+  if (!cpu_info.bFMA4)
+    PanicAlert("Trying to use FMA4 on a system that doesn't support it. Computer is v. f'n madd.");
+  WriteVEXOp4(0x66, 0x3A00 | op, dest, regOp1, arg, regOp2, W);
+}
+
+// Common path for BMI1/BMI2 instructions. Raises a PanicAlert for immediate
+// operands and for sizes other than 32 or 64 (emission continues either way);
+// VEX.W is set exactly when size == 64.
+void XEmitter::WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2,
+                          const OpArg& arg, int extrabytes)
+{
+  if (arg.IsImm())
+    PanicAlert("BMI1/2 instructions don't support immediate operands.");
+  if (size != 32 && size != 64)
+    PanicAlert("BMI1/2 instructions only support 32-bit and 64-bit modes!");
+  int W = size == 64;
+  WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, extrabytes);
+}
+
+// BMI1 front end: calls CheckFlags(), checks cpu_info.bBMI1, then forwards to
+// WriteBMIOp().
+void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2,
+                           const OpArg& arg, int extrabytes)
+{
+  CheckFlags();
+  if (!cpu_info.bBMI1)
+    PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer.");
+  WriteBMIOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
+}
+
+// BMI2 front end: checks cpu_info.bBMI2, then forwards to WriteBMIOp().
+// Unlike the BMI1 variant it does not call CheckFlags().
+void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2,
+                           const OpArg& arg, int extrabytes)
+{
+  if (!cpu_info.bBMI2)
+    PanicAlert("Trying to use BMI2 on a system that doesn't support it. Bad programmer.");
+  WriteBMIOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
+}
+
+// MOVD xmm <- arg (0x66 0x0F 0x6E).
+void XEmitter::MOVD_xmm(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x6E, dest, arg, 0);
+}
+// MOVD arg <- xmm (0x66 0x0F 0x7E).
+void XEmitter::MOVD_xmm(const OpArg& arg, X64Reg src)
+{
+  WriteSSEOp(0x66, 0x7E, src, arg, 0);
+}
+
+// MOVQ xmm <- arg: the MOVD opcode (0x66 0x0F 0x6E) with REX written for a
+// 64-bit operand.
+void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg)
+{
+  // Alternate encoding
+  // This does not display correctly in MSVC's debugger, it thinks it's a MOVD
+  arg.operandReg = dest;
+  Write8(0x66);
+  arg.WriteREX(this, 64, 0);
+  Write8(0x0f);
+  Write8(0x6E);
+  arg.WriteRest(this, 0);
+}
+
+// MOVQ arg <- xmm `src`.
+// Two encodings are used:
+//   - src > 7 or a register destination: 0x66 REX 0x0F 0x7E, with REX written
+//     for a 64-bit operand;
+//   - otherwise: 0x66 [REX] 0x0F 0xD6.
+// In both cases the 0x66 prefix is written before REX, because a REX prefix
+// only takes effect when it immediately precedes the opcode bytes. Writing
+// REX first would drop the extension bits for a memory operand that uses
+// r8-r15 as base or index.
+void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src)
+{
+  if (src > 7 || arg.IsSimpleReg())
+  {
+    // Alternate encoding
+    // This does not display correctly in MSVC's debugger, it thinks it's a MOVD
+    arg.operandReg = src;
+    Write8(0x66);
+    arg.WriteREX(this, 64, 0);
+    Write8(0x0f);
+    Write8(0x7E);
+    arg.WriteRest(this, 0);
+  }
+  else
+  {
+    arg.operandReg = src;
+    // Prefix first, then REX (if any), then the opcode.
+    Write8(0x66);
+    arg.WriteREX(this, 0, 0);
+    Write8(0x0f);
+    Write8(0xD6);
+    arg.WriteRest(this, 0);
+  }
+}
+
+// Shared encoder for the MXCSR load/store instructions (0x0F 0xAE with `ext`
+// in the register field). `arg` must be a memory operand: immediates and
+// plain registers assert.
+void XEmitter::WriteMXCSR(OpArg arg, int ext)
+{
+  if (arg.IsImm() || arg.IsSimpleReg())
+    ASSERT_MSG(DYNA_REC, 0, "MXCSR - invalid operand");
+
+  arg.operandReg = ext;
+  arg.WriteREX(this, 0, 0);
+  Write8(0x0F);
+  Write8(0xAE);
+  arg.WriteRest(this);
+}
+
+// STMXCSR memloc: extension 3.
+void XEmitter::STMXCSR(const OpArg& memloc)
+{
+  WriteMXCSR(memloc, 3);
+}
+// LDMXCSR memloc: extension 2.
+void XEmitter::LDMXCSR(const OpArg& memloc)
+{
+  WriteMXCSR(memloc, 2);
+}
+
+// MOVNT* stores of register `regOp` to `arg`, emitted through WriteSSEOp().
+// MOVNTDQ: prefix 0x66, opcode sseMOVNTDQ.
+void XEmitter::MOVNTDQ(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);
+}
+// MOVNTPS: no prefix, opcode sseMOVNTP.
+void XEmitter::MOVNTPS(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0x00, sseMOVNTP, regOp, arg);
+}
+// MOVNTPD: prefix 0x66, opcode sseMOVNTP.
+void XEmitter::MOVNTPD(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0x66, sseMOVNTP, regOp, arg);
+}
+
+// Scalar SSE arithmetic. Every wrapper forwards to WriteSSEOp() with regOp as
+// the register operand and arg as r/m; the prefix byte picks the variant:
+// 0xF3 for the *SS forms and 0xF2 for the *SD forms.
+void XEmitter::ADDSS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, sseADD, regOp, arg);
+}
+void XEmitter::ADDSD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF2, sseADD, regOp, arg);
+}
+void XEmitter::SUBSS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, sseSUB, regOp, arg);
+}
+void XEmitter::SUBSD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF2, sseSUB, regOp, arg);
+}
+// CMPSS/CMPSD append the `compare` byte after the operand, hence extrabytes = 1.
+void XEmitter::CMPSS(X64Reg regOp, const OpArg& arg, u8 compare)
+{
+  WriteSSEOp(0xF3, sseCMP, regOp, arg, 1);
+  Write8(compare);
+}
+void XEmitter::CMPSD(X64Reg regOp, const OpArg& arg, u8 compare)
+{
+  WriteSSEOp(0xF2, sseCMP, regOp, arg, 1);
+  Write8(compare);
+}
+void XEmitter::MULSS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, sseMUL, regOp, arg);
+}
+void XEmitter::MULSD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF2, sseMUL, regOp, arg);
+}
+void XEmitter::DIVSS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, sseDIV, regOp, arg);
+}
+void XEmitter::DIVSD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF2, sseDIV, regOp, arg);
+}
+void XEmitter::MINSS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, sseMIN, regOp, arg);
+}
+void XEmitter::MINSD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF2, sseMIN, regOp, arg);
+}
+void XEmitter::MAXSS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, sseMAX, regOp, arg);
+}
+void XEmitter::MAXSD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF2, sseMAX, regOp, arg);
+}
+void XEmitter::SQRTSS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, sseSQRT, regOp, arg);
+}
+void XEmitter::SQRTSD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF2, sseSQRT, regOp, arg);
+}
+// RCPSS and RSQRTSS are only provided in the 0xF3 (*SS) form here.
+void XEmitter::RCPSS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, sseRCP, regOp, arg);
+}
+void XEmitter::RSQRTSS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, sseRSQRT, regOp, arg);
+}
+
+// Packed SSE arithmetic, logic and shuffles. Every wrapper forwards to
+// WriteSSEOp() with regOp as the register operand and arg as r/m; the prefix
+// picks the variant: none (0x00) for the *PS forms and 0x66 for the *PD forms.
+void XEmitter::ADDPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseADD, regOp, arg);
+}
+void XEmitter::ADDPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseADD, regOp, arg);
+}
+void XEmitter::SUBPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseSUB, regOp, arg);
+}
+void XEmitter::SUBPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseSUB, regOp, arg);
+}
+// CMPPS/CMPPD append the `compare` byte after the operand, hence extrabytes = 1.
+void XEmitter::CMPPS(X64Reg regOp, const OpArg& arg, u8 compare)
+{
+  WriteSSEOp(0x00, sseCMP, regOp, arg, 1);
+  Write8(compare);
+}
+void XEmitter::CMPPD(X64Reg regOp, const OpArg& arg, u8 compare)
+{
+  WriteSSEOp(0x66, sseCMP, regOp, arg, 1);
+  Write8(compare);
+}
+void XEmitter::ANDPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseAND, regOp, arg);
+}
+void XEmitter::ANDPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseAND, regOp, arg);
+}
+void XEmitter::ANDNPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseANDN, regOp, arg);
+}
+void XEmitter::ANDNPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseANDN, regOp, arg);
+}
+void XEmitter::ORPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseOR, regOp, arg);
+}
+void XEmitter::ORPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseOR, regOp, arg);
+}
+void XEmitter::XORPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseXOR, regOp, arg);
+}
+void XEmitter::XORPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseXOR, regOp, arg);
+}
+void XEmitter::MULPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseMUL, regOp, arg);
+}
+void XEmitter::MULPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseMUL, regOp, arg);
+}
+void XEmitter::DIVPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseDIV, regOp, arg);
+}
+void XEmitter::DIVPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseDIV, regOp, arg);
+}
+void XEmitter::MINPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseMIN, regOp, arg);
+}
+void XEmitter::MINPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseMIN, regOp, arg);
+}
+void XEmitter::MAXPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseMAX, regOp, arg);
+}
+void XEmitter::MAXPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseMAX, regOp, arg);
+}
+void XEmitter::SQRTPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseSQRT, regOp, arg);
+}
+void XEmitter::SQRTPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseSQRT, regOp, arg);
+}
+// RCPPS and RSQRTPS are only provided in the unprefixed (*PS) form here.
+void XEmitter::RCPPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseRCP, regOp, arg);
+}
+void XEmitter::RSQRTPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseRSQRT, regOp, arg);
+}
+// SHUFPS/SHUFPD append the `shuffle` byte after the operand, hence extrabytes = 1.
+void XEmitter::SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle)
+{
+  WriteSSEOp(0x00, sseSHUF, regOp, arg, 1);
+  Write8(shuffle);
+}
+void XEmitter::SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle)
+{
+  WriteSSEOp(0x66, sseSHUF, regOp, arg, 1);
+  Write8(shuffle);
+}
+
+// Scalar compares. COMISS/COMISD are the ordered compares (opcode sseCOMIS),
+// UCOMISS/UCOMISD the unordered ones (opcode sseUCOMIS).
+// Weird that these should be packed: although the instructions are scalar,
+// they use the packed-style prefixes (none for *SS, 0x66 for *SD).
+void XEmitter::COMISS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseCOMIS, regOp, arg);
+}
+void XEmitter::COMISD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseCOMIS, regOp, arg);
+}
+void XEmitter::UCOMISS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseUCOMIS, regOp, arg);
+}
+void XEmitter::UCOMISD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseUCOMIS, regOp, arg);
+}
+
+// SSE register/memory moves. The (regOp, arg) overloads use the *fromRM
+// opcode (load into regOp), the (arg, regOp) overloads use the *toRM opcode
+// (store from regOp). The prefix byte distinguishes the instructions that
+// share an opcode.
+
+// MOVAPS (no prefix) / MOVAPD (0x66).
+void XEmitter::MOVAPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);
+}
+void XEmitter::MOVAPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);
+}
+void XEmitter::MOVAPS(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);
+}
+void XEmitter::MOVAPD(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);
+}
+
+// MOVUPS (no prefix) / MOVUPD (0x66).
+void XEmitter::MOVUPS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);
+}
+void XEmitter::MOVUPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);
+}
+void XEmitter::MOVUPS(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);
+}
+void XEmitter::MOVUPD(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);
+}
+
+// MOVDQA (0x66) / MOVDQU (0xF3) share the sseMOVDQ* opcodes.
+void XEmitter::MOVDQA(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg);
+}
+void XEmitter::MOVDQA(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg);
+}
+void XEmitter::MOVDQU(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg);
+}
+void XEmitter::MOVDQU(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg);
+}
+
+// MOVSS (0xF3) / MOVSD (0xF2) reuse the sseMOVUP* opcodes.
+void XEmitter::MOVSS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);
+}
+void XEmitter::MOVSD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);
+}
+void XEmitter::MOVSS(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);
+}
+void XEmitter::MOVSD(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);
+}
+
+void XEmitter::MOVLPS(X64Reg regOp, const OpArg& arg)
+{  // MOVLPS/MOVLPD: sseMOVLPfromRM for (reg, arg), sseMOVLPtoRM for (arg, reg); prefix 0x00 = PS, 0x66 = PD.
+  WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg);
+}
+void XEmitter::MOVLPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg);
+}
+void XEmitter::MOVLPS(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg);
+}
+void XEmitter::MOVLPD(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg);
+}
+
+void XEmitter::MOVHPS(X64Reg regOp, const OpArg& arg)
+{  // MOVHPS/MOVHPD: sseMOVHPfromRM for (reg, arg), sseMOVHPtoRM for (arg, reg); prefix 0x00 = PS, 0x66 = PD.
+  WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg);
+}
+void XEmitter::MOVHPD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg);
+}
+void XEmitter::MOVHPS(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg);
+}
+void XEmitter::MOVHPD(const OpArg& arg, X64Reg regOp)
+{
+  WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg);
+}
+
+void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2)
+{  // Register-to-register only: the second register is wrapped with R() to form the OpArg.
+  WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2));
+}
+void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2)
+{  // Same shape as MOVHLPS, with the sseMOVLHPS opcode.
+  WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2));
+}
+
+void XEmitter::CVTPS2PD(X64Reg regOp, const OpArg& arg)
+{  // Packed float<->double conversions share opcode 0x5A; prefix 0x00 = PS2PD, 0x66 = PD2PS.
+  WriteSSEOp(0x00, 0x5A, regOp, arg);
+}
+void XEmitter::CVTPD2PS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x5A, regOp, arg);
+}
+
+void XEmitter::CVTSD2SS(X64Reg regOp, const OpArg& arg)
+{  // Scalar conversions: prefix 0xF2 marks the SD-side form and 0xF3 the SS-side form of each opcode.
+  WriteSSEOp(0xF2, 0x5A, regOp, arg);
+}
+void XEmitter::CVTSS2SD(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, 0x5A, regOp, arg);
+}
+void XEmitter::CVTSD2SI(X64Reg regOp, const OpArg& arg)
+{  // opcode 0x2D: float -> integer register (regOp is the integer destination here)
+  WriteSSEOp(0xF2, 0x2D, regOp, arg);
+}
+void XEmitter::CVTSS2SI(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, 0x2D, regOp, arg);
+}
+void XEmitter::CVTSI2SD(X64Reg regOp, const OpArg& arg)
+{  // opcode 0x2A: integer -> float
+  WriteSSEOp(0xF2, 0x2A, regOp, arg);
+}
+void XEmitter::CVTSI2SS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, 0x2A, regOp, arg);
+}
+
+void XEmitter::CVTDQ2PD(X64Reg regOp, const OpArg& arg)
+{  // Packed int32 <-> float/double conversions: opcode 0xE6 for the PD forms, 0x5B for the PS forms.
+  WriteSSEOp(0xF3, 0xE6, regOp, arg);
+}
+void XEmitter::CVTDQ2PS(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x00, 0x5B, regOp, arg);
+}
+void XEmitter::CVTPD2DQ(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF2, 0xE6, regOp, arg);
+}
+void XEmitter::CVTPS2DQ(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x5B, regOp, arg);
+}
+
+void XEmitter::CVTTSD2SI(X64Reg regOp, const OpArg& arg)
+{  // CVTT* (truncating) variants: scalar forms use opcode 0x2C, packed forms reuse 0x5B / 0xE6 with a different prefix.
+  WriteSSEOp(0xF2, 0x2C, regOp, arg);
+}
+void XEmitter::CVTTSS2SI(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, 0x2C, regOp, arg);
+}
+void XEmitter::CVTTPS2DQ(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0xF3, 0x5B, regOp, arg);
+}
+void XEmitter::CVTTPD2DQ(X64Reg regOp, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xE6, regOp, arg);
+}
+
+void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src)
+{  // Register-only form: prefix 0x66, opcode sseMASKMOVDQU, with src wrapped via R().
+  WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));
+}
+
+void XEmitter::MOVMSKPS(X64Reg dest, const OpArg& arg)
+{  // MOVMSKPS/MOVMSKPD: opcode 0x50; prefix 0x00 = PS, 0x66 = PD.
+  WriteSSEOp(0x00, 0x50, dest, arg);
+}
+void XEmitter::MOVMSKPD(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x50, dest, arg);
+}
+
+void XEmitter::LDDQU(X64Reg dest, const OpArg& arg)
+{  // Prefix 0xF2 with the sseLDDQU opcode.
+  WriteSSEOp(0xF2, sseLDDQU, dest, arg);
+}  // For integer data only
+
+void XEmitter::UNPCKLPS(X64Reg dest, const OpArg& arg)
+{  // Unpack family: opcode 0x14 = low (UNPCKL*), 0x15 = high (UNPCKH*); prefix 0x00 = PS, 0x66 = PD.
+  WriteSSEOp(0x00, 0x14, dest, arg);
+}
+void XEmitter::UNPCKHPS(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x00, 0x15, dest, arg);
+}
+void XEmitter::UNPCKLPD(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x14, dest, arg);
+}
+void XEmitter::UNPCKHPD(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x15, dest, arg);
+}
+
+// Pretty much every x86 CPU nowadays supports SSE3,
+// but the SSE2 fallbacks are easy.
+void XEmitter::MOVSLDUP(X64Reg regOp, const OpArg& arg)
+{  // Duplicate the even 32-bit lanes of arg into regOp: result lanes are {a0, a0, a2, a2}.
+  if (cpu_info.bSSE3)
+  {
+    WriteSSEOp(0xF3, 0x12, regOp, arg);  // native SSE3 encoding
+  }
+  else
+  {
+    // SSE2 fallback: PSHUFD with 0xA0 selects lanes {0,0,2,2}, i.e. exactly MOVSLDUP.
+    // The former MOVAPD + UNPCKLPS(x, x) produced {0,0,1,1}, wrong in the upper half.
+    PSHUFD(regOp, arg, 0xA0);
+  }
+}
+void XEmitter::MOVSHDUP(X64Reg regOp, const OpArg& arg)
+{  // Duplicate the odd 32-bit lanes of arg into regOp: result lanes are {a1, a1, a3, a3}.
+  if (cpu_info.bSSE3)
+  {
+    WriteSSEOp(0xF3, 0x16, regOp, arg);  // native SSE3 encoding
+  }
+  else
+  {
+    // SSE2 fallback: PSHUFD with 0xF5 selects lanes {1,1,3,3}, i.e. exactly MOVSHDUP.
+    // The former MOVAPD + UNPCKHPS(x, x) produced {2,2,3,3}, wrong in the lower half.
+    PSHUFD(regOp, arg, 0xF5);
+  }
+}
+void XEmitter::MOVDDUP(X64Reg regOp, const OpArg& arg)
+{  // Emits MOVDDUP when cpu_info.bSSE3 is set, otherwise a two-instruction SSE2 replacement.
+  if (cpu_info.bSSE3)
+  {
+    WriteSSEOp(0xF2, 0x12, regOp, arg);  // SSE3 path: prefix 0xF2, opcode 0x12
+  }
+  else
+  {
+    if (!arg.IsSimpleReg(regOp))  // skip the copy when arg already is regOp
+      MOVSD(regOp, arg);
+    UNPCKLPD(regOp, R(regOp));  // unpack regOp with itself so its low 64 bits fill both halves
+  }
+}
+
+// There are a few more left
+
+// Also some integer instructions are missing
+void XEmitter::PACKSSDW(X64Reg dest, const OpArg& arg)
+{  // Pack family, all with prefix 0x66: PACKSSDW = 0x6B, PACKSSWB = 0x63, PACKUSWB = 0x67.
+  WriteSSEOp(0x66, 0x6B, dest, arg);
+}
+void XEmitter::PACKSSWB(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x63, dest, arg);
+}
+void XEmitter::PACKUSWB(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x67, dest, arg);
+}
+
+void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg& arg)
+{  // Integer low-unpack family, prefix 0x66: BW = 0x60, WD = 0x61, DQ = 0x62, QDQ = 0x6C.
+  WriteSSEOp(0x66, 0x60, dest, arg);
+}
+void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x61, dest, arg);
+}
+void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x62, dest, arg);
+}
+void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x6C, dest, arg);
+}
+
+void XEmitter::PSRLW(X64Reg reg, int shift)
+{  // Immediate right shifts: the cast constant sits in the regOp slot as an opcode-extension digit, not a register.
+  WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg));  // opcode 0x71 with digit 2; target register goes in the R() operand
+  Write8(shift);  // shift count is appended as one immediate byte
+}
+
+void XEmitter::PSRLD(X64Reg reg, int shift)
+{
+  WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg));  // opcode 0x72 with digit 2
+  Write8(shift);
+}
+
+void XEmitter::PSRLQ(X64Reg reg, int shift)
+{
+  WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg));  // opcode 0x73 with digit 2
+  Write8(shift);
+}
+
+void XEmitter::PSRLQ(X64Reg reg, const OpArg& arg)
+{  // Overload taking the shift count from an operand instead of an immediate: opcode 0xd3, no trailing byte.
+  WriteSSEOp(0x66, 0xd3, reg, arg);
+}
+
+void XEmitter::PSRLDQ(X64Reg reg, int shift)
+{
+  WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg));  // shares opcode 0x73 with PSRLQ; digit 3 selects PSRLDQ
+  Write8(shift);
+}
+
+void XEmitter::PSLLW(X64Reg reg, int shift)
+{  // Immediate left shifts: same opcodes as the PSRL* group (0x71/0x72/0x73) with digit 6 (7 for PSLLDQ).
+  WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
+  Write8(shift);  // shift count is appended as one immediate byte
+}
+
+void XEmitter::PSLLD(X64Reg reg, int shift)
+{
+  WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg));
+  Write8(shift);
+}
+
+void XEmitter::PSLLQ(X64Reg reg, int shift)
+{
+  WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg));
+  Write8(shift);
+}
+
+void XEmitter::PSLLDQ(X64Reg reg, int shift)
+{
+  WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg));  // shares opcode 0x73 with PSLLQ; digit 7 selects PSLLDQ
+  Write8(shift);
+}
+
+// Arithmetic right shift of packed 16-bit lanes by an immediate (66 0F 71 /4 ib).
+// Emitted through WriteSSEOp, exactly like PSRLW (/2) and PSLLW (/6), so the
+// shared operand writer handles registers above 7 instead of the previous
+// hand-built ModRM byte (0xE0 | reg), which was only correct for registers 0-7.
+void XEmitter::PSRAW(X64Reg reg, int shift)
+{
+  // Digit 4 in the regOp slot is the opcode extension selecting PSRAW inside
+  // the 0x71 shift group; the register being shifted is the R() operand.
+  WriteSSEOp(0x66, 0x71, (X64Reg)4, R(reg));
+  Write8(shift);  // imm8 shift count
+}
+
+// Arithmetic right shift of packed 32-bit lanes by an immediate (66 0F 72 /4 ib).
+// Emitted through WriteSSEOp, exactly like PSRLD (/2) and PSLLD (/6), so the
+// shared operand writer handles registers above 7 instead of the previous
+// hand-built ModRM byte (0xE0 | reg), which was only correct for registers 0-7.
+void XEmitter::PSRAD(X64Reg reg, int shift)
+{
+  // Digit 4 in the regOp slot is the opcode extension selecting PSRAD inside
+  // the 0x72 shift group; the register being shifted is the R() operand.
+  WriteSSEOp(0x66, 0x72, (X64Reg)4, R(reg));
+  Write8(shift);  // imm8 shift count
+}
+
+void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes)
+{  // Guarded wrapper around WriteSSEOp: alerts when cpu_info.bSSSE3 is false, then emits regardless.
+  if (!cpu_info.bSSSE3)
+    PanicAlert("Trying to use SSSE3 on a system that doesn't support it. Bad programmer.");
+  WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);  // all arguments forwarded unchanged
+}
+
+void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes)
+{  // Guarded wrapper around WriteSSEOp: alerts when cpu_info.bSSE4_1 is false, then emits regardless.
+  if (!cpu_info.bSSE4_1)
+    PanicAlert("Trying to use SSE4.1 on a system that doesn't support it. Bad programmer.");
+  WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);  // all arguments forwarded unchanged
+}
+
+void XEmitter::PSHUFB(X64Reg dest, const OpArg& arg)
+{  // Two-byte opcodes (0x38xx) passed as a u16; PSHUFB goes through the SSSE3 guard.
+  WriteSSSE3Op(0x66, 0x3800, dest, arg);
+}
+void XEmitter::PTEST(X64Reg dest, const OpArg& arg)
+{  // PTEST and PACKUSDW go through the SSE4.1 guard.
+  WriteSSE41Op(0x66, 0x3817, dest, arg);
+}
+void XEmitter::PACKUSDW(X64Reg dest, const OpArg& arg)
+{
+  WriteSSE41Op(0x66, 0x382b, dest, arg);
+}
+
+void XEmitter::PMOVSXBW(X64Reg dest, const OpArg& arg)
+{  // PMOVSX* family: SSE4.1-guarded, prefix 0x66, opcodes 0x3820..0x3825 in BW, BD, BQ, WD, WQ, DQ order.
+  WriteSSE41Op(0x66, 0x3820, dest, arg);
+}
+void XEmitter::PMOVSXBD(X64Reg dest, const OpArg& arg)
+{
+  WriteSSE41Op(0x66, 0x3821, dest, arg);
+}
+void XEmitter::PMOVSXBQ(X64Reg dest, const OpArg& arg)
+{
+  WriteSSE41Op(0x66, 0x3822, dest, arg);
+}
+void XEmitter::PMOVSXWD(X64Reg dest, const OpArg& arg)
+{
+  WriteSSE41Op(0x66, 0x3823, dest, arg);
+}
+void XEmitter::PMOVSXWQ(X64Reg dest, const OpArg& arg)
+{
+  WriteSSE41Op(0x66, 0x3824, dest, arg);
+}
+void XEmitter::PMOVSXDQ(X64Reg dest, const OpArg& arg)
+{
+  WriteSSE41Op(0x66, 0x3825, dest, arg);
+}
+void XEmitter::PMOVZXBW(X64Reg dest, const OpArg& arg)
+{  // PMOVZX* family: same layout as PMOVSX* with opcodes 0x3830..0x3835.
+  WriteSSE41Op(0x66, 0x3830, dest, arg);
+}
+void XEmitter::PMOVZXBD(X64Reg dest, const OpArg& arg)
+{
+  WriteSSE41Op(0x66, 0x3831, dest, arg);
+}
+void XEmitter::PMOVZXBQ(X64Reg dest, const OpArg& arg)
+{
+  WriteSSE41Op(0x66, 0x3832, dest, arg);
+}
+void XEmitter::PMOVZXWD(X64Reg dest, const OpArg& arg)
+{
+  WriteSSE41Op(0x66, 0x3833, dest, arg);
+}
+void XEmitter::PMOVZXWQ(X64Reg dest, const OpArg& arg)
+{
+  WriteSSE41Op(0x66, 0x3834, dest, arg);
+}
+void XEmitter::PMOVZXDQ(X64Reg dest, const OpArg& arg)
+{
+  WriteSSE41Op(0x66, 0x3835, dest, arg);
+}
+
+void XEmitter::PBLENDVB(X64Reg dest, const OpArg& arg)
+{  // Blend family, all SSE4.1-guarded with prefix 0x66. The *V* forms take no immediate here.
+  WriteSSE41Op(0x66, 0x3810, dest, arg);
+}
+void XEmitter::BLENDVPS(X64Reg dest, const OpArg& arg)
+{
+  WriteSSE41Op(0x66, 0x3814, dest, arg);
+}
+void XEmitter::BLENDVPD(X64Reg dest, const OpArg& arg)
+{
+  WriteSSE41Op(0x66, 0x3815, dest, arg);
+}
+void XEmitter::BLENDPS(X64Reg dest, const OpArg& arg, u8 blend)
+{
+  WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1);  // extrabytes = 1 announces the immediate written next
+  Write8(blend);
+}
+void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend)
+{
+  WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1);  // extrabytes = 1 announces the immediate written next
+  Write8(blend);
+}
+
+void XEmitter::PAND(X64Reg dest, const OpArg& arg)
+{  // Integer bitwise ops, prefix 0x66: PAND = 0xDB, PANDN = 0xDF, PXOR = 0xEF, POR = 0xEB.
+  WriteSSEOp(0x66, 0xDB, dest, arg);
+}
+void XEmitter::PANDN(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xDF, dest, arg);
+}
+void XEmitter::PXOR(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xEF, dest, arg);
+}
+void XEmitter::POR(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xEB, dest, arg);
+}
+
+void XEmitter::PADDB(X64Reg dest, const OpArg& arg)
+{  // Packed integer add, prefix 0x66: B = 0xFC, W = 0xFD, D = 0xFE, Q = 0xD4.
+  WriteSSEOp(0x66, 0xFC, dest, arg);
+}
+void XEmitter::PADDW(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xFD, dest, arg);
+}
+void XEmitter::PADDD(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xFE, dest, arg);
+}
+void XEmitter::PADDQ(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xD4, dest, arg);
+}
+
+void XEmitter::PADDSB(X64Reg dest, const OpArg& arg)
+{  // PADDS*/PADDUS* adds, prefix 0x66: SB = 0xEC, SW = 0xED, USB = 0xDC, USW = 0xDD.
+  WriteSSEOp(0x66, 0xEC, dest, arg);
+}
+void XEmitter::PADDSW(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xED, dest, arg);
+}
+void XEmitter::PADDUSB(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xDC, dest, arg);
+}
+void XEmitter::PADDUSW(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xDD, dest, arg);
+}
+
+void XEmitter::PSUBB(X64Reg dest, const OpArg& arg)
+{  // Packed integer subtract, prefix 0x66: B = 0xF8, W = 0xF9, D = 0xFA, Q = 0xFB.
+  WriteSSEOp(0x66, 0xF8, dest, arg);
+}
+void XEmitter::PSUBW(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xF9, dest, arg);
+}
+void XEmitter::PSUBD(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xFA, dest, arg);
+}
+void XEmitter::PSUBQ(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xFB, dest, arg);
+}
+
+void XEmitter::PSUBSB(X64Reg dest, const OpArg& arg)
+{  // PSUBS*/PSUBUS* subtracts, prefix 0x66: SB = 0xE8, SW = 0xE9, USB = 0xD8, USW = 0xD9.
+  WriteSSEOp(0x66, 0xE8, dest, arg);
+}
+void XEmitter::PSUBSW(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xE9, dest, arg);
+}
+void XEmitter::PSUBUSB(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xD8, dest, arg);
+}
+void XEmitter::PSUBUSW(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xD9, dest, arg);
+}
+
+void XEmitter::PAVGB(X64Reg dest, const OpArg& arg)
+{  // PAVGB/PAVGW, prefix 0x66: B = 0xE0, W = 0xE3.
+  WriteSSEOp(0x66, 0xE0, dest, arg);
+}
+void XEmitter::PAVGW(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xE3, dest, arg);
+}
+
+void XEmitter::PCMPEQB(X64Reg dest, const OpArg& arg)
+{  // PCMPEQ* compares, prefix 0x66: B = 0x74, W = 0x75, D = 0x76.
+  WriteSSEOp(0x66, 0x74, dest, arg);
+}
+void XEmitter::PCMPEQW(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x75, dest, arg);
+}
+void XEmitter::PCMPEQD(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x76, dest, arg);
+}
+
+void XEmitter::PCMPGTB(X64Reg dest, const OpArg& arg)
+{  // PCMPGT* compares, prefix 0x66: B = 0x64, W = 0x65, D = 0x66.
+  WriteSSEOp(0x66, 0x64, dest, arg);
+}
+void XEmitter::PCMPGTW(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x65, dest, arg);
+}
+void XEmitter::PCMPGTD(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0x66, dest, arg);  // first 0x66 is the prefix, second is the opcode
+}
+
+void XEmitter::PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg)
+{  // Prefix 0x66, opcode 0xC5, followed by subreg as an immediate byte.
+  WriteSSEOp(0x66, 0xC5, dest, arg);  // NOTE(review): no extrabytes passed for the imm8, unlike PSHUFD - fine only if arg is always a register; confirm
+  Write8(subreg);
+}
+void XEmitter::PINSRW(X64Reg dest, const OpArg& arg, u8 subreg)
+{  // Prefix 0x66, opcode 0xC4, then subreg as the imm8 selecting which 16-bit lane of dest is written.
+  WriteSSEOp(0x66, 0xC4, dest, arg, 1);  // extrabytes = 1: an imm8 follows the operand (as PSHUFD/BLENDPS pass), needed for RIP-relative arg
+  Write8(subreg);
+}
+void XEmitter::PINSRD(X64Reg dest, const OpArg& arg, u8 subreg)
+{  // SSE4.1-guarded: prefix 0x66, opcode 0x3A22, then subreg as the imm8 selecting which 32-bit lane of dest is written.
+  WriteSSE41Op(0x66, 0x3A22, dest, arg, 1);  // extrabytes = 1: an imm8 follows the operand (as BLENDPS passes), needed for RIP-relative arg
+  Write8(subreg);
+}
+
+void XEmitter::PMADDWD(X64Reg dest, const OpArg& arg)
+{  // Prefix 0x66: PMADDWD = 0xF5, PSADBW = 0xF6.
+  WriteSSEOp(0x66, 0xF5, dest, arg);
+}
+void XEmitter::PSADBW(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xF6, dest, arg);
+}
+
+void XEmitter::PMAXSW(X64Reg dest, const OpArg& arg)
+{  // Packed min/max, prefix 0x66: PMAXSW = 0xEE, PMAXUB = 0xDE, PMINSW = 0xEA, PMINUB = 0xDA.
+  WriteSSEOp(0x66, 0xEE, dest, arg);
+}
+void XEmitter::PMAXUB(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xDE, dest, arg);
+}
+void XEmitter::PMINSW(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xEA, dest, arg);
+}
+void XEmitter::PMINUB(X64Reg dest, const OpArg& arg)
+{
+  WriteSSEOp(0x66, 0xDA, dest, arg);
+}
+
+void XEmitter::PMOVMSKB(X64Reg dest, const OpArg& arg)
+{  // Prefix 0x66, opcode 0xD7.
+  WriteSSEOp(0x66, 0xD7, dest, arg);
+}
+void XEmitter::PSHUFD(X64Reg regOp, const OpArg& arg, u8 shuffle)
+{  // Shuffle family on opcode 0x70: prefix 0x66 = PSHUFD, 0xF2 = PSHUFLW, 0xF3 = PSHUFHW.
+  WriteSSEOp(0x66, 0x70, regOp, arg, 1);  // extrabytes = 1 announces the shuffle immediate written next
+  Write8(shuffle);
+}
+void XEmitter::PSHUFLW(X64Reg regOp, const OpArg& arg, u8 shuffle)
+{
+  WriteSSEOp(0xF2, 0x70, regOp, arg, 1);
+  Write8(shuffle);
+}
+void XEmitter::PSHUFHW(X64Reg regOp, const OpArg& arg, u8 shuffle)
+{
+  WriteSSEOp(0xF3, 0x70, regOp, arg, 1);
+  Write8(shuffle);
+}
+
+// VEX
+void XEmitter::VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // Three-operand arithmetic via WriteAVXOp, reusing the sseADD/SUB/MUL/DIV/SQRT opcodes; prefix 0xF3 = SS, 0x00 = PS, 0xF2 = SD, 0x66 = PD.
+  WriteAVXOp(0xF3, sseADD, regOp1, regOp2, arg);
+}
+void XEmitter::VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0xF3, sseSUB, regOp1, regOp2, arg);
+}
+void XEmitter::VMULSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0xF3, sseMUL, regOp1, regOp2, arg);
+}
+void XEmitter::VDIVSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0xF3, sseDIV, regOp1, regOp2, arg);
+}
+void XEmitter::VADDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // PS group: prefix 0x00
+  WriteAVXOp(0x00, sseADD, regOp1, regOp2, arg);
+}
+void XEmitter::VSUBPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x00, sseSUB, regOp1, regOp2, arg);
+}
+void XEmitter::VMULPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x00, sseMUL, regOp1, regOp2, arg);
+}
+void XEmitter::VDIVPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x00, sseDIV, regOp1, regOp2, arg);
+}
+void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // SD group: prefix 0xF2
+  WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);
+}
+void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);
+}
+void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);
+}
+void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);
+}
+void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // PD group: prefix 0x66
+  WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);
+}
+void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);
+}
+void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);
+}
+void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);
+}
+void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // only the SD form of square root is provided in this group
+  WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);
+}
+void XEmitter::VCMPPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 compare)
+{  // WriteAVXOp forms that are followed by one immediate byte (compare predicate / shuffle selector).
+  WriteAVXOp(0x66, sseCMP, regOp1, regOp2, arg, 0, 1);  // trailing "0, 1": presumably W = 0 and extrabytes = 1 for the imm8 - confirm against WriteAVXOp
+  Write8(compare);
+}
+void XEmitter::VSHUFPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle)
+{
+  WriteAVXOp(0x00, sseSHUF, regOp1, regOp2, arg, 0, 1);
+  Write8(shuffle);
+}
+void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle)
+{
+  WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 0, 1);
+  Write8(shuffle);
+}
+void XEmitter::VUNPCKLPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // Three-operand unpacks with the same opcodes as UNPCK*: 0x14 = low, 0x15 = high. No VUNPCKHPS is defined in this group.
+  WriteAVXOp(0x00, 0x14, regOp1, regOp2, arg);
+}
+void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);
+}
+void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);
+}
+void XEmitter::VBLENDVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg regOp3)
+{  // Four-operand form: the extra register regOp3 is handed to WriteAVXOp4 instead of an immediate.
+  WriteAVXOp4(0x66, 0x3A4B, regOp1, regOp2, arg, regOp3);
+}
+void XEmitter::VBLENDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend)
+{  // Immediate blends: same 0x3A0C / 0x3A0D opcodes as BLENDPS / BLENDPD, followed by the blend byte.
+  WriteAVXOp(0x66, 0x3A0C, regOp1, regOp2, arg, 0, 1);
+  Write8(blend);
+}
+void XEmitter::VBLENDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend)
+{
+  WriteAVXOp(0x66, 0x3A0D, regOp1, regOp2, arg, 0, 1);
+  Write8(blend);
+}
+
+void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // Three-operand float-domain bitwise ops via WriteAVXOp: sseAND / sseANDN / sseOR / sseXOR; prefix 0x00 = PS, 0x66 = PD.
+  WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg);
+}
+void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg);
+}
+void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg);
+}
+void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg);
+}
+void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg);
+}
+void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg);
+}
+void XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg);
+}
+void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg);
+}
+
+void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // Three-operand integer bitwise ops: same prefix/opcodes as PAND (0xDB), PANDN (0xDF), POR (0xEB), PXOR (0xEF).
+  WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg);
+}
+void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg);
+}
+void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg);
+}
+void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg);
+}
+
+void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // VFMADD* via WriteFMA3Op: high nibble 9/A/B = 132/213/231 ordering, low nibble 8 = packed, 9 = scalar.
+  WriteFMA3Op(0x98, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xA8, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xB8, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // PD/SD variants pass a trailing 1 - presumably the VEX.W bit selecting double precision; confirm against WriteFMA3Op
+  WriteFMA3Op(0x98, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xA8, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xB8, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // scalar forms: low nibble 9
+  WriteFMA3Op(0x99, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xA9, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xB9, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0x99, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xA9, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xB9, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // VFMSUB* via WriteFMA3Op: high nibble 9/A/B = 132/213/231 ordering, low nibble A = packed, B = scalar.
+  WriteFMA3Op(0x9A, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xAA, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xBA, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // PD/SD variants pass a trailing 1 - presumably the VEX.W bit selecting double precision; confirm against WriteFMA3Op
+  WriteFMA3Op(0x9A, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xAA, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xBA, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // scalar forms: low nibble B
+  WriteFMA3Op(0x9B, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xAB, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xBB, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0x9B, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xAB, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xBB, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // VFNMADD* via WriteFMA3Op: high nibble 9/A/B = 132/213/231 ordering, low nibble C = packed, D = scalar.
+  WriteFMA3Op(0x9C, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xAC, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xBC, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // PD/SD variants pass a trailing 1 - presumably the VEX.W bit selecting double precision; confirm against WriteFMA3Op
+  WriteFMA3Op(0x9C, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xAC, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xBC, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // scalar forms: low nibble D
+  WriteFMA3Op(0x9D, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xAD, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xBD, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0x9D, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xAD, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xBD, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // VFNMSUB* via WriteFMA3Op: high nibble 9/A/B = 132/213/231 ordering, low nibble E = packed, F = scalar.
+  WriteFMA3Op(0x9E, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xAE, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xBE, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // PD/SD variants pass a trailing 1 - presumably the VEX.W bit selecting double precision; confirm against WriteFMA3Op
+  WriteFMA3Op(0x9E, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xAE, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xBE, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // scalar forms: low nibble F
+  WriteFMA3Op(0x9F, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xAF, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xBF, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0x9F, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xAF, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xBF, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // VFMADDSUB* via WriteFMA3Op: opcodes 0x96 / 0xA6 / 0xB6 for the 132 / 213 / 231 orderings; packed forms only.
+  WriteFMA3Op(0x96, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xA6, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xB6, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // PD variants pass a trailing 1 - presumably the VEX.W bit selecting double precision; confirm against WriteFMA3Op
+  WriteFMA3Op(0x96, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xA6, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xB6, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // VFMSUBADD* via WriteFMA3Op: opcodes 0x97 / 0xA7 / 0xB7 for the 132 / 213 / 231 orderings; packed forms only.
+  WriteFMA3Op(0x97, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xA7, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xB7, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // PD variants pass a trailing 1 - presumably the VEX.W bit selecting double precision; confirm against WriteFMA3Op
+  WriteFMA3Op(0x97, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xA7, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteFMA3Op(0xB7, regOp1, regOp2, arg, 1);
+}
+
+#define FMA4(name, op)                                                                             \
+  void XEmitter::name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg)                 \
+  {                                                                                                \
+    WriteFMA4Op(op, dest, regOp1, regOp2, arg, 1);                                                 \
+  }                                                                                                \
+  void XEmitter::name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2)                 \
+  {                                                                                                \
+    WriteFMA4Op(op, dest, regOp1, regOp2, arg, 0);                                                 \
+  }
+
+FMA4(VFMADDSUBPS, 0x5C)  // each FMA4() line defines two overloads: (dest, reg, reg, arg) passes 1 and (dest, reg, arg, reg) passes 0 as the last WriteFMA4Op argument
+FMA4(VFMADDSUBPD, 0x5D)
+FMA4(VFMSUBADDPS, 0x5E)
+FMA4(VFMSUBADDPD, 0x5F)
+FMA4(VFMADDPS, 0x68)  // 0x68-0x6B: VFMADD PS / PD / SS / SD
+FMA4(VFMADDPD, 0x69)
+FMA4(VFMADDSS, 0x6A)
+FMA4(VFMADDSD, 0x6B)
+FMA4(VFMSUBPS, 0x6C)  // 0x6C-0x6F: VFMSUB PS / PD / SS / SD
+FMA4(VFMSUBPD, 0x6D)
+FMA4(VFMSUBSS, 0x6E)
+FMA4(VFMSUBSD, 0x6F)
+FMA4(VFNMADDPS, 0x78)  // 0x78-0x7B: VFNMADD PS / PD / SS / SD
+FMA4(VFNMADDPD, 0x79)
+FMA4(VFNMADDSS, 0x7A)
+FMA4(VFNMADDSD, 0x7B)
+FMA4(VFNMSUBPS, 0x7C)  // 0x7C-0x7F: VFNMSUB PS / PD / SS / SD
+FMA4(VFNMSUBPD, 0x7D)
+FMA4(VFNMSUBSS, 0x7E)
+FMA4(VFNMSUBSD, 0x7F)
+#undef FMA4  // the helper macro is local to this list of definitions
+
+void XEmitter::SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2)
+{  // SARX/SHLX/SHRX share opcode 0x38F7 and differ only in prefix (0xF3 / 0x66 / 0xF2). Note the parameter order (reg, arg, reg) is re-ordered to (regOp1, regOp2, arg) for WriteBMI2Op.
+  WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);
+}
+void XEmitter::SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2)
+{
+  WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);
+}
+void XEmitter::SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2)
+{
+  WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);
+}
+void XEmitter::RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate)
+{  // Prefix 0xF2, opcode 0x3AF0, followed by the rotate count as an immediate byte.
+  WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1);  // no second register (INVALID_REG); trailing 1 accounts for the imm8 written next
+  Write8(rotate);
+}
+void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{  // PEXT/PDEP share opcode 0x38F5 (prefix 0xF3 / 0xF2); MULX uses 0x38F6 with prefix 0xF2.
+  WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg);
+}
+void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg);
+}
+void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg);  // NOTE(review): regOp2 and regOp1 are passed swapped relative to PEXT/PDEP - looks intentional for MULX's two destinations; confirm
+}
+void XEmitter::BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2)
+{  // No prefix (0x00), opcode 0x38F5, emitted through the BMI2 writer.
+  CheckFlags();  // the only emitter in this group that calls CheckFlags itself before writing
+  WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg);
+}
+void XEmitter::BLSR(int bits, X64Reg regOp, const OpArg& arg)
+{  // BLSR/BLSMSK/BLSI share opcode 0x38F3; the cast constant (1/2/3) in the first register slot selects the operation.
+  WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg);
+}
+void XEmitter::BLSMSK(int bits, X64Reg regOp, const OpArg& arg)
+{
+  WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg);
+}
+void XEmitter::BLSI(int bits, X64Reg regOp, const OpArg& arg)
+{
+  WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x3, regOp, arg);
+}
+void XEmitter::BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2)
+{  // BMI1, no prefix: BEXTR = 0x38F7 (parameters arrive as reg, arg, reg), ANDN = 0x38F2.
+  WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg);
+}
+void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+  WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg);
+}
+
+// Prefixes
+
+void XEmitter::LOCK()
+{  // Each prefix emitter writes exactly one byte; the caller emits the instruction it applies to afterwards.
+  Write8(0xF0);
+}
+void XEmitter::REP()
+{
+  Write8(0xF3);
+}
+void XEmitter::REPNE()
+{
+  Write8(0xF2);
+}
+void XEmitter::FSOverride()
+{
+  Write8(0x64);  // FS segment-override byte
+}
+void XEmitter::GSOverride()
+{
+  Write8(0x65);  // GS segment-override byte
+}
+
+void XEmitter::FWAIT()
+{  // Single-byte instruction: writes 0x9B.
+  Write8(0x9B);
+}
+
+// TODO: make this more generic
+void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg)
+{  // Shared encoder for FLD/FST/FSTP: picks the opcode byte from the operand width, then writes the operand with the op in the ModR/M reg field.
+  int mf = 0;
+  ASSERT_MSG(DYNA_REC, !(bits == 80 && op_80b == FloatOp::Invalid),
+             "WriteFloatLoadStore: 80 bits not supported for this instruction");
+  switch (bits)
+  {
+  case 32:
+    mf = 0;  // opcode byte 0xd9 | 0 = 0xD9
+    break;
+  case 64:
+    mf = 4;  // opcode byte 0xd9 | 4 = 0xDD
+    break;
+  case 80:
+    mf = 2;  // opcode byte 0xd9 | 2 = 0xDB
+    break;
+  default:
+    ASSERT_MSG(DYNA_REC, 0, "WriteFloatLoadStore: invalid bits (should be 32/64/80)");  // mf stays 0 if execution continues past the assert
+  }
+  Write8(0xd9 | mf);
+  // x87 instructions use the reg field of the ModR/M byte as opcode:
+  if (bits == 80)
+    op = op_80b;  // 80-bit accesses use the alternative op supplied by the caller
+  arg.WriteRest(this, 0, static_cast<X64Reg>(op));
+}
+
+void XEmitter::FLD(int bits, const OpArg& src)
+{  // x87 load/store front-ends: each supplies the normal op and the op to use when bits == 80.
+  WriteFloatLoadStore(bits, FloatOp::LD, FloatOp::LD80, src);
+}
+void XEmitter::FST(int bits, const OpArg& dest)
+{  // FloatOp::Invalid as the 80-bit op means bits == 80 trips the assert in WriteFloatLoadStore
+  WriteFloatLoadStore(bits, FloatOp::ST, FloatOp::Invalid, dest);
+}
+void XEmitter::FSTP(int bits, const OpArg& dest)
+{
+  WriteFloatLoadStore(bits, FloatOp::STP, FloatOp::STP80, dest);
+}
+void XEmitter::FNSTSW_AX()
+{  // Fixed two-byte encoding: 0xDF 0xE0.
+  Write8(0xDF);
+  Write8(0xE0);
+}
+
+void XEmitter::RDTSC()
+{  // Fixed two-byte encoding: 0x0F 0x31.
+  Write8(0x0F);
+  Write8(0x31);
+}
+}
diff --git a/src/dolphin/x64Emitter.h b/src/dolphin/x64Emitter.h
new file mode 100644
index 0000000..122850d
--- /dev/null
+++ b/src/dolphin/x64Emitter.h
@@ -0,0 +1,1180 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+// WARNING - THIS LIBRARY IS NOT THREAD SAFE!!!
+
+#pragma once
+
+#include <cstddef>
+#include <cstring>
+#include <functional>
+#include <tuple>
+#include <type_traits>
+
+#include "Assert.h"
+#include "BitSet.h"
+#include "CodeBlock.h"
+#include "../types.h"
+#include "x64ABI.h"
+
+namespace Gen
+{
+enum CCFlags  // Condition codes taken by J_CC, SETcc and CMOVcc; synonymous mnemonics share a value.
+{
+  CC_O = 0,
+  CC_NO = 1,
+  CC_B = 2,  // 2: B / C / NAE
+  CC_C = 2,
+  CC_NAE = 2,
+  CC_NB = 3,  // 3: NB / NC / AE
+  CC_NC = 3,
+  CC_AE = 3,
+  CC_Z = 4,  // 4: Z / E
+  CC_E = 4,
+  CC_NZ = 5,  // 5: NZ / NE
+  CC_NE = 5,
+  CC_BE = 6,  // 6: BE / NA
+  CC_NA = 6,
+  CC_NBE = 7,  // 7: NBE / A
+  CC_A = 7,
+  CC_S = 8,
+  CC_NS = 9,
+  CC_P = 0xA,  // 0xA: P / PE
+  CC_PE = 0xA,
+  CC_NP = 0xB,  // 0xB: NP / PO
+  CC_PO = 0xB,
+  CC_L = 0xC,  // 0xC: L / NGE
+  CC_NGE = 0xC,
+  CC_NL = 0xD,  // 0xD: NL / GE
+  CC_GE = 0xD,
+  CC_LE = 0xE,  // 0xE: LE / NG
+  CC_NG = 0xE,
+  CC_NLE = 0xF,  // 0xF: NLE / G
+  CC_G = 0xF
+};
+
+enum  // Register-count constants.
+{
+  NUMGPRs = 16,  // general-purpose registers
+  NUMXMMs = 16,  // XMM registers
+};
+
+enum  // Values stored in OpArg::scale; they tag what kind of operand an OpArg is.
+{
+  SCALE_NONE = 0,  // plain register operand (see OpArg::IsSimpleReg)
+  SCALE_1 = 1,
+  SCALE_2 = 2,
+  SCALE_4 = 4,
+  SCALE_8 = 8,
+  SCALE_ATREG = 16,  // produced by MatR and MDisp
+  // SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG
+  SCALE_NOBASE_2 = 34,  // SCALE_2 | 0x20, as produced by MScaled
+  SCALE_NOBASE_4 = 36,  // SCALE_4 | 0x20
+  SCALE_NOBASE_8 = 40,  // SCALE_8 | 0x20
+  SCALE_RIP = 0xFF,  // produced by M()
+  SCALE_IMM8 = 0xF0,  // 0xF0-0xF3: immediate operands (see OpArg::IsImm)
+  SCALE_IMM16 = 0xF1,
+  SCALE_IMM32 = 0xF2,
+  SCALE_IMM64 = 0xF3,
+};
+
+enum SSECompare  // NOTE(review): presumably the u8 "compare" argument of CMPSS/CMPSD/CMPPS/CMPPD - confirm.
+{
+  CMP_EQ = 0,
+  CMP_LT = 1,
+  CMP_LE = 2,
+  CMP_UNORD = 3,
+  CMP_NEQ = 4,  // 4-7 are named as the negations of 0-3
+  CMP_NLT = 5,
+  CMP_NLE = 6,
+  CMP_ORD = 7,
+};
+
+class XEmitter;
+enum class FloatOp;
+enum class NormalOp;
+
+// Information about a generated MOV op
+struct MovInfo final  // Optional out-parameter of XEmitter::LoadAndSwap and XEmitter::SwapAndStore.
+{
+  u8* address;  // NOTE(review): presumably the location of the generated MOV - confirm in the emitter.
+  bool nonAtomicSwapStore;
+  // valid iff nonAtomicSwapStore is true
+  X64Reg nonAtomicSwapStoreSrc;
+};
+
+// RIP addressing does not benefit from micro op fusion on Core arch
+struct OpArg  // One operand of an emitted instruction; the scale field tags which kind it is.
+{
+  // For accessing offset and operandReg.
+  // This also allows us to keep the op writing functions private.
+  friend class XEmitter;
+
+  // dummy op arg, used for storage
+  constexpr OpArg() = default;
+  constexpr OpArg(u64 offset_, int scale_, X64Reg rm_reg = RAX, X64Reg scaled_reg = RAX)
+      : scale{static_cast<u8>(scale_)}, offsetOrBaseReg{static_cast<u16>(rm_reg)},
+        indexReg{static_cast<u16>(scaled_reg)}, offset{offset_}
+  {  // operandReg is not set here and keeps its in-class default of 0.
+  }
+  constexpr bool operator==(const OpArg& b) const
+  {
+    // TODO: Use std::tie here once Dolphin requires C++17. (We can't do it immediately,
+    // because we still support some older versions of GCC where std::tie is not constexpr.)
+    return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg &&
+           indexReg == b.indexReg && offset == b.offset;
+  }
+  constexpr bool operator!=(const OpArg& b) const { return !operator==(b); }
+  u64 Imm64() const  // ImmN(): debug-assert the matching SCALE_IMMn tag, then return offset cast to uN.
+  {
+    DEBUG_ASSERT(scale == SCALE_IMM64);
+    return (u64)offset;
+  }
+  u32 Imm32() const
+  {
+    DEBUG_ASSERT(scale == SCALE_IMM32);
+    return (u32)offset;
+  }
+  u16 Imm16() const
+  {
+    DEBUG_ASSERT(scale == SCALE_IMM16);
+    return (u16)offset;
+  }
+  u8 Imm8() const
+  {
+    DEBUG_ASSERT(scale == SCALE_IMM8);
+    return (u8)offset;
+  }
+
+  s64 SImm64() const  // SImmN(): same checks as ImmN(), but offset is cast to the signed type sN.
+  {
+    DEBUG_ASSERT(scale == SCALE_IMM64);
+    return (s64)offset;
+  }
+  s32 SImm32() const
+  {
+    DEBUG_ASSERT(scale == SCALE_IMM32);
+    return (s32)offset;
+  }
+  s16 SImm16() const
+  {
+    DEBUG_ASSERT(scale == SCALE_IMM16);
+    return (s16)offset;
+  }
+  s8 SImm8() const
+  {
+    DEBUG_ASSERT(scale == SCALE_IMM8);
+    return (s8)offset;
+  }
+
+  OpArg AsImm64() const  // AsImmN(): new immediate tagged SCALE_IMMn holding offset cast to uN.
+  {
+    DEBUG_ASSERT(IsImm());
+    return OpArg((u64)offset, SCALE_IMM64);
+  }
+  OpArg AsImm32() const
+  {
+    DEBUG_ASSERT(IsImm());
+    return OpArg((u32)offset, SCALE_IMM32);
+  }
+  OpArg AsImm16() const
+  {
+    DEBUG_ASSERT(IsImm());
+    return OpArg((u16)offset, SCALE_IMM16);
+  }
+  OpArg AsImm8() const
+  {
+    DEBUG_ASSERT(IsImm());
+    return OpArg((u8)offset, SCALE_IMM8);
+  }
+
+  constexpr bool IsImm() const  // true for any of the four SCALE_IMM* tags
+  {
+    return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 ||
+           scale == SCALE_IMM64;
+  }
+  constexpr bool IsSimpleReg() const { return scale == SCALE_NONE; }
+  constexpr bool IsSimpleReg(X64Reg reg) const { return IsSimpleReg() && GetSimpleReg() == reg; }
+  constexpr bool IsZero() const { return IsImm() && offset == 0; }
+  constexpr int GetImmBits() const  // immediate width in bits, or -1 when this is not an immediate
+  {
+    switch (scale)
+    {
+    case SCALE_IMM8:
+      return 8;
+    case SCALE_IMM16:
+      return 16;
+    case SCALE_IMM32:
+      return 32;
+    case SCALE_IMM64:
+      return 64;
+    default:
+      return -1;
+    }
+  }
+
+  constexpr X64Reg GetSimpleReg() const  // the register of a SCALE_NONE operand, otherwise INVALID_REG
+  {
+    if (scale == SCALE_NONE)
+      return static_cast<X64Reg>(offsetOrBaseReg);
+
+    return INVALID_REG;
+  }
+
+  void AddMemOffset(int val)  // adds val to offset; only meant for SCALE_RIP or SCALE_1..SCALE_ATREG
+  {
+    DEBUG_ASSERT_MSG(DYNA_REC, scale == SCALE_RIP || (scale <= SCALE_ATREG && scale > SCALE_NONE),
+                     "Tried to increment an OpArg which doesn't have an offset");
+    offset += val;
+  }
+
+private:
+  void WriteREX(XEmitter* emit, int opBits, int bits, int customOp = -1) const;
+  void WriteVEX(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm,
+                int W = 0) const;
+  void WriteRest(XEmitter* emit, int extraBytes = 0, X64Reg operandReg = INVALID_REG,
+                 bool warn_64bit_offset = true) const;
+  void WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg operandReg, int bits);
+  void WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand, int bits) const;
+
+  u8 scale = 0;  // one of the SCALE_* constants
+  u16 offsetOrBaseReg = 0;  // holds the register itself for SCALE_NONE operands (see GetSimpleReg)
+  u16 indexReg = 0;  // set from the constructor's scaled_reg argument
+  u64 offset = 0;  // Also used to store immediates.
+  u16 operandReg = 0;  // not set by the constructors; takes part in operator==
+};
+
+template <typename T>
+inline OpArg M(const T* ptr)
+{  // Operand tagged SCALE_RIP whose offset field holds the numeric address of ptr.
+  return OpArg((u64)(const void*)ptr, (int)SCALE_RIP);
+}
+constexpr OpArg R(X64Reg value)
+{  // Plain register operand (SCALE_NONE); this is the form OpArg::IsSimpleReg() recognises.
+  return OpArg(0, SCALE_NONE, value);
+}
+constexpr OpArg MatR(X64Reg value)
+{  // SCALE_ATREG operand with zero offset; NOTE(review): presumably "memory at [value]" - confirm.
+  return OpArg(0, SCALE_ATREG, value);
+}
+
+constexpr OpArg MDisp(X64Reg value, int offset)
+{  // SCALE_ATREG operand with an offset; the int goes through u32, so a negative value is zero-extended.
+  return OpArg(static_cast<u32>(offset), SCALE_ATREG, value);
+}
+
+constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
+{  // Forwards all four fields; the int offset converts implicitly (sign-extending) to the u64 field.
+  return OpArg(offset, scale, base, scaled);
+}
+
+// Operand built from a scaled register without a base; a scale of SCALE_1 falls back to SCALE_ATREG.
+constexpr OpArg MScaled(X64Reg scaled, int scale, int offset)
+{
+  // OR-ing 0x20 turns SCALE_2/4/8 into the matching SCALE_NOBASE_2/4/8 value.
+  return scale == SCALE_1 ? OpArg(offset, SCALE_ATREG, scaled) :
+                            OpArg(offset, scale | 0x20, RAX, scaled);
+}
+
+constexpr OpArg MRegSum(X64Reg base, X64Reg offset)
+{  // Two-register operand built via MComplex with scale 1 and a zero displacement.
+  return MComplex(base, offset, 1, 0);
+}
+
+constexpr OpArg Imm8(u8 imm)
+{  // 8-bit immediate operand (tagged SCALE_IMM8).
+  return OpArg(imm, SCALE_IMM8);
+}
+constexpr OpArg Imm16(u16 imm)
+{  // 16-bit immediate operand (tagged SCALE_IMM16).
+  return OpArg(imm, SCALE_IMM16);
+}  // rarely used
+constexpr OpArg Imm32(u32 imm)
+{  // 32-bit immediate operand (tagged SCALE_IMM32).
+  return OpArg(imm, SCALE_IMM32);
+}
+constexpr OpArg Imm64(u64 imm)
+{  // 64-bit immediate operand (tagged SCALE_IMM64).
+  return OpArg(imm, SCALE_IMM64);
+}
+inline OpArg ImmPtr(const void* imm)
+{  // The pointer's numeric value as a 64-bit immediate operand.
+  return Imm64(reinterpret_cast<u64>(imm));
+}
+
+// Returns ptr - base as a u32; asserts and returns 0 when the distance is outside the s32 range.
+inline u32 PtrOffset(const void* ptr, const void* base = nullptr)
+{
+  const s64 delta = reinterpret_cast<s64>(ptr) - reinterpret_cast<s64>(base);
+  const bool fits_in_s32 = delta >= -0x80000000LL && delta < 0x80000000LL;
+  if (fits_in_s32)
+    return static_cast<u32>(delta);
+
+  ASSERT_MSG(DYNA_REC, 0, "pointer offset out of range");
+  return 0;
+}
+
+// Byte distance from array[0] to array[index], as u32. Usage: int a[16]; ARRAY_OFFSET(a, 10)
+#define ARRAY_OFFSET(array, index) ((u32)((u64) & (array)[index] - (u64) & (array)[0]))
+// Byte distance from the start of str to its member elem, as u32. Usage: struct {int e;} s; STRUCT_OFFSET(s, e)
+#define STRUCT_OFFSET(str, elem) ((u32)((u64) & (str).elem - (u64) & (str)))
+
+struct FixupBranch  // Returned by XEmitter::J(), J_CC() and CALL(); passed to SetJumpTarget().
+{
+  enum class Type  // which of the two branch widths this fixup refers to
+  {
+    Branch8Bit,
+    Branch32Bit
+  };
+
+  u8* ptr;  // NOTE(review): presumably the code location to patch - confirm in SetJumpTarget.
+  Type type;
+};
+
+class XEmitter
+{
+  friend struct OpArg;  // for Write8 etc
+private:
+  u8* code = nullptr;
+  bool flags_locked = false;
+
+  void CheckFlags();
+
+  void Rex(int w, int r, int x, int b);
+  void WriteModRM(int mod, int reg, int rm);
+  void WriteSIB(int scale, int index, int base);
+  void WriteSimple1Byte(int bits, u8 byte, X64Reg reg);
+  void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg);
+  void WriteMulDivType(int bits, OpArg src, int ext);
+  void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false);
+  void WriteShift(int bits, OpArg dest, const OpArg& shift, int ext);
+  void WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext);
+  void WriteMXCSR(OpArg arg, int ext);
+  void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
+  void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
+  void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
+  void WriteVEXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0,
+                  int extrabytes = 0);
+  void WriteVEXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                   X64Reg regOp3, int W = 0);
+  void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0,
+                  int extrabytes = 0);
+  void WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                   X64Reg regOp3, int W = 0);
+  void WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0);
+  void WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0);
+  void WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                  int extrabytes = 0);
+  void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                   int extrabytes = 0);
+  void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                   int extrabytes = 0);
+  void WriteMOVBE(int bits, u8 op, X64Reg regOp, const OpArg& arg);
+  void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg);
+  void WriteNormalOp(int bits, NormalOp op, const OpArg& a1, const OpArg& a2);
+
+  void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size,
+                              size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
+
+protected:
+  void Write8(u8 value);
+  void Write16(u16 value);
+  void Write32(u32 value);
+  void Write64(u64 value);
+
+public:
+  XEmitter() = default;
+  explicit XEmitter(u8* code_ptr) : code{code_ptr} {}
+  virtual ~XEmitter() = default;
+  void SetCodePtr(u8* ptr);
+  void ReserveCodeSpace(int bytes);
+  u8* AlignCodeTo(size_t alignment);
+  u8* AlignCode4();
+  u8* AlignCode16();
+  u8* AlignCodePage();
+  const u8* GetCodePtr() const;
+  u8* GetWritableCodePtr();
+
+  void LockFlags() { flags_locked = true; }
+  void UnlockFlags() { flags_locked = false; }
+  // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
+  // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other
+  // string instr.,
+  // INC and DEC are slow on Intel Core, but not on AMD. They create a
+  // false flag dependency because they only update a subset of the flags.
+  // XCHG is SLOW and should be avoided.
+
+  // Debug breakpoint
+  void INT3();
+
+  // Do nothing
+  void NOP(size_t count = 1);
+
+  // Save energy in wait-loops on P4 only. Probably not too useful.
+  void PAUSE();
+
+  // Flag control
+  void STC();
+  void CLC();
+  void CMC();
+
+  // These two cannot be executed in 64-bit mode on early Intel 64-bit CPUs, only on Core2 and
+  // AMD!
+  void LAHF();  // 3 cycle vector path
+  void SAHF();  // direct path fast
+
+  // Stack control
+  void PUSH(X64Reg reg);
+  void POP(X64Reg reg);
+  void PUSH(int bits, const OpArg& reg);
+  void POP(int bits, const OpArg& reg);
+  void PUSHF();
+  void POPF();
+
+  // Flow control
+  void RET();
+  void RET_FAST();
+  void UD2();
+  FixupBranch J(bool force5bytes = false);
+
+  void JMP(const u8* addr, bool force5Bytes = false);
+  void JMPptr(const OpArg& arg);
+  void JMPself();  // infinite loop!
+#ifdef CALL
+#undef CALL
+#endif
+  void CALL(const void* fnptr);
+  FixupBranch CALL();
+  void CALLptr(OpArg arg);
+
+  FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
+  void J_CC(CCFlags conditionCode, const u8* addr);
+
+  void SetJumpTarget(const FixupBranch& branch);
+
+  void SETcc(CCFlags flag, OpArg dest);
+  // Note: CMOV brings small if any benefit on current CPUs.
+  void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag);
+
+  // Fences
+  void LFENCE();
+  void MFENCE();
+  void SFENCE();
+
+  // Bit scan
+  void BSF(int bits, X64Reg dest, const OpArg& src);  // Bottom bit to top bit
+  void BSR(int bits, X64Reg dest, const OpArg& src);  // Top bit to bottom bit
+
+  // Cache control
+  enum PrefetchLevel
+  {
+    PF_NTA,  // Non-temporal (data used once and only once)
+    PF_T0,   // All cache levels
+    PF_T1,   // Levels 2+ (aliased to T0 on AMD)
+    PF_T2,   // Levels 3+ (aliased to T0 on AMD)
+  };
+  void PREFETCH(PrefetchLevel level, OpArg arg);
+  void MOVNTI(int bits, const OpArg& dest, X64Reg src);
+  void MOVNTDQ(const OpArg& arg, X64Reg regOp);
+  void MOVNTPS(const OpArg& arg, X64Reg regOp);
+  void MOVNTPD(const OpArg& arg, X64Reg regOp);
+
+  // Multiplication / division
+  void MUL(int bits, const OpArg& src);   // UNSIGNED
+  void IMUL(int bits, const OpArg& src);  // SIGNED
+  void IMUL(int bits, X64Reg regOp, const OpArg& src);
+  void IMUL(int bits, X64Reg regOp, const OpArg& src, const OpArg& imm);
+  void DIV(int bits, const OpArg& src);
+  void IDIV(int bits, const OpArg& src);
+
+  // Shift
+  void ROL(int bits, const OpArg& dest, const OpArg& shift);
+  void ROR_(int bits, const OpArg& dest, const OpArg& shift);
+  void RCL(int bits, const OpArg& dest, const OpArg& shift);
+  void RCR(int bits, const OpArg& dest, const OpArg& shift);
+  void SHL(int bits, const OpArg& dest, const OpArg& shift);
+  void SHR(int bits, const OpArg& dest, const OpArg& shift);
+  void SAR(int bits, const OpArg& dest, const OpArg& shift);
+
+  // Bit Test
+  void BT(int bits, const OpArg& dest, const OpArg& index);
+  void BTS(int bits, const OpArg& dest, const OpArg& index);
+  void BTR(int bits, const OpArg& dest, const OpArg& index);
+  void BTC(int bits, const OpArg& dest, const OpArg& index);
+
+  // Double-Precision Shift
+  void SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift);
+  void SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift);
+
+  // Extend EAX into EDX in various ways
+  void CWD(int bits = 16);
+  inline void CDQ() { CWD(32); }
+  inline void CQO() { CWD(64); }
+  void CBW(int bits = 8);
+  inline void CWDE() { CBW(16); }
+  inline void CDQE() { CBW(32); }
+  // Load effective address
+  void LEA(int bits, X64Reg dest, OpArg src);
+
+  // Integer arithmetic
+  void NEG(int bits, const OpArg& src);
+  void ADD(int bits, const OpArg& a1, const OpArg& a2);
+  void ADC(int bits, const OpArg& a1, const OpArg& a2);
+  void SUB(int bits, const OpArg& a1, const OpArg& a2);
+  void SBB(int bits, const OpArg& a1, const OpArg& a2);
+  void AND(int bits, const OpArg& a1, const OpArg& a2);
+  void CMP(int bits, const OpArg& a1, const OpArg& a2);
+
+  // Bit operations
+  void NOT(int bits, const OpArg& src);
+  void OR(int bits, const OpArg& a1, const OpArg& a2);
+  void XOR(int bits, const OpArg& a1, const OpArg& a2);
+  void MOV(int bits, const OpArg& a1, const OpArg& a2);
+  void TEST(int bits, const OpArg& a1, const OpArg& a2);
+
+  void CMP_or_TEST(int bits, const OpArg& a1, const OpArg& a2);
+  void MOV_sum(int bits, X64Reg dest, const OpArg& a1, const OpArg& a2);
+
+  // Are these useful at all? Consider removing.
+  void XCHG(int bits, const OpArg& a1, const OpArg& a2);
+  void XCHG_AHAL();
+
+  // Byte swapping (32 and 64-bit only).
+  void BSWAP(int bits, X64Reg reg);
+
+  // Sign/zero extension
+  void MOVSX(int dbits, int sbits, X64Reg dest,
+             OpArg src);  // automatically uses MOVSXD if necessary
+  void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
+
+  // Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE.
+  void MOVBE(int bits, X64Reg dest, const OpArg& src);
+  void MOVBE(int bits, const OpArg& dest, X64Reg src);
+  void LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend = false,
+                   MovInfo* info = nullptr);
+  void SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info = nullptr);
+
+  // Available only on AMD >= Phenom or Intel >= Haswell
+  void LZCNT(int bits, X64Reg dest, const OpArg& src);
+  // Note: this one is actually part of BMI1
+  void TZCNT(int bits, X64Reg dest, const OpArg& src);
+
+  // WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
+  void STMXCSR(const OpArg& memloc);
+  void LDMXCSR(const OpArg& memloc);
+
+  // Prefixes
+  void LOCK();
+  void REP();
+  void REPNE();
+  void FSOverride();
+  void GSOverride();
+
+  // x87
+  enum x87StatusWordBits
+  {
+    x87_InvalidOperation = 0x1,
+    x87_DenormalizedOperand = 0x2,
+    x87_DivisionByZero = 0x4,
+    x87_Overflow = 0x8,
+    x87_Underflow = 0x10,
+    x87_Precision = 0x20,
+    x87_StackFault = 0x40,
+    x87_ErrorSummary = 0x80,
+    x87_C0 = 0x100,
+    x87_C1 = 0x200,
+    x87_C2 = 0x400,
+    x87_TopOfStack = 0x2000 | 0x1000 | 0x800,
+    x87_C3 = 0x4000,
+    x87_FPUBusy = 0x8000,
+  };
+
+  void FLD(int bits, const OpArg& src);
+  void FST(int bits, const OpArg& dest);
+  void FSTP(int bits, const OpArg& dest);
+  void FNSTSW_AX();
+  void FWAIT();
+
+  // SSE/SSE2: Floating point arithmetic
+  void ADDSS(X64Reg regOp, const OpArg& arg);
+  void ADDSD(X64Reg regOp, const OpArg& arg);
+  void SUBSS(X64Reg regOp, const OpArg& arg);
+  void SUBSD(X64Reg regOp, const OpArg& arg);
+  void MULSS(X64Reg regOp, const OpArg& arg);
+  void MULSD(X64Reg regOp, const OpArg& arg);
+  void DIVSS(X64Reg regOp, const OpArg& arg);
+  void DIVSD(X64Reg regOp, const OpArg& arg);
+  void MINSS(X64Reg regOp, const OpArg& arg);
+  void MINSD(X64Reg regOp, const OpArg& arg);
+  void MAXSS(X64Reg regOp, const OpArg& arg);
+  void MAXSD(X64Reg regOp, const OpArg& arg);
+  void SQRTSS(X64Reg regOp, const OpArg& arg);
+  void SQRTSD(X64Reg regOp, const OpArg& arg);
+  void RCPSS(X64Reg regOp, const OpArg& arg);
+  void RSQRTSS(X64Reg regOp, const OpArg& arg);
+
+  // SSE/SSE2: Floating point bitwise (yes)
+  void CMPSS(X64Reg regOp, const OpArg& arg, u8 compare);
+  void CMPSD(X64Reg regOp, const OpArg& arg, u8 compare);
+
+  // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
+  void ADDPS(X64Reg regOp, const OpArg& arg);
+  void ADDPD(X64Reg regOp, const OpArg& arg);
+  void SUBPS(X64Reg regOp, const OpArg& arg);
+  void SUBPD(X64Reg regOp, const OpArg& arg);
+  void CMPPS(X64Reg regOp, const OpArg& arg, u8 compare);
+  void CMPPD(X64Reg regOp, const OpArg& arg, u8 compare);
+  void MULPS(X64Reg regOp, const OpArg& arg);
+  void MULPD(X64Reg regOp, const OpArg& arg);
+  void DIVPS(X64Reg regOp, const OpArg& arg);
+  void DIVPD(X64Reg regOp, const OpArg& arg);
+  void MINPS(X64Reg regOp, const OpArg& arg);
+  void MINPD(X64Reg regOp, const OpArg& arg);
+  void MAXPS(X64Reg regOp, const OpArg& arg);
+  void MAXPD(X64Reg regOp, const OpArg& arg);
+  void SQRTPS(X64Reg regOp, const OpArg& arg);
+  void SQRTPD(X64Reg regOp, const OpArg& arg);
+  void RCPPS(X64Reg regOp, const OpArg& arg);
+  void RSQRTPS(X64Reg regOp, const OpArg& arg);
+
+  // SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double)
+  void ANDPS(X64Reg regOp, const OpArg& arg);
+  void ANDPD(X64Reg regOp, const OpArg& arg);
+  void ANDNPS(X64Reg regOp, const OpArg& arg);
+  void ANDNPD(X64Reg regOp, const OpArg& arg);
+  void ORPS(X64Reg regOp, const OpArg& arg);
+  void ORPD(X64Reg regOp, const OpArg& arg);
+  void XORPS(X64Reg regOp, const OpArg& arg);
+  void XORPD(X64Reg regOp, const OpArg& arg);
+
+  // SSE/SSE2: Shuffle components. These are tricky - see Intel documentation.
+  void SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle);
+  void SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle);
+
+  // SSE3
+  void MOVSLDUP(X64Reg regOp, const OpArg& arg);
+  void MOVSHDUP(X64Reg regOp, const OpArg& arg);
+  void MOVDDUP(X64Reg regOp, const OpArg& arg);
+
+  // SSE/SSE2: Useful alternative to shuffle in some cases.
+  void UNPCKLPS(X64Reg dest, const OpArg& src);
+  void UNPCKHPS(X64Reg dest, const OpArg& src);
+  void UNPCKLPD(X64Reg dest, const OpArg& src);
+  void UNPCKHPD(X64Reg dest, const OpArg& src);
+
+  // SSE/SSE2: Compares.
+  void COMISS(X64Reg regOp, const OpArg& arg);
+  void COMISD(X64Reg regOp, const OpArg& arg);
+  void UCOMISS(X64Reg regOp, const OpArg& arg);
+  void UCOMISD(X64Reg regOp, const OpArg& arg);
+
+  // SSE/SSE2: Moves. Use the right data type for your data, in most cases.
+  void MOVAPS(X64Reg regOp, const OpArg& arg);
+  void MOVAPD(X64Reg regOp, const OpArg& arg);
+  void MOVAPS(const OpArg& arg, X64Reg regOp);
+  void MOVAPD(const OpArg& arg, X64Reg regOp);
+
+  void MOVUPS(X64Reg regOp, const OpArg& arg);
+  void MOVUPD(X64Reg regOp, const OpArg& arg);
+  void MOVUPS(const OpArg& arg, X64Reg regOp);
+  void MOVUPD(const OpArg& arg, X64Reg regOp);
+
+  void MOVDQA(X64Reg regOp, const OpArg& arg);
+  void MOVDQA(const OpArg& arg, X64Reg regOp);
+  void MOVDQU(X64Reg regOp, const OpArg& arg);
+  void MOVDQU(const OpArg& arg, X64Reg regOp);
+
+  void MOVSS(X64Reg regOp, const OpArg& arg);
+  void MOVSD(X64Reg regOp, const OpArg& arg);
+  void MOVSS(const OpArg& arg, X64Reg regOp);
+  void MOVSD(const OpArg& arg, X64Reg regOp);
+
+  void MOVLPS(X64Reg regOp, const OpArg& arg);
+  void MOVLPD(X64Reg regOp, const OpArg& arg);
+  void MOVLPS(const OpArg& arg, X64Reg regOp);
+  void MOVLPD(const OpArg& arg, X64Reg regOp);
+
+  void MOVHPS(X64Reg regOp, const OpArg& arg);
+  void MOVHPD(X64Reg regOp, const OpArg& arg);
+  void MOVHPS(const OpArg& arg, X64Reg regOp);
+  void MOVHPD(const OpArg& arg, X64Reg regOp);
+
+  void MOVHLPS(X64Reg regOp1, X64Reg regOp2);
+  void MOVLHPS(X64Reg regOp1, X64Reg regOp2);
+
+  // Be careful when using these overloads for reg <--> xmm moves.
+  // The one you cast to OpArg with R(reg) is the x86 reg, the other
+  // one is the xmm reg.
+  // ie: "MOVD_xmm(eax, R(xmm1))" generates incorrect code (movd xmm0, rcx)
+  //     use "MOVD_xmm(R(eax), xmm1)" instead.
+  void MOVD_xmm(X64Reg dest, const OpArg& arg);
+  void MOVQ_xmm(X64Reg dest, OpArg arg);
+  void MOVD_xmm(const OpArg& arg, X64Reg src);
+  void MOVQ_xmm(OpArg arg, X64Reg src);
+
+  // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in
+  // question.
+  void MOVMSKPS(X64Reg dest, const OpArg& arg);
+  void MOVMSKPD(X64Reg dest, const OpArg& arg);
+
+  // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a
+  // weird one.
+  void MASKMOVDQU(X64Reg dest, X64Reg src);
+  void LDDQU(X64Reg dest, const OpArg& src);
+
+  // SSE/SSE2: Data type conversions.
+  void CVTPS2PD(X64Reg dest, const OpArg& src);
+  void CVTPD2PS(X64Reg dest, const OpArg& src);
+  void CVTSS2SD(X64Reg dest, const OpArg& src);
+  void CVTSI2SS(X64Reg dest, const OpArg& src);
+  void CVTSD2SS(X64Reg dest, const OpArg& src);
+  void CVTSI2SD(X64Reg dest, const OpArg& src);
+  void CVTDQ2PD(X64Reg regOp, const OpArg& arg);
+  void CVTPD2DQ(X64Reg regOp, const OpArg& arg);
+  void CVTDQ2PS(X64Reg regOp, const OpArg& arg);
+  void CVTPS2DQ(X64Reg regOp, const OpArg& arg);
+
+  void CVTTPS2DQ(X64Reg regOp, const OpArg& arg);
+  void CVTTPD2DQ(X64Reg regOp, const OpArg& arg);
+
+  // Destinations are X64 regs (rax, rbx, ...) for these instructions.
+  void CVTSS2SI(X64Reg xregdest, const OpArg& src);
+  void CVTSD2SI(X64Reg xregdest, const OpArg& src);
+  void CVTTSS2SI(X64Reg xregdest, const OpArg& arg);
+  void CVTTSD2SI(X64Reg xregdest, const OpArg& arg);
+
+  // SSE2: Packed integer instructions
+  void PACKSSDW(X64Reg dest, const OpArg& arg);
+  void PACKSSWB(X64Reg dest, const OpArg& arg);
+  void PACKUSDW(X64Reg dest, const OpArg& arg);
+  void PACKUSWB(X64Reg dest, const OpArg& arg);
+
+  void PUNPCKLBW(X64Reg dest, const OpArg& arg);
+  void PUNPCKLWD(X64Reg dest, const OpArg& arg);
+  void PUNPCKLDQ(X64Reg dest, const OpArg& arg);
+  void PUNPCKLQDQ(X64Reg dest, const OpArg& arg);
+
+  void PTEST(X64Reg dest, const OpArg& arg);
+  void PAND(X64Reg dest, const OpArg& arg);
+  void PANDN(X64Reg dest, const OpArg& arg);
+  void PXOR(X64Reg dest, const OpArg& arg);
+  void POR(X64Reg dest, const OpArg& arg);
+
+  void PADDB(X64Reg dest, const OpArg& arg);
+  void PADDW(X64Reg dest, const OpArg& arg);
+  void PADDD(X64Reg dest, const OpArg& arg);
+  void PADDQ(X64Reg dest, const OpArg& arg);
+
+  void PADDSB(X64Reg dest, const OpArg& arg);
+  void PADDSW(X64Reg dest, const OpArg& arg);
+  void PADDUSB(X64Reg dest, const OpArg& arg);
+  void PADDUSW(X64Reg dest, const OpArg& arg);
+
+  void PSUBB(X64Reg dest, const OpArg& arg);
+  void PSUBW(X64Reg dest, const OpArg& arg);
+  void PSUBD(X64Reg dest, const OpArg& arg);
+  void PSUBQ(X64Reg dest, const OpArg& arg);
+
+  void PSUBSB(X64Reg dest, const OpArg& arg);
+  void PSUBSW(X64Reg dest, const OpArg& arg);
+  void PSUBUSB(X64Reg dest, const OpArg& arg);
+  void PSUBUSW(X64Reg dest, const OpArg& arg);
+
+  void PAVGB(X64Reg dest, const OpArg& arg);
+  void PAVGW(X64Reg dest, const OpArg& arg);
+
+  void PCMPEQB(X64Reg dest, const OpArg& arg);
+  void PCMPEQW(X64Reg dest, const OpArg& arg);
+  void PCMPEQD(X64Reg dest, const OpArg& arg);
+
+  void PCMPGTB(X64Reg dest, const OpArg& arg);
+  void PCMPGTW(X64Reg dest, const OpArg& arg);
+  void PCMPGTD(X64Reg dest, const OpArg& arg);
+
+  void PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg);
+  void PINSRW(X64Reg dest, const OpArg& arg, u8 subreg);
+  void PINSRD(X64Reg dest, const OpArg& arg, u8 subreg);
+
+  void PMADDWD(X64Reg dest, const OpArg& arg);
+  void PSADBW(X64Reg dest, const OpArg& arg);
+
+  void PMAXSW(X64Reg dest, const OpArg& arg);
+  void PMAXUB(X64Reg dest, const OpArg& arg);
+  void PMINSW(X64Reg dest, const OpArg& arg);
+  void PMINUB(X64Reg dest, const OpArg& arg);
+
+  void PMOVMSKB(X64Reg dest, const OpArg& arg);
+  void PSHUFD(X64Reg dest, const OpArg& arg, u8 shuffle);
+  void PSHUFB(X64Reg dest, const OpArg& arg);
+
+  void PSHUFLW(X64Reg dest, const OpArg& arg, u8 shuffle);
+  void PSHUFHW(X64Reg dest, const OpArg& arg, u8 shuffle);
+
+  void PSRLW(X64Reg reg, int shift);
+  void PSRLD(X64Reg reg, int shift);
+  void PSRLQ(X64Reg reg, int shift);
+  void PSRLQ(X64Reg reg, const OpArg& arg);
+  void PSRLDQ(X64Reg reg, int shift);
+
+  void PSLLW(X64Reg reg, int shift);
+  void PSLLD(X64Reg reg, int shift);
+  void PSLLQ(X64Reg reg, int shift);
+  void PSLLDQ(X64Reg reg, int shift);
+
+  void PSRAW(X64Reg reg, int shift);
+  void PSRAD(X64Reg reg, int shift);
+
+  // SSE4: data type conversions
+  void PMOVSXBW(X64Reg dest, const OpArg& arg);
+  void PMOVSXBD(X64Reg dest, const OpArg& arg);
+  void PMOVSXBQ(X64Reg dest, const OpArg& arg);
+  void PMOVSXWD(X64Reg dest, const OpArg& arg);
+  void PMOVSXWQ(X64Reg dest, const OpArg& arg);
+  void PMOVSXDQ(X64Reg dest, const OpArg& arg);
+  void PMOVZXBW(X64Reg dest, const OpArg& arg);
+  void PMOVZXBD(X64Reg dest, const OpArg& arg);
+  void PMOVZXBQ(X64Reg dest, const OpArg& arg);
+  void PMOVZXWD(X64Reg dest, const OpArg& arg);
+  void PMOVZXWQ(X64Reg dest, const OpArg& arg);
+  void PMOVZXDQ(X64Reg dest, const OpArg& arg);
+
+  // SSE4: blend instructions
+  void PBLENDVB(X64Reg dest, const OpArg& arg);
+  void BLENDVPS(X64Reg dest, const OpArg& arg);
+  void BLENDVPD(X64Reg dest, const OpArg& arg);
+  void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend);
+  void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend);
+
+  // AVX
+  void VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VMULSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VDIVSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VADDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VSUBPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VMULPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VDIVPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VCMPPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 compare);
+  void VSHUFPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle);
+  void VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle);
+  void VUNPCKLPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VBLENDVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg mask);
+  void VBLENDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend);
+  void VBLENDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend);
+
+  void VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+
+  void VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+
+  // FMA3 (declarations only): every form takes two X64Reg operands plus one OpArg.
+  void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+
+#define FMA4(name)                                                                                 \
+  void name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg);                          \
+  void name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+
+  FMA4(VFMADDSUBPS)  // each FMA4(name) declares two overloads: OpArg last, or OpArg before regOp2
+  FMA4(VFMADDSUBPD)
+  FMA4(VFMSUBADDPS)
+  FMA4(VFMSUBADDPD)
+  FMA4(VFMADDPS)
+  FMA4(VFMADDPD)
+  FMA4(VFMADDSS)
+  FMA4(VFMADDSD)
+  FMA4(VFMSUBPS)
+  FMA4(VFMSUBPD)
+  FMA4(VFMSUBSS)
+  FMA4(VFMSUBSD)
+  FMA4(VFNMADDPS)
+  FMA4(VFNMADDPD)
+  FMA4(VFNMADDSS)
+  FMA4(VFNMADDSD)
+  FMA4(VFNMSUBPS)
+  FMA4(VFNMSUBPD)
+  FMA4(VFNMSUBSS)
+  FMA4(VFNMSUBSD)
+#undef FMA4  // the helper macro is dropped once the declarations above are generated
+
+  // VEX GPR instructions (declarations only); `bits` is presumably the operand width -- confirm.
+  void SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+  void SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+  void SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+  void RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate);
+  void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+  void BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+  void BLSR(int bits, X64Reg regOp, const OpArg& arg);
+  void BLSMSK(int bits, X64Reg regOp, const OpArg& arg);
+  void BLSI(int bits, X64Reg regOp, const OpArg& arg);
+  void BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+  void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+
+  void RDTSC();  // declaration only; takes no operands
+
+  // Utility functions
+  // Emits a call to func: a direct CALL when the 64-bit distance from code + 5 fits a signed
+  // 32-bit value, else MOV RAX, imm64 + CALLptr(RAX). This body emits no stack alignment itself.
+  template <typename FunctionPointer>
+  void ABI_CallFunction(FunctionPointer func)
+  {
+    static_assert(std::is_pointer<FunctionPointer>() &&
+                      std::is_function<std::remove_pointer_t<FunctionPointer>>(),
+                  "Supplied type must be a function pointer.");
+
+    const void* ptr = reinterpret_cast<const void*>(func);
+    const u64 address = reinterpret_cast<u64>(ptr);
+    const u64 distance = address - (reinterpret_cast<u64>(code) + 5);
+
+    if (distance >= 0x0000000080000000ULL && distance < 0xFFFFFFFF80000000ULL)
+    {
+      // Far call: the target is out of signed 32-bit reach, so go through RAX (clobbering it)
+      MOV(64, R(RAX), Imm64(address));
+      CALLptr(R(RAX));
+    }
+    else
+    {
+      CALL(ptr);
+    }
+  }
+
+  template <typename FunctionPointer>
+  void ABI_CallFunctionC16(FunctionPointer func, u16 param1)  // call func with one constant
+  {
+    MOV(32, R(ABI_PARAM1), Imm32(param1));  // the u16 is emitted as a 32-bit immediate
+    ABI_CallFunction(func);
+  }
+
+  template <typename FunctionPointer>
+  void ABI_CallFunctionCC16(FunctionPointer func, u32 param1, u16 param2)  // two constant args
+  {
+    MOV(32, R(ABI_PARAM1), Imm32(param1));
+    MOV(32, R(ABI_PARAM2), Imm32(param2));  // the u16 is emitted as a 32-bit immediate
+    ABI_CallFunction(func);
+  }
+
+  template <typename FunctionPointer>
+  void ABI_CallFunctionC(FunctionPointer func, u32 param1)  // call func with one 32-bit constant
+  {
+    MOV(32, R(ABI_PARAM1), Imm32(param1));  // constant goes into ABI_PARAM1 before the call
+    ABI_CallFunction(func);
+  }
+
+  template <typename FunctionPointer>
+  void ABI_CallFunctionCC(FunctionPointer func, u32 param1, u32 param2)  // two 32-bit constants
+  {
+    MOV(32, R(ABI_PARAM1), Imm32(param1));
+    MOV(32, R(ABI_PARAM2), Imm32(param2));
+    ABI_CallFunction(func);  // both parameter registers are loaded before the call is emitted
+  }
+
+  template <typename FunctionPointer>
+  void ABI_CallFunctionCP(FunctionPointer func, u32 param1, const void* param2)  // constant, pointer
+  {
+    MOV(32, R(ABI_PARAM1), Imm32(param1));
+    MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(param2)));  // address as 64-bit immediate
+    ABI_CallFunction(func);
+  }
+
+  template <typename FunctionPointer>
+  void ABI_CallFunctionCCC(FunctionPointer func, u32 param1, u32 param2, u32 param3)
+  {
+    MOV(32, R(ABI_PARAM1), Imm32(param1));  // three 32-bit constants, one per parameter register
+    MOV(32, R(ABI_PARAM2), Imm32(param2));
+    MOV(32, R(ABI_PARAM3), Imm32(param3));
+    ABI_CallFunction(func);
+  }
+
+  template <typename FunctionPointer>
+  void ABI_CallFunctionCCP(FunctionPointer func, u32 param1, u32 param2, const void* param3)
+  {
+    MOV(32, R(ABI_PARAM1), Imm32(param1));  // two 32-bit constants first
+    MOV(32, R(ABI_PARAM2), Imm32(param2));
+    MOV(64, R(ABI_PARAM3), Imm64(reinterpret_cast<u64>(param3)));  // address as 64-bit immediate
+    ABI_CallFunction(func);
+  }
+
+  template <typename FunctionPointer>
+  void ABI_CallFunctionCCCP(FunctionPointer func, u32 param1, u32 param2, u32 param3,
+                            const void* param4)  // three constants followed by a pointer
+  {
+    MOV(32, R(ABI_PARAM1), Imm32(param1));
+    MOV(32, R(ABI_PARAM2), Imm32(param2));
+    MOV(32, R(ABI_PARAM3), Imm32(param3));
+    MOV(64, R(ABI_PARAM4), Imm64(reinterpret_cast<u64>(param4)));  // address as 64-bit immediate
+    ABI_CallFunction(func);
+  }
+
+  template <typename FunctionPointer>
+  void ABI_CallFunctionPC(FunctionPointer func, const void* param1, u32 param2)  // pointer, constant
+  {
+    MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(param1)));  // address as 64-bit immediate
+    MOV(32, R(ABI_PARAM2), Imm32(param2));
+    ABI_CallFunction(func);
+  }
+
+  template <typename FunctionPointer>
+  void ABI_CallFunctionPPC(FunctionPointer func, const void* param1, const void* param2, u32 param3)
+  {
+    MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(param1)));  // both addresses are embedded
+    MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(param2)));  // as 64-bit immediates
+    MOV(32, R(ABI_PARAM3), Imm32(param3));  // third argument is a 32-bit constant
+    ABI_CallFunction(func);
+  }
+
+  // Pass a register as the only parameter; a 32-bit MOV is emitted unless reg1 is ABI_PARAM1.
+  template <typename FunctionPointer>
+  void ABI_CallFunctionR(FunctionPointer func, X64Reg reg1)
+  {
+    if (reg1 != ABI_PARAM1)
+      MOV(32, R(ABI_PARAM1), R(reg1));  // NOTE(review): 32-bit here, while ...RR asks for 64
+    ABI_CallFunction(func);
+  }
+
+  // Pass two registers as parameters: MOVTwo is asked for reg1 -> ABI_PARAM1, reg2 -> ABI_PARAM2.
+  template <typename FunctionPointer>
+  void ABI_CallFunctionRR(FunctionPointer func, X64Reg reg1, X64Reg reg2)
+  {
+    MOVTwo(64, ABI_PARAM1, reg1, 0, ABI_PARAM2, reg2);  // 64-bit moves, offset argument 0
+    ABI_CallFunction(func);
+  }
+
+  template <typename FunctionPointer>
+  void ABI_CallFunctionAC(int bits, FunctionPointer func, const Gen::OpArg& arg1, u32 param2)
+  {
+    if (!arg1.IsSimpleReg(ABI_PARAM1))  // skip the move if arg1 presumably already is ABI_PARAM1
+      MOV(bits, R(ABI_PARAM1), arg1);   // bits-wide move of the operand into ABI_PARAM1
+    MOV(32, R(ABI_PARAM2), Imm32(param2));  // second argument is a 32-bit constant
+    ABI_CallFunction(func);
+  }
+
+  template <typename FunctionPointer>
+  void ABI_CallFunctionA(int bits, FunctionPointer func, const Gen::OpArg& arg1)
+  {
+    if (!arg1.IsSimpleReg(ABI_PARAM1))  // skip the move if arg1 presumably already is ABI_PARAM1
+      MOV(bits, R(ABI_PARAM1), arg1);   // bits-wide move of the operand into ABI_PARAM1
+    ABI_CallFunction(func);
+  }
+
+  // Helper for the ABI call emitters above (declaration only). May be used by itself as well.
+  void MOVTwo(int bits, X64Reg dst1, X64Reg src1, s32 offset, X64Reg dst2, X64Reg src2);
+
+  // Declarations: save/restore registers (presumably those in `mask`) and adjust the stack to
+  // the alignment the ABI requires, where the previous alignment was as given by rsp_alignment.
+  // Push returns the size of the shadow space, i.e. the offset of the frame.
+  size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+                                         size_t needed_frame_size = 0);
+  void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+                                      size_t needed_frame_size = 0);
+
+  // Plain static function that generated code can call to invoke a std::function.
+  //
+  // Calling std::function::operator() directly from emitted code is undefined behavior
+  // in C++ (the member might be a thunk under multiple inheritance), so the call is
+  // routed through this trampoline instead.
+  template <typename T, typename... Args>
+  static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args)
+  {
+    return f->operator()(args...);
+  }
+
+  template <typename T, typename... Args>
+  void ABI_CallLambdaC(const std::function<T(Args...)>* f, u32 p1)  // emits a call of *f with p1
+  {
+    auto trampoline = &XEmitter::CallLambdaTrampoline<T, Args...>;  // plain function to call
+    ABI_CallFunctionPC(trampoline, reinterpret_cast<const void*>(f), p1);  // f's address is baked in
+  }
+};  // class XEmitter
+
+class X64CodeBlock : public Common::CodeBlock<XEmitter>
+{
+private:
+  void PoisonMemory() override  // fills region (region_size bytes) with the breakpoint byte
+  {
+    constexpr int breakpoint_opcode = 0xCC;  // x86/64: 0xCC = breakpoint
+    memset(region, breakpoint_opcode, region_size);
+  }
+};
+
+}  // namespace
diff --git a/src/dolphin/x64Reg.h b/src/dolphin/x64Reg.h
new file mode 100644
index 0000000..a92e024
--- /dev/null
+++ b/src/dolphin/x64Reg.h
@@ -0,0 +1,96 @@
+// Copyright 2016 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+namespace Gen
+{
+enum X64Reg  // one enum for every register name; the names of one register share a value
+{
+  EAX = 0,  // 32-bit names, listed in value order
+  ECX = 1,
+  EDX = 2,
+  EBX = 3,
+  ESP = 4,
+  EBP = 5,
+  ESI = 6,
+  EDI = 7,
+
+  RAX = 0,  // 64-bit names; the first eight repeat the 32-bit values
+  RCX = 1,
+  RDX = 2,
+  RBX = 3,
+  RSP = 4,
+  RBP = 5,
+  RSI = 6,
+  RDI = 7,
+  R8 = 8,
+  R9 = 9,
+  R10 = 10,
+  R11 = 11,
+  R12 = 12,
+  R13 = 13,
+  R14 = 14,
+  R15 = 15,
+
+  AL = 0,  // 8-bit names
+  CL = 1,
+  DL = 2,
+  BL = 3,
+  SPL = 4,
+  BPL = 5,
+  SIL = 6,
+  DIL = 7,
+  AH = 0x104,  // high-byte names: 0x100 plus 4..7
+  CH = 0x105,
+  DH = 0x106,
+  BH = 0x107,
+
+  AX = 0,  // 16-bit names
+  CX = 1,
+  DX = 2,
+  BX = 3,
+  SP = 4,
+  BP = 5,
+  SI = 6,
+  DI = 7,
+
+  XMM0 = 0,  // XMM names, values 0..15
+  XMM1 = 1,
+  XMM2 = 2,
+  XMM3 = 3,
+  XMM4 = 4,
+  XMM5 = 5,
+  XMM6 = 6,
+  XMM7 = 7,
+  XMM8 = 8,
+  XMM9 = 9,
+  XMM10 = 10,
+  XMM11 = 11,
+  XMM12 = 12,
+  XMM13 = 13,
+  XMM14 = 14,
+  XMM15 = 15,
+
+  YMM0 = 0,  // YMM names, values 0..15
+  YMM1 = 1,
+  YMM2 = 2,
+  YMM3 = 3,
+  YMM4 = 4,
+  YMM5 = 5,
+  YMM6 = 6,
+  YMM7 = 7,
+  YMM8 = 8,
+  YMM9 = 9,
+  YMM10 = 10,
+  YMM11 = 11,
+  YMM12 = 12,
+  YMM13 = 13,
+  YMM14 = 14,
+  YMM15 = 15,
+
+  INVALID_REG = 0xFFFFFFFF
+};
+
+}  // namespace Gen